fs/erofs/zdata.c (linux.git, Linux 6.14-rc3)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2018 HUAWEI, Inc.
4  *             https://www.huawei.com/
5  * Copyright (C) 2022 Alibaba Cloud
6  */
7 #include "compress.h"
8 #include <linux/psi.h>
9 #include <linux/cpuhotplug.h>
10 #include <trace/events/erofs.h>
11
12 #define Z_EROFS_PCLUSTER_MAX_PAGES      (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
13 #define Z_EROFS_INLINE_BVECS            2
14
15 struct z_erofs_bvec {
16         struct page *page;
17         int offset;
18         unsigned int end;
19 };
20
21 #define __Z_EROFS_BVSET(name, total) \
22 struct name { \
23         /* point to the next page which contains the following bvecs */ \
24         struct page *nextpage; \
25         struct z_erofs_bvec bvec[total]; \
26 }
27 __Z_EROFS_BVSET(z_erofs_bvset,);
28 __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
29
30 /*
31  * Structure fields follow one of the following exclusion rules.
32  *
33  * I: Modifiable by initialization/destruction paths and read-only
34  *    for everyone else;
35  *
36  * L: Field should be protected by the pcluster lock;
37  *
38  * A: Field should be accessed / updated atomically by parallelized code.
39  */
40 struct z_erofs_pcluster {
41         struct mutex lock;
42         struct lockref lockref;
43
44         /* A: point to next chained pcluster or TAILs */
45         struct z_erofs_pcluster *next;
46
47         /* I: start block address of this pcluster */
48         erofs_off_t index;
49
50         /* L: the maximum decompression size of this round */
51         unsigned int length;
52
53         /* L: total number of bvecs */
54         unsigned int vcnt;
55
56         /* I: pcluster size (compressed size) in bytes */
57         unsigned int pclustersize;
58
59         /* I: page offset of start position of decompression */
60         unsigned short pageofs_out;
61
62         /* I: page offset of inline compressed data */
63         unsigned short pageofs_in;
64
65         union {
66                 /* L: inline a certain number of bvecs for bootstrap */
67                 struct z_erofs_bvset_inline bvset;
68
69                 /* I: can be used to free the pcluster by RCU. */
70                 struct rcu_head rcu;
71         };
72
73         /* I: compression algorithm format */
74         unsigned char algorithmformat;
75
76         /* L: whether partial decompression or not */
77         bool partial;
78
79         /* L: indicate several pageofs_outs or not */
80         bool multibases;
81
82         /* L: whether extra buffer allocations are best-effort */
83         bool besteffort;
84
85         /* A: compressed bvecs (can be cached or inplaced pages) */
86         struct z_erofs_bvec compressed_bvecs[];
87 };
88
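/*
 * A NULL ->next still means "not linked into any chain yet" (see the
 * cmpxchg() in z_erofs_pcluster_begin()), so the chain terminator below is a
 * distinct poison-style non-NULL sentinel instead.
 */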
89 /* the end of a chain of pclusters */
90 #define Z_EROFS_PCLUSTER_TAIL           ((void *) 0x700 + POISON_POINTER_DELTA)
91
92 struct z_erofs_decompressqueue {
93         struct super_block *sb;
94         struct z_erofs_pcluster *head;
95         atomic_t pending_bios;
96
97         union {
98                 struct completion done;
99                 struct work_struct work;
100                 struct kthread_work kthread_work;
101         } u;
102         bool eio, sync;
103 };
104
105 static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
106 {
107         return !pcl->index;
108 }
109
110 static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
111 {
112         return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
113 }
114
115 static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
116 {
117         return fo->mapping == MNGD_MAPPING(sbi);
118 }
119
120 #define Z_EROFS_ONSTACK_PAGES           32
121
122 /*
123  * since pclustersize is variable for big pcluster feature, introduce slab
124  * pools implementation for different pcluster sizes.
125  */
126 struct z_erofs_pcluster_slab {
127         struct kmem_cache *slab;
128         unsigned int maxpages;
129         char name[48];
130 };
131
132 #define _PCLP(n) { .maxpages = n }
133
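/*
 * z_erofs_alloc_pcluster() walks this array in order, so an allocation picks
 * the first (smallest) slab whose maxpages can hold the requested size.
 */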
134 static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
135         _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
136         _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
137 };
138
139 struct z_erofs_bvec_iter {
140         struct page *bvpage;
141         struct z_erofs_bvset *bvset;
142         unsigned int nr, cur;
143 };
144
145 static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
146 {
147         if (iter->bvpage)
148                 kunmap_local(iter->bvset);
149         return iter->bvpage;
150 }
151
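/*
 * Advance the iterator to the next chained bvset page, returning the page
 * just finished so the caller can recycle it.  `base' is simply
 * offsetof(struct z_erofs_bvset, bvec) spelled out by hand, hence `nr'
 * becomes the number of bvecs that fit into one full page after the header.
 */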
152 static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
153 {
154         unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
155         /* have to access nextpage in advance, otherwise it will be unmapped */
156         struct page *nextpage = iter->bvset->nextpage;
157         struct page *oldpage;
158
159         DBG_BUGON(!nextpage);
160         oldpage = z_erofs_bvec_iter_end(iter);
161         iter->bvpage = nextpage;
162         iter->bvset = kmap_local_page(nextpage);
163         iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
164         iter->cur = 0;
165         return oldpage;
166 }
167
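/*
 * Start iterating at logical position `cur', flipping through any already
 * chained bvset pages needed to reach it.
 */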
168 static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
169                                     struct z_erofs_bvset_inline *bvset,
170                                     unsigned int bootstrap_nr,
171                                     unsigned int cur)
172 {
173         *iter = (struct z_erofs_bvec_iter) {
174                 .nr = bootstrap_nr,
175                 .bvset = (struct z_erofs_bvset *)bvset,
176         };
177
178         while (cur > iter->nr) {
179                 cur -= iter->nr;
180                 z_erofs_bvset_flip(iter);
181         }
182         iter->cur = cur;
183 }
184
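/*
 * Record one bvec.  If the current bvset page is full, chain a short-lived
 * page (the caller-supplied candidate or a freshly allocated one) through
 * ->nextpage and flip the iterator onto it first.
 */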
185 static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
186                                 struct z_erofs_bvec *bvec,
187                                 struct page **candidate_bvpage,
188                                 struct page **pagepool)
189 {
190         if (iter->cur >= iter->nr) {
191                 struct page *nextpage = *candidate_bvpage;
192
193                 if (!nextpage) {
194                         nextpage = __erofs_allocpage(pagepool, GFP_KERNEL,
195                                         true);
196                         if (!nextpage)
197                                 return -ENOMEM;
198                         set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
199                 }
200                 DBG_BUGON(iter->bvset->nextpage);
201                 iter->bvset->nextpage = nextpage;
202                 z_erofs_bvset_flip(iter);
203
204                 iter->bvset->nextpage = NULL;
205                 *candidate_bvpage = NULL;
206         }
207         iter->bvset->bvec[iter->cur++] = *bvec;
208         return 0;
209 }
210
211 static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
212                                  struct z_erofs_bvec *bvec,
213                                  struct page **old_bvpage)
214 {
215         if (iter->cur == iter->nr)
216                 *old_bvpage = z_erofs_bvset_flip(iter);
217         else
218                 *old_bvpage = NULL;
219         *bvec = iter->bvset->bvec[iter->cur++];
220 }
221
222 static void z_erofs_destroy_pcluster_pool(void)
223 {
224         int i;
225
226         for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
227                 if (!pcluster_pool[i].slab)
228                         continue;
229                 kmem_cache_destroy(pcluster_pool[i].slab);
230                 pcluster_pool[i].slab = NULL;
231         }
232 }
233
234 static int z_erofs_create_pcluster_pool(void)
235 {
236         struct z_erofs_pcluster_slab *pcs;
237         struct z_erofs_pcluster *a;
238         unsigned int size;
239
240         for (pcs = pcluster_pool;
241              pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
242                 size = struct_size(a, compressed_bvecs, pcs->maxpages);
243
244                 sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
245                 pcs->slab = kmem_cache_create(pcs->name, size, 0,
246                                               SLAB_RECLAIM_ACCOUNT, NULL);
247                 if (pcs->slab)
248                         continue;
249
250                 z_erofs_destroy_pcluster_pool();
251                 return -ENOMEM;
252         }
253         return 0;
254 }
255
256 static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
257 {
258         unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT;
259         struct z_erofs_pcluster_slab *pcs = pcluster_pool;
260
261         for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
262                 struct z_erofs_pcluster *pcl;
263
264                 if (nrpages > pcs->maxpages)
265                         continue;
266
267                 pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
268                 if (!pcl)
269                         return ERR_PTR(-ENOMEM);
270                 pcl->pclustersize = size;
271                 return pcl;
272         }
273         return ERR_PTR(-EINVAL);
274 }
275
276 static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
277 {
278         unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
279         int i;
280
281         for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
282                 struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
283
284                 if (pclusterpages > pcs->maxpages)
285                         continue;
286
287                 kmem_cache_free(pcs->slab, pcl);
288                 return;
289         }
290         DBG_BUGON(1);
291 }
292
293 static struct workqueue_struct *z_erofs_workqueue __read_mostly;
294
295 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
296 static struct kthread_worker __rcu **z_erofs_pcpu_workers;
297
298 static void erofs_destroy_percpu_workers(void)
299 {
300         struct kthread_worker *worker;
301         unsigned int cpu;
302
303         for_each_possible_cpu(cpu) {
304                 worker = rcu_dereference_protected(
305                                         z_erofs_pcpu_workers[cpu], 1);
306                 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
307                 if (worker)
308                         kthread_destroy_worker(worker);
309         }
310         kfree(z_erofs_pcpu_workers);
311 }
312
313 static struct kthread_worker *erofs_init_percpu_worker(int cpu)
314 {
315         struct kthread_worker *worker =
316                 kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u");
317
318         if (IS_ERR(worker))
319                 return worker;
320         if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
321                 sched_set_fifo_low(worker->task);
322         return worker;
323 }
324
325 static int erofs_init_percpu_workers(void)
326 {
327         struct kthread_worker *worker;
328         unsigned int cpu;
329
330         z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
331                         sizeof(struct kthread_worker *), GFP_ATOMIC);
332         if (!z_erofs_pcpu_workers)
333                 return -ENOMEM;
334
335         for_each_online_cpu(cpu) {      /* could miss cpu{off,on}line? */
336                 worker = erofs_init_percpu_worker(cpu);
337                 if (!IS_ERR(worker))
338                         rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
339         }
340         return 0;
341 }
342 #else
343 static inline void erofs_destroy_percpu_workers(void) {}
344 static inline int erofs_init_percpu_workers(void) { return 0; }
345 #endif
346
347 #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
348 static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
349 static enum cpuhp_state erofs_cpuhp_state;
350
351 static int erofs_cpu_online(unsigned int cpu)
352 {
353         struct kthread_worker *worker, *old;
354
355         worker = erofs_init_percpu_worker(cpu);
356         if (IS_ERR(worker))
357                 return PTR_ERR(worker);
358
359         spin_lock(&z_erofs_pcpu_worker_lock);
360         old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
361                         lockdep_is_held(&z_erofs_pcpu_worker_lock));
362         if (!old)
363                 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
364         spin_unlock(&z_erofs_pcpu_worker_lock);
365         if (old)
366                 kthread_destroy_worker(worker);
367         return 0;
368 }
369
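/*
 * Tear down the worker of an offlined CPU.  The synchronize_rcu() below
 * ensures concurrent rcu_dereference() users in z_erofs_decompress_kickoff()
 * are finished with the worker before it is destroyed.
 */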
370 static int erofs_cpu_offline(unsigned int cpu)
371 {
372         struct kthread_worker *worker;
373
374         spin_lock(&z_erofs_pcpu_worker_lock);
375         worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
376                         lockdep_is_held(&z_erofs_pcpu_worker_lock));
377         rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
378         spin_unlock(&z_erofs_pcpu_worker_lock);
379
380         synchronize_rcu();
381         if (worker)
382                 kthread_destroy_worker(worker);
383         return 0;
384 }
385
386 static int erofs_cpu_hotplug_init(void)
387 {
388         int state;
389
390         state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
391                         "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
392         if (state < 0)
393                 return state;
394
395         erofs_cpuhp_state = state;
396         return 0;
397 }
398
399 static void erofs_cpu_hotplug_destroy(void)
400 {
401         if (erofs_cpuhp_state)
402                 cpuhp_remove_state_nocalls(erofs_cpuhp_state);
403 }
404 #else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
405 static inline int erofs_cpu_hotplug_init(void) { return 0; }
406 static inline void erofs_cpu_hotplug_destroy(void) {}
407 #endif
408
409 void z_erofs_exit_subsystem(void)
410 {
411         erofs_cpu_hotplug_destroy();
412         erofs_destroy_percpu_workers();
413         destroy_workqueue(z_erofs_workqueue);
414         z_erofs_destroy_pcluster_pool();
415         z_erofs_exit_decompressor();
416 }
417
418 int __init z_erofs_init_subsystem(void)
419 {
420         int err = z_erofs_init_decompressor();
421
422         if (err)
423                 goto err_decompressor;
424
425         err = z_erofs_create_pcluster_pool();
426         if (err)
427                 goto err_pcluster_pool;
428
429         z_erofs_workqueue = alloc_workqueue("erofs_worker",
430                         WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
431         if (!z_erofs_workqueue) {
432                 err = -ENOMEM;
433                 goto err_workqueue_init;
434         }
435
436         err = erofs_init_percpu_workers();
437         if (err)
438                 goto err_pcpu_worker;
439
440         err = erofs_cpu_hotplug_init();
441         if (err < 0)
442                 goto err_cpuhp_init;
443         return err;
444
445 err_cpuhp_init:
446         erofs_destroy_percpu_workers();
447 err_pcpu_worker:
448         destroy_workqueue(z_erofs_workqueue);
449 err_workqueue_init:
450         z_erofs_destroy_pcluster_pool();
451 err_pcluster_pool:
452         z_erofs_exit_decompressor();
453 err_decompressor:
454         return err;
455 }
456
457 enum z_erofs_pclustermode {
458         /* It has previously been linked into another processing chain */
459         Z_EROFS_PCLUSTER_INFLIGHT,
460         /*
461          * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
462          * may be dispatched to the bypass queue later due to up-to-date managed
463          * folios.  All file-backed folios related to this pcluster cannot be
464          * reused for in-place I/O (or bvpage) since the pcluster may be decoded
465          * in a separate queue (and thus out of order).
466          */
467         Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
468         /*
469          * The pcluster has just been linked to our processing chain.
470          * File-backed folios (except for the head page) related to it can be
471          * used for in-place I/O (or bvpage).
472          */
473         Z_EROFS_PCLUSTER_FOLLOWED,
474 };
475
476 struct z_erofs_frontend {
477         struct inode *const inode;
478         struct erofs_map_blocks map;
479         struct z_erofs_bvec_iter biter;
480
481         struct page *pagepool;
482         struct page *candidate_bvpage;
483         struct z_erofs_pcluster *pcl, *head;
484         enum z_erofs_pclustermode mode;
485
486         erofs_off_t headoffset;
487
488         /* a pointer used to pick up inplace I/O pages */
489         unsigned int icur;
490 };
491
492 #define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \
493         .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \
494         .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho }
495
496 static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe)
497 {
498         unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
499
500         if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
501                 return false;
502
503         if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
504                 return true;
505
506         if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
507             fe->map.m_la < fe->headoffset)
508                 return true;
509
510         return false;
511 }
512
513 static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
514 {
515         struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
516         struct z_erofs_pcluster *pcl = fe->pcl;
517         unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
518         bool shouldalloc = z_erofs_should_alloc_cache(fe);
519         bool may_bypass = true;
520         /* Optimistic allocation, as in-place I/O can be used as a fallback */
521         gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
522                         __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
523         struct folio *folio, *newfolio;
524         unsigned int i;
525
526         if (i_blocksize(fe->inode) != PAGE_SIZE ||
527             fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
528                 return;
529
530         for (i = 0; i < pclusterpages; ++i) {
531                 /* Inaccurate check w/o locking to avoid unneeded lookups */
532                 if (READ_ONCE(pcl->compressed_bvecs[i].page))
533                         continue;
534
535                 folio = filemap_get_folio(mc, pcl->index + i);
536                 if (IS_ERR(folio)) {
537                         may_bypass = false;
538                         if (!shouldalloc)
539                                 continue;
540
541                         /*
542                          * Allocate a managed folio for cached I/O, or it may be
543                          * then filled with a file-backed folio for in-place I/O
544                          */
545                         newfolio = filemap_alloc_folio(gfp, 0);
546                         if (!newfolio)
547                                 continue;
548                         newfolio->private = Z_EROFS_PREALLOCATED_FOLIO;
549                         folio = NULL;
550                 }
551                 spin_lock(&pcl->lockref.lock);
552                 if (!pcl->compressed_bvecs[i].page) {
553                         pcl->compressed_bvecs[i].page =
554                                 folio_page(folio ?: newfolio, 0);
555                         spin_unlock(&pcl->lockref.lock);
556                         continue;
557                 }
558                 spin_unlock(&pcl->lockref.lock);
559                 folio_put(folio ?: newfolio);
560         }
561
562         /*
563          * Don't perform in-place I/O if all compressed pages are available in
564          * the managed cache, as the pcluster can be moved to the bypass queue.
565          */
566         if (may_bypass)
567                 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
568 }
569
570 /* (erofs_shrinker) disconnect cached encoded data with pclusters */
571 static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
572                                                struct z_erofs_pcluster *pcl)
573 {
574         unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
575         struct folio *folio;
576         int i;
577
578         DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
579         /* Each cached folio contains one page unless bs > ps is supported */
580         for (i = 0; i < pclusterpages; ++i) {
581                 if (pcl->compressed_bvecs[i].page) {
582                         folio = page_folio(pcl->compressed_bvecs[i].page);
583                         /* Avoid reclaiming or migrating this folio */
584                         if (!folio_trylock(folio))
585                                 return -EBUSY;
586
587                         if (!erofs_folio_is_managed(sbi, folio))
588                                 continue;
589                         pcl->compressed_bvecs[i].page = NULL;
590                         folio_detach_private(folio);
591                         folio_unlock(folio);
592                 }
593         }
594         return 0;
595 }
596
597 static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
598 {
599         struct z_erofs_pcluster *pcl = folio_get_private(folio);
600         struct z_erofs_bvec *bvec = pcl->compressed_bvecs;
601         struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl);
602         bool ret;
603
604         if (!folio_test_private(folio))
605                 return true;
606
607         ret = false;
608         spin_lock(&pcl->lockref.lock);
609         if (pcl->lockref.count <= 0) {
610                 DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
611                 for (; bvec < end; ++bvec) {
612                         if (bvec->page && page_folio(bvec->page) == folio) {
613                                 bvec->page = NULL;
614                                 folio_detach_private(folio);
615                                 ret = true;
616                                 break;
617                         }
618                 }
619         }
620         spin_unlock(&pcl->lockref.lock);
621         return ret;
622 }
623
624 /*
625  * It will be called only on inode eviction. In case that there are still some
626  * decompression requests in progress, wait with rescheduling for a bit here.
627  * An extra lock could be introduced instead but it seems unnecessary.
628  */
629 static void z_erofs_cache_invalidate_folio(struct folio *folio,
630                                            size_t offset, size_t length)
631 {
632         const size_t stop = length + offset;
633
634         /* Check for potential overflow in debug mode */
635         DBG_BUGON(stop > folio_size(folio) || stop < length);
636
637         if (offset == 0 && stop == folio_size(folio))
638                 while (!z_erofs_cache_release_folio(folio, 0))
639                         cond_resched();
640 }
641
642 static const struct address_space_operations z_erofs_cache_aops = {
643         .release_folio = z_erofs_cache_release_folio,
644         .invalidate_folio = z_erofs_cache_invalidate_folio,
645 };
646
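/*
 * The managed cache is a filesystem-internal pseudo inode; its page cache
 * holds compressed folios (see erofs_folio_is_managed() above) so they can
 * be reused across read requests.
 */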
647 int erofs_init_managed_cache(struct super_block *sb)
648 {
649         struct inode *const inode = new_inode(sb);
650
651         if (!inode)
652                 return -ENOMEM;
653
654         set_nlink(inode, 1);
655         inode->i_size = OFFSET_MAX;
656         inode->i_mapping->a_ops = &z_erofs_cache_aops;
657         mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
658         EROFS_SB(sb)->managed_cache = inode;
659         return 0;
660 }
661
662 /* callers must hold the pcluster lock */
663 static int z_erofs_attach_page(struct z_erofs_frontend *fe,
664                                struct z_erofs_bvec *bvec, bool exclusive)
665 {
666         struct z_erofs_pcluster *pcl = fe->pcl;
667         int ret;
668
669         if (exclusive) {
670                 /* give in-place I/O priority so that file pages are used first */
671                 spin_lock(&pcl->lockref.lock);
672                 while (fe->icur > 0) {
673                         if (pcl->compressed_bvecs[--fe->icur].page)
674                                 continue;
675                         pcl->compressed_bvecs[fe->icur] = *bvec;
676                         spin_unlock(&pcl->lockref.lock);
677                         return 0;
678                 }
679                 spin_unlock(&pcl->lockref.lock);
680
681                 /* otherwise, check if it can be used as a bvpage */
682                 if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
683                     !fe->candidate_bvpage)
684                         fe->candidate_bvpage = bvec->page;
685         }
686         ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
687                                    &fe->pagepool);
688         fe->pcl->vcnt += (ret >= 0);
689         return ret;
690 }
691
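/*
 * Grab a reference unless the pcluster is already dead.  Reviving a
 * zero-count (but not yet dead) pcluster takes it back off the shrinker's
 * accounting, hence the erofs_global_shrink_cnt decrement.
 */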
692 static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
693 {
694         if (lockref_get_not_zero(&pcl->lockref))
695                 return true;
696
697         spin_lock(&pcl->lockref.lock);
698         if (__lockref_is_dead(&pcl->lockref)) {
699                 spin_unlock(&pcl->lockref.lock);
700                 return false;
701         }
702
703         if (!pcl->lockref.count++)
704                 atomic_long_dec(&erofs_global_shrink_cnt);
705         spin_unlock(&pcl->lockref.lock);
706         return true;
707 }
708
709 static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
710 {
711         struct erofs_map_blocks *map = &fe->map;
712         struct super_block *sb = fe->inode->i_sb;
713         struct erofs_sb_info *sbi = EROFS_SB(sb);
714         bool ztailpacking = map->m_flags & EROFS_MAP_META;
715         struct z_erofs_pcluster *pcl, *pre;
716         int err;
717
718         if (!(map->m_flags & EROFS_MAP_ENCODED) ||
719             (!ztailpacking && !erofs_blknr(sb, map->m_pa))) {
720                 DBG_BUGON(1);
721                 return -EFSCORRUPTED;
722         }
723
724         /* no available pcluster, let's allocate one */
725         pcl = z_erofs_alloc_pcluster(map->m_plen);
726         if (IS_ERR(pcl))
727                 return PTR_ERR(pcl);
728
729         lockref_init(&pcl->lockref); /* one ref for this request */
730         pcl->algorithmformat = map->m_algorithmformat;
731         pcl->length = 0;
732         pcl->partial = true;
733         pcl->next = fe->head;
734         pcl->pageofs_out = map->m_la & ~PAGE_MASK;
735         fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
736
737         /*
738          * Lock the pcluster before it becomes visible to others;
739          * mutex_trylock *never* fails for a brand-new pcluster.
740          */
741         mutex_init(&pcl->lock);
742         DBG_BUGON(!mutex_trylock(&pcl->lock));
743
744         if (ztailpacking) {
745                 pcl->index = 0;         /* which indicates ztailpacking */
746         } else {
747                 pcl->index = erofs_blknr(sb, map->m_pa);
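                /*
                 * Insert the new pcluster into managed_pslots unless another
                 * one already owns this index; if the existing one is dying
                 * (its refcount cannot be grabbed), back off and retry until
                 * it disappears from the tree.
                 */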
748                 while (1) {
749                         xa_lock(&sbi->managed_pslots);
750                         pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
751                                            NULL, pcl, GFP_KERNEL);
752                         if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
753                                 xa_unlock(&sbi->managed_pslots);
754                                 break;
755                         }
756                         /* try to legitimize the current in-tree one */
757                         xa_unlock(&sbi->managed_pslots);
758                         cond_resched();
759                 }
760                 if (xa_is_err(pre)) {
761                         err = xa_err(pre);
762                         goto err_out;
763                 } else if (pre) {
764                         fe->pcl = pre;
765                         err = -EEXIST;
766                         goto err_out;
767                 }
768         }
769         fe->head = fe->pcl = pcl;
770         return 0;
771
772 err_out:
773         mutex_unlock(&pcl->lock);
774         z_erofs_free_pcluster(pcl);
775         return err;
776 }
777
778 static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
779 {
780         struct erofs_map_blocks *map = &fe->map;
781         struct super_block *sb = fe->inode->i_sb;
782         erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
783         struct z_erofs_pcluster *pcl = NULL;
784         int ret;
785
786         DBG_BUGON(fe->pcl);
787         /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
788         DBG_BUGON(!fe->head);
789
790         if (!(map->m_flags & EROFS_MAP_META)) {
791                 while (1) {
792                         rcu_read_lock();
793                         pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
794                         if (!pcl || z_erofs_get_pcluster(pcl)) {
795                                 DBG_BUGON(pcl && blknr != pcl->index);
796                                 rcu_read_unlock();
797                                 break;
798                         }
799                         rcu_read_unlock();
800                 }
801         } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
802                 DBG_BUGON(1);
803                 return -EFSCORRUPTED;
804         }
805
806         if (pcl) {
807                 fe->pcl = pcl;
808                 ret = -EEXIST;
809         } else {
810                 ret = z_erofs_register_pcluster(fe);
811         }
812
813         if (ret == -EEXIST) {
814                 mutex_lock(&fe->pcl->lock);
815                 /* check if this pcluster hasn't been linked into any chain. */
816                 if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) {
817                         /* .. so it can be attached to our submission chain */
818                         fe->head = fe->pcl;
819                         fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
820                 } else {        /* otherwise, it belongs to an inflight chain */
821                         fe->mode = Z_EROFS_PCLUSTER_INFLIGHT;
822                 }
823         } else if (ret) {
824                 return ret;
825         }
826
827         z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
828                                 Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
829         if (!z_erofs_is_inline_pcluster(fe->pcl)) {
830                 /* bind cache first when cached decompression is preferred */
831                 z_erofs_bind_cache(fe);
832         } else {
833                 void *mptr;
834
835                 mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP);
836                 if (IS_ERR(mptr)) {
837                         ret = PTR_ERR(mptr);
838                         erofs_err(sb, "failed to get inline data %d", ret);
839                         return ret;
840                 }
841                 get_page(map->buf.page);
842                 WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
843                 fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
844                 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
845         }
846         /* file-backed in-place I/O pages are traversed in reverse order */
847         fe->icur = z_erofs_pclusterpages(fe->pcl);
848         return 0;
849 }
850
851 static void z_erofs_rcu_callback(struct rcu_head *head)
852 {
853         z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu));
854 }
855
856 static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
857                                           struct z_erofs_pcluster *pcl)
858 {
859         if (pcl->lockref.count)
860                 return false;
861
862         /*
863          * Note that all cached folios should be detached before deleted from
864          * the XArray.  Otherwise some folios could be still attached to the
865          * orphan old pcluster when the new one is available in the tree.
866          */
867         if (erofs_try_to_free_all_cached_folios(sbi, pcl))
868                 return false;
869
870         /*
871          * It's impossible to fail after the pcluster is frozen, but in order
872          * to catch potential race conditions, add a DBG_BUGON to observe this.
873          */
874         DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
875
876         lockref_mark_dead(&pcl->lockref);
877         return true;
878 }
879
880 static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
881                                           struct z_erofs_pcluster *pcl)
882 {
883         bool free;
884
885         spin_lock(&pcl->lockref.lock);
886         free = __erofs_try_to_release_pcluster(sbi, pcl);
887         spin_unlock(&pcl->lockref.lock);
888         if (free) {
889                 atomic_long_dec(&erofs_global_shrink_cnt);
890                 call_rcu(&pcl->rcu, z_erofs_rcu_callback);
891         }
892         return free;
893 }
894
895 unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr)
896 {
897         struct z_erofs_pcluster *pcl;
898         unsigned long index, freed = 0;
899
900         xa_lock(&sbi->managed_pslots);
901         xa_for_each(&sbi->managed_pslots, index, pcl) {
902                 /* try to shrink each valid pcluster */
903                 if (!erofs_try_to_release_pcluster(sbi, pcl))
904                         continue;
905                 xa_unlock(&sbi->managed_pslots);
906
907                 ++freed;
908                 if (!--nr)
909                         return freed;
910                 xa_lock(&sbi->managed_pslots);
911         }
912         xa_unlock(&sbi->managed_pslots);
913         return freed;
914 }
915
916 static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
917                 struct z_erofs_pcluster *pcl, bool try_free)
918 {
919         bool free = false;
920
921         if (lockref_put_or_lock(&pcl->lockref))
922                 return;
923
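        /*
         * lockref_put_or_lock() refused to drop the last reference, so the
         * count is about to reach zero and lockref.lock is now held here.
         */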
924         DBG_BUGON(__lockref_is_dead(&pcl->lockref));
925         if (!--pcl->lockref.count) {
926                 if (try_free && xa_trylock(&sbi->managed_pslots)) {
927                         free = __erofs_try_to_release_pcluster(sbi, pcl);
928                         xa_unlock(&sbi->managed_pslots);
929                 }
930                 atomic_long_add(!free, &erofs_global_shrink_cnt);
931         }
932         spin_unlock(&pcl->lockref.lock);
933         if (free)
934                 call_rcu(&pcl->rcu, z_erofs_rcu_callback);
935 }
936
937 static void z_erofs_pcluster_end(struct z_erofs_frontend *fe)
938 {
939         struct z_erofs_pcluster *pcl = fe->pcl;
940
941         if (!pcl)
942                 return;
943
944         z_erofs_bvec_iter_end(&fe->biter);
945         mutex_unlock(&pcl->lock);
946
947         if (fe->candidate_bvpage)
948                 fe->candidate_bvpage = NULL;
949
950         /* Drop refcount if it doesn't belong to our processing chain */
951         if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
952                 z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
953         fe->pcl = NULL;
954 }
955
956 static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
957                         unsigned int cur, unsigned int end, erofs_off_t pos)
958 {
959         struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
960         struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
961         unsigned int cnt;
962         u8 *src;
963
964         if (!packed_inode)
965                 return -EFSCORRUPTED;
966
967         buf.mapping = packed_inode->i_mapping;
968         for (; cur < end; cur += cnt, pos += cnt) {
969                 cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos));
970                 src = erofs_bread(&buf, pos, EROFS_KMAP);
971                 if (IS_ERR(src)) {
972                         erofs_put_metabuf(&buf);
973                         return PTR_ERR(src);
974                 }
975                 memcpy_to_folio(folio, cur, src, cnt);
976         }
977         erofs_put_metabuf(&buf);
978         return 0;
979 }
980
981 static int z_erofs_scan_folio(struct z_erofs_frontend *f,
982                               struct folio *folio, bool ra)
983 {
984         struct inode *const inode = f->inode;
985         struct erofs_map_blocks *const map = &f->map;
986         const loff_t offset = folio_pos(folio);
987         const unsigned int bs = i_blocksize(inode);
988         unsigned int end = folio_size(folio), split = 0, cur, pgs;
989         bool tight, excl;
990         int err = 0;
991
992         tight = (bs == PAGE_SIZE);
993         erofs_onlinefolio_init(folio);
994         do {
995                 if (offset + end - 1 < map->m_la ||
996                     offset + end - 1 >= map->m_la + map->m_llen) {
997                         z_erofs_pcluster_end(f);
998                         map->m_la = offset + end - 1;
999                         map->m_llen = 0;
1000                         err = z_erofs_map_blocks_iter(inode, map, 0);
1001                         if (err)
1002                                 break;
1003                 }
1004
1005                 cur = offset > map->m_la ? 0 : map->m_la - offset;
1006                 pgs = round_down(cur, PAGE_SIZE);
1007                 /* bump split parts first to avoid several separate cases */
1008                 ++split;
1009
1010                 if (!(map->m_flags & EROFS_MAP_MAPPED)) {
1011                         folio_zero_segment(folio, cur, end);
1012                         tight = false;
1013                 } else if (map->m_flags & EROFS_MAP_FRAGMENT) {
1014                         erofs_off_t fpos = offset + cur - map->m_la;
1015
1016                         err = z_erofs_read_fragment(inode->i_sb, folio, cur,
1017                                         cur + min(map->m_llen - fpos, end - cur),
1018                                         EROFS_I(inode)->z_fragmentoff + fpos);
1019                         if (err)
1020                                 break;
1021                         tight = false;
1022                 } else {
1023                         if (!f->pcl) {
1024                                 err = z_erofs_pcluster_begin(f);
1025                                 if (err)
1026                                         break;
1027                                 f->pcl->besteffort |= !ra;
1028                         }
1029
1030                         pgs = round_down(end - 1, PAGE_SIZE);
1031                         /*
1032                          * Ensure this partial page belongs to this submit chain rather
1033                          * than to other concurrent submit chains or the no-I/O (bypass)
1034                          * chain: those chains are handled asynchronously, so the page
1035                          * cannot be used for in-place I/O or as a bvpage (which must be
1036                          * processed in strict order).
1037                          */
1038                         tight &= (f->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
1039                         excl = false;
1040                         if (cur <= pgs) {
1041                                 excl = (split <= 1) || tight;
1042                                 cur = pgs;
1043                         }
1044
1045                         err = z_erofs_attach_page(f, &((struct z_erofs_bvec) {
1046                                 .page = folio_page(folio, pgs >> PAGE_SHIFT),
1047                                 .offset = offset + pgs - map->m_la,
1048                                 .end = end - pgs, }), excl);
1049                         if (err)
1050                                 break;
1051
1052                         erofs_onlinefolio_split(folio);
1053                         if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
1054                                 f->pcl->multibases = true;
1055                         if (f->pcl->length < offset + end - map->m_la) {
1056                                 f->pcl->length = offset + end - map->m_la;
1057                                 f->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
1058                         }
1059                         if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
1060                             !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
1061                             f->pcl->length == map->m_llen)
1062                                 f->pcl->partial = false;
1063                 }
1064                 /* shorten the remaining extent to update progress */
1065                 map->m_llen = offset + cur - map->m_la;
1066                 map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
1067                 if (cur <= pgs) {
1068                         split = cur < pgs;
1069                         tight = (bs == PAGE_SIZE);
1070                 }
1071         } while ((end = cur) > 0);
1072         erofs_onlinefolio_end(folio, err);
1073         return err;
1074 }
1075
1076 static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
1077                                        unsigned int readahead_pages)
1078 {
1079         /* auto: enable for read_folio, disable for readahead */
1080         if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
1081             !readahead_pages)
1082                 return true;
1083
1084         if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
1085             (readahead_pages <= sbi->opt.max_sync_decompress_pages))
1086                 return true;
1087
1088         return false;
1089 }
1090
1091 static bool z_erofs_page_is_invalidated(struct page *page)
1092 {
1093         return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page);
1094 }
1095
1096 struct z_erofs_backend {
1097         struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
1098         struct super_block *sb;
1099         struct z_erofs_pcluster *pcl;
1100
1101         /* pages with the longest decompressed length for deduplication */
1102         struct page **decompressed_pages;
1103         /* pages to keep the compressed data */
1104         struct page **compressed_pages;
1105
1106         struct list_head decompressed_secondary_bvecs;
1107         struct page **pagepool;
1108         unsigned int onstack_used, nr_pages;
1109 };
1110
1111 struct z_erofs_bvec_item {
1112         struct z_erofs_bvec bvec;
1113         struct list_head list;
1114 };
1115
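/*
 * A bvec that starts page-aligned in the decompressed output and either fills
 * a whole page or ends the pcluster can be decompressed into directly, so it
 * is recorded in decompressed_pages[].  Everything else (including duplicates
 * for an already-taken slot) goes onto the secondary list and is copied
 * afterwards by z_erofs_fill_other_copies().
 */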
1116 static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be,
1117                                          struct z_erofs_bvec *bvec)
1118 {
1119         struct z_erofs_bvec_item *item;
1120         unsigned int pgnr;
1121
1122         if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
1123             (bvec->end == PAGE_SIZE ||
1124              bvec->offset + bvec->end == be->pcl->length)) {
1125                 pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
1126                 DBG_BUGON(pgnr >= be->nr_pages);
1127                 if (!be->decompressed_pages[pgnr]) {
1128                         be->decompressed_pages[pgnr] = bvec->page;
1129                         return;
1130                 }
1131         }
1132
1133         /* (cold path) one pcluster is requested multiple times */
1134         item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
1135         item->bvec = *bvec;
1136         list_add(&item->list, &be->decompressed_secondary_bvecs);
1137 }
1138
1139 static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
1140 {
1141         unsigned int off0 = be->pcl->pageofs_out;
1142         struct list_head *p, *n;
1143
1144         list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
1145                 struct z_erofs_bvec_item *bvi;
1146                 unsigned int end, cur;
1147                 void *dst, *src;
1148
1149                 bvi = container_of(p, struct z_erofs_bvec_item, list);
1150                 cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
1151                 end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
1152                             bvi->bvec.end);
1153                 dst = kmap_local_page(bvi->bvec.page);
1154                 while (cur < end) {
1155                         unsigned int pgnr, scur, len;
1156
1157                         pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
1158                         DBG_BUGON(pgnr >= be->nr_pages);
1159
1160                         scur = bvi->bvec.offset + cur -
1161                                         ((pgnr << PAGE_SHIFT) - off0);
1162                         len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
1163                         if (!be->decompressed_pages[pgnr]) {
1164                                 err = -EFSCORRUPTED;
1165                                 cur += len;
1166                                 continue;
1167                         }
1168                         src = kmap_local_page(be->decompressed_pages[pgnr]);
1169                         memcpy(dst + cur, src + scur, len);
1170                         kunmap_local(src);
1171                         cur += len;
1172                 }
1173                 kunmap_local(dst);
1174                 erofs_onlinefolio_end(page_folio(bvi->bvec.page), err);
1175                 list_del(p);
1176                 kfree(bvi);
1177         }
1178 }
1179
1180 static void z_erofs_parse_out_bvecs(struct z_erofs_backend *be)
1181 {
1182         struct z_erofs_pcluster *pcl = be->pcl;
1183         struct z_erofs_bvec_iter biter;
1184         struct page *old_bvpage;
1185         int i;
1186
1187         z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
1188         for (i = 0; i < pcl->vcnt; ++i) {
1189                 struct z_erofs_bvec bvec;
1190
1191                 z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
1192
1193                 if (old_bvpage)
1194                         z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
1195
1196                 DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
1197                 z_erofs_do_decompressed_bvec(be, &bvec);
1198         }
1199
1200         old_bvpage = z_erofs_bvec_iter_end(&biter);
1201         if (old_bvpage)
1202                 z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
1203 }
1204
1205 static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped)
1206 {
1207         struct z_erofs_pcluster *pcl = be->pcl;
1208         unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
1209         int i, err = 0;
1210
1211         *overlapped = false;
1212         for (i = 0; i < pclusterpages; ++i) {
1213                 struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
1214                 struct page *page = bvec->page;
1215
1216                 /* compressed data ought to be valid when decompressing */
1217                 if (IS_ERR(page) || !page) {
1218                         bvec->page = NULL;      /* clear the failure reason */
1219                         err = page ? PTR_ERR(page) : -EIO;
1220                         continue;
1221                 }
1222                 be->compressed_pages[i] = page;
1223
1224                 if (z_erofs_is_inline_pcluster(pcl) ||
1225                     erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
1226                         if (!PageUptodate(page))
1227                                 err = -EIO;
1228                         continue;
1229                 }
1230
1231                 DBG_BUGON(z_erofs_page_is_invalidated(page));
1232                 if (z_erofs_is_shortlived_page(page))
1233                         continue;
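                /*
                 * The remaining pages are file-backed folios used for
                 * in-place I/O: they double as decompressed output, so the
                 * decompressor must be told that its buffers overlap.
                 */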
1234                 z_erofs_do_decompressed_bvec(be, bvec);
1235                 *overlapped = true;
1236         }
1237         return err;
1238 }
1239
1240 static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
1241 {
1242         struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
1243         struct z_erofs_pcluster *pcl = be->pcl;
1244         unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
1245         const struct z_erofs_decompressor *decomp =
1246                                 z_erofs_decomp[pcl->algorithmformat];
1247         int i, j, jtop, err2;
1248         struct page *page;
1249         bool overlapped;
1250         bool try_free = true;
1251
1252         mutex_lock(&pcl->lock);
1253         be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
1254
1255         /* allocate (de)compressed page arrays if cannot be kept on stack */
1256         be->decompressed_pages = NULL;
1257         be->compressed_pages = NULL;
1258         be->onstack_used = 0;
1259         if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
1260                 be->decompressed_pages = be->onstack_pages;
1261                 be->onstack_used = be->nr_pages;
1262                 memset(be->decompressed_pages, 0,
1263                        sizeof(struct page *) * be->nr_pages);
1264         }
1265
1266         if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
1267                 be->compressed_pages = be->onstack_pages + be->onstack_used;
1268
1269         if (!be->decompressed_pages)
1270                 be->decompressed_pages =
1271                         kvcalloc(be->nr_pages, sizeof(struct page *),
1272                                  GFP_KERNEL | __GFP_NOFAIL);
1273         if (!be->compressed_pages)
1274                 be->compressed_pages =
1275                         kvcalloc(pclusterpages, sizeof(struct page *),
1276                                  GFP_KERNEL | __GFP_NOFAIL);
1277
1278         z_erofs_parse_out_bvecs(be);
1279         err2 = z_erofs_parse_in_bvecs(be, &overlapped);
1280         if (err2)
1281                 err = err2;
1282         if (!err)
1283                 err = decomp->decompress(&(struct z_erofs_decompress_req) {
1284                                         .sb = be->sb,
1285                                         .in = be->compressed_pages,
1286                                         .out = be->decompressed_pages,
1287                                         .pageofs_in = pcl->pageofs_in,
1288                                         .pageofs_out = pcl->pageofs_out,
1289                                         .inputsize = pcl->pclustersize,
1290                                         .outputsize = pcl->length,
1291                                         .alg = pcl->algorithmformat,
1292                                         .inplace_io = overlapped,
1293                                         .partial_decoding = pcl->partial,
1294                                         .fillgaps = pcl->multibases,
1295                                         .gfp = pcl->besteffort ? GFP_KERNEL :
1296                                                 GFP_NOWAIT | __GFP_NORETRY
1297                                  }, be->pagepool);
1298
1299         /* must handle all compressed pages before actual file pages */
1300         if (z_erofs_is_inline_pcluster(pcl)) {
1301                 page = pcl->compressed_bvecs[0].page;
1302                 WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
1303                 put_page(page);
1304         } else {
1305                 /* managed folios are still left in compressed_bvecs[] */
1306                 for (i = 0; i < pclusterpages; ++i) {
1307                         page = be->compressed_pages[i];
1308                         if (!page)
1309                                 continue;
1310                         if (erofs_folio_is_managed(sbi, page_folio(page))) {
1311                                 try_free = false;
1312                                 continue;
1313                         }
1314                         (void)z_erofs_put_shortlivedpage(be->pagepool, page);
1315                         WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
1316                 }
1317         }
1318         if (be->compressed_pages < be->onstack_pages ||
1319             be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
1320                 kvfree(be->compressed_pages);
1321
1322         jtop = 0;
1323         z_erofs_fill_other_copies(be, err);
1324         for (i = 0; i < be->nr_pages; ++i) {
1325                 page = be->decompressed_pages[i];
1326                 if (!page)
1327                         continue;
1328
1329                 DBG_BUGON(z_erofs_page_is_invalidated(page));
1330                 if (!z_erofs_is_shortlived_page(page)) {
1331                         erofs_onlinefolio_end(page_folio(page), err);
1332                         continue;
1333                 }
1334                 if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
1335                         erofs_pagepool_add(be->pagepool, page);
1336                         continue;
1337                 }
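                /*
                 * LZ4 may reference the same short-lived bounce page from
                 * several output slots; gather each page only once at the
                 * front of the array (jtop) before recycling them below.
                 */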
1338                 for (j = 0; j < jtop && be->decompressed_pages[j] != page; ++j)
1339                         ;
1340                 if (j >= jtop)  /* this bounce page is newly detected */
1341                         be->decompressed_pages[jtop++] = page;
1342         }
1343         while (jtop)
1344                 erofs_pagepool_add(be->pagepool,
1345                                    be->decompressed_pages[--jtop]);
1346         if (be->decompressed_pages != be->onstack_pages)
1347                 kvfree(be->decompressed_pages);
1348
1349         pcl->length = 0;
1350         pcl->partial = true;
1351         pcl->multibases = false;
1352         pcl->besteffort = false;
1353         pcl->bvset.nextpage = NULL;
1354         pcl->vcnt = 0;
1355
1356         /* pcluster lock MUST be taken before the following line */
1357         WRITE_ONCE(pcl->next, NULL);
1358         mutex_unlock(&pcl->lock);
1359
1360         if (z_erofs_is_inline_pcluster(pcl))
1361                 z_erofs_free_pcluster(pcl);
1362         else
1363                 z_erofs_put_pcluster(sbi, pcl, try_free);
1364         return err;
1365 }
1366
1367 static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
1368                                     struct page **pagepool)
1369 {
1370         struct z_erofs_backend be = {
1371                 .sb = io->sb,
1372                 .pagepool = pagepool,
1373                 .decompressed_secondary_bvecs =
1374                         LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
1375                 .pcl = io->head,
1376         };
1377         struct z_erofs_pcluster *next;
1378         int err = io->eio ? -EIO : 0;
1379
1380         for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) {
1381                 DBG_BUGON(!be.pcl);
1382                 next = READ_ONCE(be.pcl->next);
1383                 err = z_erofs_decompress_pcluster(&be, err) ?: err;
1384         }
1385         return err;
1386 }
1387
1388 static void z_erofs_decompressqueue_work(struct work_struct *work)
1389 {
1390         struct z_erofs_decompressqueue *bgq =
1391                 container_of(work, struct z_erofs_decompressqueue, u.work);
1392         struct page *pagepool = NULL;
1393
1394         DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
1395         z_erofs_decompress_queue(bgq, &pagepool);
1396         erofs_release_pages(&pagepool);
1397         kvfree(bgq);
1398 }
1399
1400 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1401 static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
1402 {
1403         z_erofs_decompressqueue_work((struct work_struct *)work);
1404 }
1405 #endif
1406
1407 static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
1408                                        int bios)
1409 {
1410         struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
1411
1412         /* wake up the caller thread for sync decompression */
1413         if (io->sync) {
1414                 if (!atomic_add_return(bios, &io->pending_bios))
1415                         complete(&io->u.done);
1416                 return;
1417         }
1418
1419         if (atomic_add_return(bios, &io->pending_bios))
1420                 return;
1421         /* Only in atomic contexts: defer to (kthread_)work and enable sync decompression */
1422         if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
1423 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1424                 struct kthread_worker *worker;
1425
1426                 rcu_read_lock();
1427                 worker = rcu_dereference(
1428                                 z_erofs_pcpu_workers[raw_smp_processor_id()]);
1429                 if (!worker) {
1430                         INIT_WORK(&io->u.work, z_erofs_decompressqueue_work);
1431                         queue_work(z_erofs_workqueue, &io->u.work);
1432                 } else {
1433                         kthread_queue_work(worker, &io->u.kthread_work);
1434                 }
1435                 rcu_read_unlock();
1436 #else
1437                 queue_work(z_erofs_workqueue, &io->u.work);
1438 #endif
1439                 /* enable sync decompression for readahead */
1440                 if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
1441                         sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
1442                 return;
1443         }
1444         z_erofs_decompressqueue_work(&io->u.work);
1445 }
1446
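/*
 * Pick the page backing compressed slot `nr` for I/O: reuse an in-place or
 * managed-cache folio when one is available, otherwise allocate a page and
 * try to add it to the managed cache.  bvec->bv_page is left NULL when the
 * cached folio is already uptodate (no read needed) or allocation fails.
 */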
1447 static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
1448                                  struct z_erofs_frontend *f,
1449                                  struct z_erofs_pcluster *pcl,
1450                                  unsigned int nr,
1451                                  struct address_space *mc)
1452 {
1453         gfp_t gfp = mapping_gfp_mask(mc);
1454         bool tocache = false;
1455         struct z_erofs_bvec zbv;
1456         struct address_space *mapping;
1457         struct folio *folio;
1458         struct page *page;
1459         int bs = i_blocksize(f->inode);
1460
1461         /* Except for inplace folios, the entire folio can be used for I/Os */
1462         bvec->bv_offset = 0;
1463         bvec->bv_len = PAGE_SIZE;
1464 repeat:
1465         spin_lock(&pcl->lockref.lock);
1466         zbv = pcl->compressed_bvecs[nr];
1467         spin_unlock(&pcl->lockref.lock);
1468         if (!zbv.page)
1469                 goto out_allocfolio;
1470
1471         bvec->bv_page = zbv.page;
1472         DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page));
1473
1474         folio = page_folio(zbv.page);
1475         /* For preallocated managed folios, add them to page cache here */
1476         if (folio->private == Z_EROFS_PREALLOCATED_FOLIO) {
1477                 tocache = true;
1478                 goto out_tocache;
1479         }
1480
1481         mapping = READ_ONCE(folio->mapping);
1482         /*
1483          * File-backed folios used for in-place I/O stay locked for the whole
1484          * I/O, so `mapping` cannot be NULL here.
1485          */
1486         if (mapping && mapping != mc) {
1487                 if (zbv.offset < 0)
1488                         bvec->bv_offset = round_up(-zbv.offset, bs);
1489                 bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
1490                 return;
1491         }
1492
1493         folio_lock(folio);
1494         if (likely(folio->mapping == mc)) {
1495                 /*
1496                  * The cached folio is still in managed cache but without
1497                  * a valid `->private` pcluster hint.  Let's reconnect them.
1498                  */
1499                 if (!folio_test_private(folio)) {
1500                         folio_attach_private(folio, pcl);
1501                         /* compressed_bvecs[] already holds a reference taken earlier */
1502                         folio_put(folio);
1503                 }
1504                 if (likely(folio->private == pcl))  {
1505                         /* don't submit cache I/Os again if already uptodate */
1506                         if (folio_test_uptodate(folio)) {
1507                                 folio_unlock(folio);
1508                                 bvec->bv_page = NULL;
1509                         }
1510                         return;
1511                 }
1512                 /*
1513                  * Already linked with another pcluster, which only appears in
1514                  * crafted images by fuzzers for now.  But handle this anyway.
1515                  */
1516                 tocache = false;        /* use temporary short-lived pages */
1517         } else {
1518                 DBG_BUGON(1); /* referenced managed folios can't be truncated */
1519                 tocache = true;
1520         }
1521         folio_unlock(folio);
1522         folio_put(folio);
1523 out_allocfolio:
1524         page = __erofs_allocpage(&f->pagepool, gfp, true);
1525         spin_lock(&pcl->lockref.lock);
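        /*
         * Recheck under the lock: another thread may have installed a page
         * into this slot while we were allocating; if so, recycle ours and
         * retry from the top.
         */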
1526         if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
1527                 if (page)
1528                         erofs_pagepool_add(&f->pagepool, page);
1529                 spin_unlock(&pcl->lockref.lock);
1530                 cond_resched();
1531                 goto repeat;
1532         }
1533         pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
1534         spin_unlock(&pcl->lockref.lock);
1535         bvec->bv_page = page;
1536         if (!page)
1537                 return;
1538         folio = page_folio(page);
1539 out_tocache:
1540         if (!tocache || bs != PAGE_SIZE ||
1541             filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
1542                 /* turn into a temporary shortlived folio (1 ref) */
1543                 folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
1544                 return;
1545         }
1546         folio_attach_private(folio, pcl);
1547         /* drop a refcount added by allocpage (then 2 refs in total here) */
1548         folio_put(folio);
1549 }
1550
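/*
 * Set up a decompression queue: when background completion is allowed,
 * allocate one and initialize its (kthread_)work item; otherwise (or if
 * that allocation fails) use the caller-provided on-stack queue with a
 * completion for synchronous waiting.
 */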
1551 static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
1552                               struct z_erofs_decompressqueue *fgq, bool *fg)
1553 {
1554         struct z_erofs_decompressqueue *q;
1555
1556         if (fg && !*fg) {
1557                 q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
1558                 if (!q) {
1559                         *fg = true;
1560                         goto fg_out;
1561                 }
1562 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1563                 kthread_init_work(&q->u.kthread_work,
1564                                   z_erofs_decompressqueue_kthread_work);
1565 #else
1566                 INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
1567 #endif
1568         } else {
1569 fg_out:
1570                 q = fgq;
1571                 init_completion(&fgq->u.done);
1572                 atomic_set(&fgq->pending_bios, 0);
1573                 q->eio = false;
1574                 q->sync = true;
1575         }
1576         q->sb = sb;
1577         q->head = Z_EROFS_PCLUSTER_TAIL;
1578         return q;
1579 }
1580
1581 /* define decompression jobqueue types */
1582 enum {
1583         JQ_BYPASS,
1584         JQ_SUBMIT,
1585         NR_JOBQUEUES,
1586 };
1587
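/*
 * Splice `pcl` out of the submit chain and append it to the bypass queue,
 * so it is decompressed without issuing any device I/O.
 */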
1588 static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl,
1589                                          struct z_erofs_pcluster *next,
1590                                          struct z_erofs_pcluster **qtail[])
1591 {
1592         WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
1593         WRITE_ONCE(*qtail[JQ_SUBMIT], next);
1594         WRITE_ONCE(*qtail[JQ_BYPASS], pcl);
1595         qtail[JQ_BYPASS] = &pcl->next;
1596 }
1597
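/*
 * Completion handler for compressed read bios: managed-cache folios are
 * marked uptodate on success and unlocked, I/O errors are recorded on the
 * queue, and decompression is kicked off once this bio is accounted for.
 */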
1598 static void z_erofs_endio(struct bio *bio)
1599 {
1600         struct z_erofs_decompressqueue *q = bio->bi_private;
1601         blk_status_t err = bio->bi_status;
1602         struct folio_iter fi;
1603
1604         bio_for_each_folio_all(fi, bio) {
1605                 struct folio *folio = fi.folio;
1606
1607                 DBG_BUGON(folio_test_uptodate(folio));
1608                 DBG_BUGON(z_erofs_page_is_invalidated(&folio->page));
1609                 if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio))
1610                         continue;
1611
1612                 if (!err)
1613                         folio_mark_uptodate(folio);
1614                 folio_unlock(folio);
1615         }
1616         if (err)
1617                 q->eio = true;
1618         z_erofs_decompress_kickoff(q, -1);
1619         if (bio->bi_bdev)
1620                 bio_put(bio);
1621 }
1622
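/*
 * Walk the collected pcluster chain, building read bios for compressed
 * data that must come from the device; physically contiguous extents on
 * the same block device are merged into a single bio.  Inline pclusters
 * and pclusters that end up needing no I/O go to the bypass queue instead.
 */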
1623 static void z_erofs_submit_queue(struct z_erofs_frontend *f,
1624                                  struct z_erofs_decompressqueue *fgq,
1625                                  bool *force_fg, bool readahead)
1626 {
1627         struct super_block *sb = f->inode->i_sb;
1628         struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
1629         struct z_erofs_pcluster **qtail[NR_JOBQUEUES];
1630         struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
1631         struct z_erofs_pcluster *pcl, *next;
1632         /* bio is NULL initially, so no need to initialize last_pa */
1633         erofs_off_t last_pa;
1634         unsigned int nr_bios = 0;
1635         struct bio *bio = NULL;
1636         unsigned long pflags;
1637         int memstall = 0;
1638
1639         /* No need to read from device for pclusters in the bypass queue. */
1640         q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
1641         q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);
1642
1643         qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
1644         qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
1645
1646         /* by default, all pclusters need I/O submission */
1647         q[JQ_SUBMIT]->head = next = f->head;
1648
1649         do {
1650                 struct erofs_map_dev mdev;
1651                 erofs_off_t cur, end;
1652                 struct bio_vec bvec;
1653                 unsigned int i = 0;
1654                 bool bypass = true;
1655
1656                 pcl = next;
1657                 next = READ_ONCE(pcl->next);
1658                 if (z_erofs_is_inline_pcluster(pcl)) {
1659                         z_erofs_move_to_bypass_queue(pcl, next, qtail);
1660                         continue;
1661                 }
1662
1663                 /* no device id here, thus it will always succeed */
1664                 mdev = (struct erofs_map_dev) {
1665                         .m_pa = erofs_pos(sb, pcl->index),
1666                 };
1667                 (void)erofs_map_dev(sb, &mdev);
1668
1669                 cur = mdev.m_pa;
1670                 end = cur + pcl->pclustersize;
1671                 do {
1672                         bvec.bv_page = NULL;
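                        /*
                         * Submit the bio under construction if the next extent
                         * is not physically contiguous with it or sits on a
                         * different device; drain_io is also reached when the
                         * bio is full.
                         */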
1673                         if (bio && (cur != last_pa ||
1674                                     bio->bi_bdev != mdev.m_bdev)) {
1675 drain_io:
1676                                 if (erofs_is_fileio_mode(EROFS_SB(sb)))
1677                                         erofs_fileio_submit_bio(bio);
1678                                 else if (erofs_is_fscache_mode(sb))
1679                                         erofs_fscache_submit_bio(bio);
1680                                 else
1681                                         submit_bio(bio);
1682
1683                                 if (memstall) {
1684                                         psi_memstall_leave(&pflags);
1685                                         memstall = 0;
1686                                 }
1687                                 bio = NULL;
1688                         }
1689
1690                         if (!bvec.bv_page) {
1691                                 z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
1692                                 if (!bvec.bv_page)
1693                                         continue;
1694                                 if (cur + bvec.bv_len > end)
1695                                         bvec.bv_len = end - cur;
1696                                 DBG_BUGON(bvec.bv_len < sb->s_blocksize);
1697                         }
1698
1699                         if (unlikely(PageWorkingset(bvec.bv_page)) &&
1700                             !memstall) {
1701                                 psi_memstall_enter(&pflags);
1702                                 memstall = 1;
1703                         }
1704
1705                         if (!bio) {
1706                                 if (erofs_is_fileio_mode(EROFS_SB(sb)))
1707                                         bio = erofs_fileio_bio_alloc(&mdev);
1708                                 else if (erofs_is_fscache_mode(sb))
1709                                         bio = erofs_fscache_bio_alloc(&mdev);
1710                                 else
1711                                         bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
1712                                                         REQ_OP_READ, GFP_NOIO);
1713                                 bio->bi_end_io = z_erofs_endio;
1714                                 bio->bi_iter.bi_sector = cur >> 9;
1715                                 bio->bi_private = q[JQ_SUBMIT];
1716                                 if (readahead)
1717                                         bio->bi_opf |= REQ_RAHEAD;
1718                                 ++nr_bios;
1719                         }
1720
1721                         if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
1722                                           bvec.bv_offset))
1723                                 goto drain_io;
1724                         last_pa = cur + bvec.bv_len;
1725                         bypass = false;
1726                 } while ((cur += bvec.bv_len) < end);
1727
1728                 if (!bypass)
1729                         qtail[JQ_SUBMIT] = &pcl->next;
1730                 else
1731                         z_erofs_move_to_bypass_queue(pcl, next, qtail);
1732         } while (next != Z_EROFS_PCLUSTER_TAIL);
1733
1734         if (bio) {
1735                 if (erofs_is_fileio_mode(EROFS_SB(sb)))
1736                         erofs_fileio_submit_bio(bio);
1737                 else if (erofs_is_fscache_mode(sb))
1738                         erofs_fscache_submit_bio(bio);
1739                 else
1740                         submit_bio(bio);
1741         }
1742         if (memstall)
1743                 psi_memstall_leave(&pflags);
1744
1745         /*
1746          * Although background decompression is preferred, nothing is pending
1747          * for submission; drop the queue directly instead of kicking it off.
1748          */
1749         if (!*force_fg && !nr_bios) {
1750                 kvfree(q[JQ_SUBMIT]);
1751                 return;
1752         }
1753         z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
1754 }
1755
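/*
 * Submit all collected pclusters and decompress them: the bypass queue is
 * always handled immediately in the caller's context, while the submit
 * queue is either decompressed in the background or, for synchronous
 * decompression, in the caller's context once all bios have completed.
 */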
1756 static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages)
1757 {
1758         struct z_erofs_decompressqueue io[NR_JOBQUEUES];
1759         struct erofs_sb_info *sbi = EROFS_I_SB(f->inode);
1760         bool force_fg = z_erofs_is_sync_decompress(sbi, rapages);
1761         int err;
1762
1763         if (f->head == Z_EROFS_PCLUSTER_TAIL)
1764                 return 0;
1765         z_erofs_submit_queue(f, io, &force_fg, !!rapages);
1766
1767         /* handle the bypass queue (pclusters needing no I/O) immediately */
1768         err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
1769         if (!force_fg)
1770                 return err;
1771
1772         /* wait until all bios are completed */
1773         wait_for_completion_io(&io[JQ_SUBMIT].u.done);
1774
1775         /* handle the synchronous decompression queue in the caller's context */
1776         return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err;
1777 }
1778
1779 /*
1780  * Since partial uptodate is still unimplemented, fall back to approximate
1781  * readmore strategies for now.
1782  */
1783 static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
1784                 struct readahead_control *rac, bool backmost)
1785 {
1786         struct inode *inode = f->inode;
1787         struct erofs_map_blocks *map = &f->map;
1788         erofs_off_t cur, end, headoffset = f->headoffset;
1789         int err;
1790
1791         if (backmost) {
1792                 if (rac)
1793                         end = headoffset + readahead_length(rac) - 1;
1794                 else
1795                         end = headoffset + PAGE_SIZE - 1;
1796                 map->m_la = end;
1797                 err = z_erofs_map_blocks_iter(inode, map,
1798                                               EROFS_GET_BLOCKS_READMORE);
1799                 if (err)
1800                         return;
1801
1802                 /* expand ra for the trailing edge if readahead */
1803                 if (rac) {
1804                         cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
1805                         readahead_expand(rac, headoffset, cur - headoffset);
1806                         return;
1807                 }
1808                 end = round_up(end, PAGE_SIZE);
1809         } else {
1810                 end = round_up(map->m_la, PAGE_SIZE);
1811                 if (!map->m_llen)
1812                         return;
1813         }
1814
1815         cur = map->m_la + map->m_llen - 1;
1816         while ((cur >= end) && (cur < i_size_read(inode))) {
1817                 pgoff_t index = cur >> PAGE_SHIFT;
1818                 struct folio *folio;
1819
1820                 folio = erofs_grab_folio_nowait(inode->i_mapping, index);
1821                 if (!IS_ERR_OR_NULL(folio)) {
1822                         if (folio_test_uptodate(folio))
1823                                 folio_unlock(folio);
1824                         else
1825                                 z_erofs_scan_folio(f, folio, !!rac);
1826                         folio_put(folio);
1827                 }
1828
1829                 if (cur < PAGE_SIZE)
1830                         break;
1831                 cur = (index << PAGE_SHIFT) - 1;
1832         }
1833 }
1834
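/*
 * ->read_folio() entry point: readmore expands around the target folio,
 * the folio itself is scanned, and any gathered pclusters are submitted
 * through z_erofs_runqueue() even if scanning failed.
 */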
1835 static int z_erofs_read_folio(struct file *file, struct folio *folio)
1836 {
1837         struct inode *const inode = folio->mapping->host;
1838         Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio));
1839         int err;
1840
1841         trace_erofs_read_folio(folio, false);
1842         z_erofs_pcluster_readmore(&f, NULL, true);
1843         err = z_erofs_scan_folio(&f, folio, false);
1844         z_erofs_pcluster_readmore(&f, NULL, false);
1845         z_erofs_pcluster_end(&f);
1846
1847         /* if some pclusters are ready, they need to be submitted anyway */
1848         err = z_erofs_runqueue(&f, 0) ?: err;
1849         if (err && err != -EINTR)
1850                 erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
1851                           err, folio->index, EROFS_I(inode)->nid);
1852
1853         erofs_put_metabuf(&f.map.buf);
1854         erofs_release_pages(&f.pagepool);
1855         return err;
1856 }
1857
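/*
 * Readahead entry point: after the readmore pass has (possibly) expanded
 * the request, folios are chained through ->private, scanned, and the
 * whole batch is then submitted at once via z_erofs_runqueue().
 */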
1858 static void z_erofs_readahead(struct readahead_control *rac)
1859 {
1860         struct inode *const inode = rac->mapping->host;
1861         Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac));
1862         struct folio *head = NULL, *folio;
1863         unsigned int nrpages = readahead_count(rac);
1864         int err;
1865
1866         z_erofs_pcluster_readmore(&f, rac, true);
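        /* the readmore pass may have expanded the request, so refresh the count */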
1867         nrpages = readahead_count(rac);
1868         trace_erofs_readpages(inode, readahead_index(rac), nrpages, false);
1869         while ((folio = readahead_folio(rac))) {
1870                 folio->private = head;
1871                 head = folio;
1872         }
1873
1874         /* traverse in reverse order for best metadata I/O performance */
1875         while (head) {
1876                 folio = head;
1877                 head = folio_get_private(folio);
1878
1879                 err = z_erofs_scan_folio(&f, folio, true);
1880                 if (err && err != -EINTR)
1881                         erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
1882                                   folio->index, EROFS_I(inode)->nid);
1883         }
1884         z_erofs_pcluster_readmore(&f, rac, false);
1885         z_erofs_pcluster_end(&f);
1886
1887         (void)z_erofs_runqueue(&f, nrpages);
1888         erofs_put_metabuf(&f.map.buf);
1889         erofs_release_pages(&f.pagepool);
1890 }
1891
1892 const struct address_space_operations z_erofs_aops = {
1893         .read_folio = z_erofs_read_folio,
1894         .readahead = z_erofs_readahead,
1895 };