net/core/page_pool.c
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * page_pool.c
4  *      Author: Jesper Dangaard Brouer <[email protected]>
5  *      Copyright (C) 2016 Red Hat, Inc.
6  */
7
8 #include <linux/types.h>
9 #include <linux/kernel.h>
10 #include <linux/slab.h>
11 #include <linux/device.h>
12
13 #include <net/page_pool.h>
14 #include <net/xdp.h>
15
16 #include <linux/dma-direction.h>
17 #include <linux/dma-mapping.h>
18 #include <linux/page-flags.h>
19 #include <linux/mm.h> /* for put_page() */
20 #include <linux/poison.h>
21 #include <linux/ethtool.h>
22 #include <linux/netdevice.h>
23
24 #include <trace/events/page_pool.h>
25
26 #define DEFER_TIME (msecs_to_jiffies(1000))
27 #define DEFER_WARN_INTERVAL (60 * HZ)
28
29 #define BIAS_MAX        LONG_MAX
30
31 #ifdef CONFIG_PAGE_POOL_STATS
32 /* alloc_stat_inc is intended to be used in softirq context */
33 #define alloc_stat_inc(pool, __stat)    (pool->alloc_stats.__stat++)
34 /* recycle_stat_inc is safe to use when preemption is possible. */
35 #define recycle_stat_inc(pool, __stat)                                                  \
36         do {                                                                            \
37                 struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;       \
38                 this_cpu_inc(s->__stat);                                                \
39         } while (0)
40
41 #define recycle_stat_add(pool, __stat, val)                                             \
42         do {                                                                            \
43                 struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;       \
44                 this_cpu_add(s->__stat, val);                                           \
45         } while (0)
46
47 static const char pp_stats[][ETH_GSTRING_LEN] = {
48         "rx_pp_alloc_fast",
49         "rx_pp_alloc_slow",
50         "rx_pp_alloc_slow_ho",
51         "rx_pp_alloc_empty",
52         "rx_pp_alloc_refill",
53         "rx_pp_alloc_waive",
54         "rx_pp_recycle_cached",
55         "rx_pp_recycle_cache_full",
56         "rx_pp_recycle_ring",
57         "rx_pp_recycle_ring_full",
58         "rx_pp_recycle_released_ref",
59 };
60
61 bool page_pool_get_stats(struct page_pool *pool,
62                          struct page_pool_stats *stats)
63 {
64         int cpu = 0;
65
66         if (!stats)
67                 return false;
68
69         /* The caller is responsible for initializing stats. */
70         stats->alloc_stats.fast += pool->alloc_stats.fast;
71         stats->alloc_stats.slow += pool->alloc_stats.slow;
72         stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
73         stats->alloc_stats.empty += pool->alloc_stats.empty;
74         stats->alloc_stats.refill += pool->alloc_stats.refill;
75         stats->alloc_stats.waive += pool->alloc_stats.waive;
76
77         for_each_possible_cpu(cpu) {
78                 const struct page_pool_recycle_stats *pcpu =
79                         per_cpu_ptr(pool->recycle_stats, cpu);
80
81                 stats->recycle_stats.cached += pcpu->cached;
82                 stats->recycle_stats.cache_full += pcpu->cache_full;
83                 stats->recycle_stats.ring += pcpu->ring;
84                 stats->recycle_stats.ring_full += pcpu->ring_full;
85                 stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
86         }
87
88         return true;
89 }
90 EXPORT_SYMBOL(page_pool_get_stats);
91
92 u8 *page_pool_ethtool_stats_get_strings(u8 *data)
93 {
94         int i;
95
96         for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
97                 memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
98                 data += ETH_GSTRING_LEN;
99         }
100
101         return data;
102 }
103 EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
104
105 int page_pool_ethtool_stats_get_count(void)
106 {
107         return ARRAY_SIZE(pp_stats);
108 }
109 EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
110
111 u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
112 {
113         struct page_pool_stats *pool_stats = stats;
114
115         *data++ = pool_stats->alloc_stats.fast;
116         *data++ = pool_stats->alloc_stats.slow;
117         *data++ = pool_stats->alloc_stats.slow_high_order;
118         *data++ = pool_stats->alloc_stats.empty;
119         *data++ = pool_stats->alloc_stats.refill;
120         *data++ = pool_stats->alloc_stats.waive;
121         *data++ = pool_stats->recycle_stats.cached;
122         *data++ = pool_stats->recycle_stats.cache_full;
123         *data++ = pool_stats->recycle_stats.ring;
124         *data++ = pool_stats->recycle_stats.ring_full;
125         *data++ = pool_stats->recycle_stats.released_refcnt;
126
127         return data;
128 }
129 EXPORT_SYMBOL(page_pool_ethtool_stats_get);
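/* Illustration only, not part of this file: a minimal sketch of how a driver
 * could wire the three ethtool helpers above into its ethtool_ops, assuming a
 * hypothetical "struct my_priv" that keeps one page_pool per RX queue.
 *
 *	static void my_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static int my_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		return sset == ETH_SS_STATS ?
 *		       page_pool_ethtool_stats_get_count() : -EOPNOTSUPP;
 *	}
 *
 *	static void my_get_ethtool_stats(struct net_device *dev,
 *					 struct ethtool_stats *stats, u64 *data)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats pp_stats = { };	// caller must zero-init
 *		int i;
 *
 *		for (i = 0; i < priv->num_rx_queues; i++)
 *			page_pool_get_stats(priv->rx_pools[i], &pp_stats);
 *		data = page_pool_ethtool_stats_get(data, &pp_stats);
 *	}
 */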
130
131 #else
132 #define alloc_stat_inc(pool, __stat)
133 #define recycle_stat_inc(pool, __stat)
134 #define recycle_stat_add(pool, __stat, val)
135 #endif
136
137 static int page_pool_init(struct page_pool *pool,
138                           const struct page_pool_params *params)
139 {
140         unsigned int ring_qsize = 1024; /* Default */
141
142         memcpy(&pool->p, params, sizeof(pool->p));
143
144         /* Validate only known flags were used */
145         if (pool->p.flags & ~(PP_FLAG_ALL))
146                 return -EINVAL;
147
148         if (pool->p.pool_size)
149                 ring_qsize = pool->p.pool_size;
150
151         /* Sanity limit the memory that can be pinned down */
152         if (ring_qsize > 32768)
153                 return -E2BIG;
154
155         /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
156          * DMA_BIDIRECTIONAL allows the page to also be used for DMA transmit,
157          * which is the XDP_TX use-case.
158          */
159         if (pool->p.flags & PP_FLAG_DMA_MAP) {
160                 if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
161                     (pool->p.dma_dir != DMA_BIDIRECTIONAL))
162                         return -EINVAL;
163         }
164
165         if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
166                 /* In order to request DMA-sync-for-device the page
167                  * needs to be mapped
168                  */
169                 if (!(pool->p.flags & PP_FLAG_DMA_MAP))
170                         return -EINVAL;
171
172                 if (!pool->p.max_len)
173                         return -EINVAL;
174
175                 /* pool->p.offset has to be set according to the address
176                  * offset used by the DMA engine to start copying rx data
177                  */
178         }
179
180         if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
181             pool->p.flags & PP_FLAG_PAGE_FRAG)
182                 return -EINVAL;
183
184 #ifdef CONFIG_PAGE_POOL_STATS
185         pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
186         if (!pool->recycle_stats)
187                 return -ENOMEM;
188 #endif
189
190         if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
191                 return -ENOMEM;
192
193         atomic_set(&pool->pages_state_release_cnt, 0);
194
195         /* The driver that calls page_pool_create() must also call page_pool_destroy() */
196         refcount_set(&pool->user_cnt, 1);
197
198         if (pool->p.flags & PP_FLAG_DMA_MAP)
199                 get_device(pool->p.dev);
200
201         return 0;
202 }
203
204 struct page_pool *page_pool_create(const struct page_pool_params *params)
205 {
206         struct page_pool *pool;
207         int err;
208
209         pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
210         if (!pool)
211                 return ERR_PTR(-ENOMEM);
212
213         err = page_pool_init(pool, params);
214         if (err < 0) {
215                 pr_warn("%s() gave up with errno %d\n", __func__, err);
216                 kfree(pool);
217                 return ERR_PTR(err);
218         }
219
220         return pool;
221 }
222 EXPORT_SYMBOL(page_pool_create);
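/* Illustration only, not part of this file: a minimal sketch of creating a
 * pool for one RX ring. "priv->pdev" and "priv->napi" are hypothetical driver
 * fields; the other values are just plausible examples.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= 256,			// must be <= 32768
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &priv->pdev->dev,	// required for PP_FLAG_DMA_MAP
 *		.napi		= &priv->napi,		// optional, enables direct recycling
 *		.dma_dir	= DMA_FROM_DEVICE,	// DMA_BIDIRECTIONAL for XDP_TX
 *		.offset		= XDP_PACKET_HEADROOM,
 *		.max_len	= PAGE_SIZE - XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */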
223
224 static void page_pool_return_page(struct page_pool *pool, struct page *page);
225
226 noinline
227 static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
228 {
229         struct ptr_ring *r = &pool->ring;
230         struct page *page;
231         int pref_nid; /* preferred NUMA node */
232
233         /* Quicker fallback, avoid locks when ring is empty */
234         if (__ptr_ring_empty(r)) {
235                 alloc_stat_inc(pool, empty);
236                 return NULL;
237         }
238
239         /* Softirq guarantees the CPU and thus the NUMA node are stable. This
240          * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
241          */
242 #ifdef CONFIG_NUMA
243         pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
244 #else
245         /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
246         pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
247 #endif
248
249         /* Refill alloc array, but only if NUMA match */
250         do {
251                 page = __ptr_ring_consume(r);
252                 if (unlikely(!page))
253                         break;
254
255                 if (likely(page_to_nid(page) == pref_nid)) {
256                         pool->alloc.cache[pool->alloc.count++] = page;
257                 } else {
258                         /* NUMA mismatch;
259                          * (1) release 1 page to the page-allocator and
260                          * (2) break out to fall through to alloc_pages_node.
261                          * This limits stress on the page buddy allocator.
262                          */
263                         page_pool_return_page(pool, page);
264                         alloc_stat_inc(pool, waive);
265                         page = NULL;
266                         break;
267                 }
268         } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
269
270         /* Return last page */
271         if (likely(pool->alloc.count > 0)) {
272                 page = pool->alloc.cache[--pool->alloc.count];
273                 alloc_stat_inc(pool, refill);
274         }
275
276         return page;
277 }
278
279 /* fast path */
280 static struct page *__page_pool_get_cached(struct page_pool *pool)
281 {
282         struct page *page;
283
284         /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
285         if (likely(pool->alloc.count)) {
286                 /* Fast-path */
287                 page = pool->alloc.cache[--pool->alloc.count];
288                 alloc_stat_inc(pool, fast);
289         } else {
290                 page = page_pool_refill_alloc_cache(pool);
291         }
292
293         return page;
294 }
295
296 static void page_pool_dma_sync_for_device(struct page_pool *pool,
297                                           struct page *page,
298                                           unsigned int dma_sync_size)
299 {
300         dma_addr_t dma_addr = page_pool_get_dma_addr(page);
301
302         dma_sync_size = min(dma_sync_size, pool->p.max_len);
303         dma_sync_single_range_for_device(pool->p.dev, dma_addr,
304                                          pool->p.offset, dma_sync_size,
305                                          pool->p.dma_dir);
306 }
307
308 static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
309 {
310         dma_addr_t dma;
311
312         /* Set up the DMA mapping: use the 'struct page' area for storing the DMA
313          * address, since dma_addr_t can be either 32 or 64 bits and does not always
314          * fit into the page private data (e.g. a 32bit CPU with 64bit DMA caps).
315          * This mapping is kept for the lifetime of the page, until it leaves the pool.
316          */
317         dma = dma_map_page_attrs(pool->p.dev, page, 0,
318                                  (PAGE_SIZE << pool->p.order),
319                                  pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
320                                                   DMA_ATTR_WEAK_ORDERING);
321         if (dma_mapping_error(pool->p.dev, dma))
322                 return false;
323
324         page_pool_set_dma_addr(page, dma);
325
326         if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
327                 page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
328
329         return true;
330 }
331
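/* Illustration only, not part of this file: with PP_FLAG_DMA_MAP the driver
 * never maps pages itself; it reads back the address stored above when
 * filling an RX descriptor. "rxd" and its fields are hypothetical.
 *
 *	dma_addr_t dma = page_pool_get_dma_addr(page);
 *
 *	rxd->addr = cpu_to_le64(dma + pool->p.offset);
 *	rxd->len  = cpu_to_le32(pool->p.max_len);
 */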
332 static void page_pool_set_pp_info(struct page_pool *pool,
333                                   struct page *page)
334 {
335         page->pp = pool;
336         page->pp_magic |= PP_SIGNATURE;
337         if (pool->p.init_callback)
338                 pool->p.init_callback(page, pool->p.init_arg);
339 }
340
341 static void page_pool_clear_pp_info(struct page *page)
342 {
343         page->pp_magic = 0;
344         page->pp = NULL;
345 }
346
347 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
348                                                  gfp_t gfp)
349 {
350         struct page *page;
351
352         gfp |= __GFP_COMP;
353         page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
354         if (unlikely(!page))
355                 return NULL;
356
357         if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
358             unlikely(!page_pool_dma_map(pool, page))) {
359                 put_page(page);
360                 return NULL;
361         }
362
363         alloc_stat_inc(pool, slow_high_order);
364         page_pool_set_pp_info(pool, page);
365
366         /* Track how many pages are held 'in-flight' */
367         pool->pages_state_hold_cnt++;
368         trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
369         return page;
370 }
371
372 /* slow path */
373 noinline
374 static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
375                                                  gfp_t gfp)
376 {
377         const int bulk = PP_ALLOC_CACHE_REFILL;
378         unsigned int pp_flags = pool->p.flags;
379         unsigned int pp_order = pool->p.order;
380         struct page *page;
381         int i, nr_pages;
382
383         /* Don't support bulk alloc for high-order pages */
384         if (unlikely(pp_order))
385                 return __page_pool_alloc_page_order(pool, gfp);
386
387         /* Unnecessary as alloc cache is empty, but guarantees zero count */
388         if (unlikely(pool->alloc.count > 0))
389                 return pool->alloc.cache[--pool->alloc.count];
390
391         /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
392         memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
393
394         nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
395                                                pool->alloc.cache);
396         if (unlikely(!nr_pages))
397                 return NULL;
398
399         /* Pages have been filled into the alloc.cache array, but the count is zero
400          * and the pages have not yet been (possibly) DMA mapped.
401          */
402         for (i = 0; i < nr_pages; i++) {
403                 page = pool->alloc.cache[i];
404                 if ((pp_flags & PP_FLAG_DMA_MAP) &&
405                     unlikely(!page_pool_dma_map(pool, page))) {
406                         put_page(page);
407                         continue;
408                 }
409
410                 page_pool_set_pp_info(pool, page);
411                 pool->alloc.cache[pool->alloc.count++] = page;
412                 /* Track how many pages are held 'in-flight' */
413                 pool->pages_state_hold_cnt++;
414                 trace_page_pool_state_hold(pool, page,
415                                            pool->pages_state_hold_cnt);
416         }
417
418         /* Return last page */
419         if (likely(pool->alloc.count > 0)) {
420                 page = pool->alloc.cache[--pool->alloc.count];
421                 alloc_stat_inc(pool, slow);
422         } else {
423                 page = NULL;
424         }
425
426         /* A page that was just allocated must have refcnt 1. */
427         return page;
428 }
429
430 /* To use page_pool, replace alloc_pages() API calls with this, but the
431  * caller must provide the synchronization guarantee for the allocation side.
432  */
433 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
434 {
435         struct page *page;
436
437         /* Fast-path: Get a page from cache */
438         page = __page_pool_get_cached(pool);
439         if (page)
440                 return page;
441
442         /* Slow-path: cache empty, do real allocation */
443         page = __page_pool_alloc_pages_slow(pool, gfp);
444         return page;
445 }
446 EXPORT_SYMBOL(page_pool_alloc_pages);
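/* Illustration only, not part of this file: a hypothetical NAPI RX refill
 * loop. page_pool_dev_alloc_pages() is the inline wrapper in
 * <net/page_pool.h> that calls this function with GFP_ATOMIC | __GFP_NOWARN.
 * "rxq", "my_ring_space()" and "my_post_rx_buffer()" are placeholders.
 *
 *	while (my_ring_space(rxq)) {
 *		struct page *page = page_pool_dev_alloc_pages(rxq->page_pool);
 *
 *		if (unlikely(!page))
 *			break;			// retry on the next NAPI poll
 *		my_post_rx_buffer(rxq, page);	// write DMA addr into descriptor
 *	}
 */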
447
448 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
449  *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
450  */
451 #define _distance(a, b) (s32)((a) - (b))
452
453 static s32 page_pool_inflight(struct page_pool *pool)
454 {
455         u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
456         u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
457         s32 inflight;
458
459         inflight = _distance(hold_cnt, release_cnt);
460
461         trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
462         WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
463
464         return inflight;
465 }
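/* Worked example of the wrap-safe distance above: if hold_cnt has wrapped
 * around to 5 while release_cnt is still at 4294967290 (U32_MAX - 5), then
 * _distance(5, 4294967290) = (s32)(5 - 4294967290) = 11, i.e. 11 pages are
 * still in flight even though hold_cnt is numerically smaller.
 */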
466
467 /* Disconnects a page from a page_pool.  API users may need to do this
468  * to allow the page to be used as
469  * a regular page (that will eventually be returned to the normal
470  * page-allocator via put_page).
471  */
472 void page_pool_release_page(struct page_pool *pool, struct page *page)
473 {
474         dma_addr_t dma;
475         int count;
476
477         if (!(pool->p.flags & PP_FLAG_DMA_MAP))
478                 /* Always account for inflight pages, even if we didn't
479                  * map them
480                  */
481                 goto skip_dma_unmap;
482
483         dma = page_pool_get_dma_addr(page);
484
485         /* When page is unmapped, it cannot be returned to our pool */
486         dma_unmap_page_attrs(pool->p.dev, dma,
487                              PAGE_SIZE << pool->p.order, pool->p.dma_dir,
488                              DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
489         page_pool_set_dma_addr(page, 0);
490 skip_dma_unmap:
491         page_pool_clear_pp_info(page);
492
493         /* This may be the last page returned, releasing the pool, so
494          * it is not safe to reference pool afterwards.
495          */
496         count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
497         trace_page_pool_state_release(pool, page, count);
498 }
499 EXPORT_SYMBOL(page_pool_release_page);
500
501 /* Return a page to the page allocator, cleaning up our state */
502 static void page_pool_return_page(struct page_pool *pool, struct page *page)
503 {
504         page_pool_release_page(pool, page);
505
506         put_page(page);
507         /* An optimization would be to call __free_pages(page, pool->p.order)
508          * knowing page is not part of page-cache (thus avoiding a
509          * __page_cache_release() call).
510          */
511 }
512
513 static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
514 {
515         int ret;
516         /* BH protection not needed if current is softirq */
517         if (in_softirq())
518                 ret = ptr_ring_produce(&pool->ring, page);
519         else
520                 ret = ptr_ring_produce_bh(&pool->ring, page);
521
522         if (!ret) {
523                 recycle_stat_inc(pool, ring);
524                 return true;
525         }
526
527         return false;
528 }
529
530 /* Only allow direct recycling in special circumstances, into the
531  * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
532  *
533  * Caller must provide appropriate safe context.
534  */
535 static bool page_pool_recycle_in_cache(struct page *page,
536                                        struct page_pool *pool)
537 {
538         if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
539                 recycle_stat_inc(pool, cache_full);
540                 return false;
541         }
542
543         /* Caller MUST have verified/know (page_ref_count(page) == 1) */
544         pool->alloc.cache[pool->alloc.count++] = page;
545         recycle_stat_inc(pool, cached);
546         return true;
547 }
548
549 /* If the page refcnt == 1, this will try to recycle the page.
550  * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
551  * the configured size min(dma_sync_size, pool->max_len).
552  * If the page refcnt != 1, then the page will be returned to the memory
553  * subsystem.
554  */
555 static __always_inline struct page *
556 __page_pool_put_page(struct page_pool *pool, struct page *page,
557                      unsigned int dma_sync_size, bool allow_direct)
558 {
559         /* This allocator is optimized for the XDP mode that uses
560          * one frame per page, but has fallbacks that act like the
561          * regular page allocator APIs.
562          *
563          * refcnt == 1 means page_pool owns the page, and can recycle it.
564          *
565          * A page is NOT reusable when it was allocated while the system
566          * was under memory pressure (page_is_pfmemalloc()).
567          */
568         if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
569                 /* Read barrier done in page_ref_count / READ_ONCE */
570
571                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
572                         page_pool_dma_sync_for_device(pool, page,
573                                                       dma_sync_size);
574
575                 if (allow_direct && in_softirq() &&
576                     page_pool_recycle_in_cache(page, pool))
577                         return NULL;
578
579                 /* Page found as candidate for recycling */
580                 return page;
581         }
582         /* Fallback/non-XDP mode: the API user has an elevated refcnt.
583          *
584          * Many drivers split up the page into fragments, and some
585          * want to keep doing this to save memory and do refcnt-based
586          * recycling. Support this use case too, to ease drivers
587          * switching between XDP/non-XDP.
588          *
589          * In case the page_pool maintains the DMA mapping, the API user
590          * must call page_pool_put_page() once.  In this elevated refcnt
591          * case, the DMA is unmapped/released, as the driver is likely
592          * doing refcnt-based recycling tricks, meaning another entity
593          * will be invoking put_page().
594          */
595         recycle_stat_inc(pool, released_refcnt);
596         /* Do not replace this with page_pool_return_page() */
597         page_pool_release_page(pool, page);
598         put_page(page);
599
600         return NULL;
601 }
602
603 void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
604                                   unsigned int dma_sync_size, bool allow_direct)
605 {
606         page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
607         if (page && !page_pool_recycle_in_ring(pool, page)) {
608                 /* Ring full, fall back to freeing the page */
609                 recycle_stat_inc(pool, ring_full);
610                 page_pool_return_page(pool, page);
611         }
612 }
613 EXPORT_SYMBOL(page_pool_put_defragged_page);
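/* Illustration only, not part of this file: drivers normally reach this
 * function through the inline wrappers in <net/page_pool.h>. A hypothetical
 * XDP RX handler ("act", "pool", "page" are placeholders) might do:
 *
 *	switch (act) {
 *	case XDP_DROP:
 *		// NAPI context: allow direct recycling into pool->alloc.cache
 *		page_pool_recycle_direct(pool, page);
 *		break;
 *	case XDP_ABORTED:
 *	default:
 *		// unmap/sync is handled by the pool before freeing or recycling
 *		page_pool_put_full_page(pool, page, false);
 *		break;
 *	}
 */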
614
615 /* Caller must not use data area after call, as this function overwrites it */
616 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
617                              int count)
618 {
619         int i, bulk_len = 0;
620
621         for (i = 0; i < count; i++) {
622                 struct page *page = virt_to_head_page(data[i]);
623
624                 /* In the page frag case, skip the page if we are not the last user */
625                 if (!page_pool_is_last_frag(pool, page))
626                         continue;
627
628                 page = __page_pool_put_page(pool, page, -1, false);
629                 /* Approved for bulk recycling in ptr_ring cache */
630                 if (page)
631                         data[bulk_len++] = page;
632         }
633
634         if (unlikely(!bulk_len))
635                 return;
636
637         /* Bulk producer into ptr_ring page_pool cache */
638         page_pool_ring_lock(pool);
639         for (i = 0; i < bulk_len; i++) {
640                 if (__ptr_ring_produce(&pool->ring, data[i])) {
641                         /* ring full */
642                         recycle_stat_inc(pool, ring_full);
643                         break;
644                 }
645         }
646         recycle_stat_add(pool, ring, i);
647         page_pool_ring_unlock(pool);
648
649         /* Hopefully all pages were returned into the ptr_ring */
650         if (likely(i == bulk_len))
651                 return;
652
653         /* ptr_ring cache full, free remaining pages outside producer lock
654          * since put_page() with refcnt == 1 can be an expensive operation
655          */
656         for (; i < bulk_len; i++)
657                 page_pool_return_page(pool, data[i]);
658 }
659 EXPORT_SYMBOL(page_pool_put_page_bulk);
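/* Illustration only, not part of this file: the bulk API takes an array of
 * frame *data* pointers (virt_to_head_page() is applied above). A simplified,
 * hypothetical batching scheme for frames known to come from one pool:
 *
 *	#define MY_BULK_SIZE 16			// hypothetical batch size
 *	void *queue[MY_BULK_SIZE];
 *	int n = 0;
 *
 *	// ... for each completed frame:
 *	queue[n++] = xdpf->data;
 *	if (n == MY_BULK_SIZE) {
 *		page_pool_put_page_bulk(pool, queue, n);
 *		n = 0;
 *	}
 */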
660
661 static struct page *page_pool_drain_frag(struct page_pool *pool,
662                                          struct page *page)
663 {
664         long drain_count = BIAS_MAX - pool->frag_users;
665
666         /* Some user is still using the page frag */
667         if (likely(page_pool_defrag_page(page, drain_count)))
668                 return NULL;
669
670         if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
671                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
672                         page_pool_dma_sync_for_device(pool, page, -1);
673
674                 return page;
675         }
676
677         page_pool_return_page(pool, page);
678         return NULL;
679 }
680
681 static void page_pool_free_frag(struct page_pool *pool)
682 {
683         long drain_count = BIAS_MAX - pool->frag_users;
684         struct page *page = pool->frag_page;
685
686         pool->frag_page = NULL;
687
688         if (!page || page_pool_defrag_page(page, drain_count))
689                 return;
690
691         page_pool_return_page(pool, page);
692 }
693
694 struct page *page_pool_alloc_frag(struct page_pool *pool,
695                                   unsigned int *offset,
696                                   unsigned int size, gfp_t gfp)
697 {
698         unsigned int max_size = PAGE_SIZE << pool->p.order;
699         struct page *page = pool->frag_page;
700
701         if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
702                     size > max_size))
703                 return NULL;
704
705         size = ALIGN(size, dma_get_cache_alignment());
706         *offset = pool->frag_offset;
707
708         if (page && *offset + size > max_size) {
709                 page = page_pool_drain_frag(pool, page);
710                 if (page) {
711                         alloc_stat_inc(pool, fast);
712                         goto frag_reset;
713                 }
714         }
715
716         if (!page) {
717                 page = page_pool_alloc_pages(pool, gfp);
718                 if (unlikely(!page)) {
719                         pool->frag_page = NULL;
720                         return NULL;
721                 }
722
723                 pool->frag_page = page;
724
725 frag_reset:
726                 pool->frag_users = 1;
727                 *offset = 0;
728                 pool->frag_offset = size;
729                 page_pool_fragment_page(page, BIAS_MAX);
730                 return page;
731         }
732
733         pool->frag_users++;
734         pool->frag_offset = *offset + size;
735         alloc_stat_inc(pool, fast);
736         return page;
737 }
738 EXPORT_SYMBOL(page_pool_alloc_frag);
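/* Illustration only, not part of this file: a minimal sketch of the frag API
 * for sub-page RX buffers; the pool must have been created with
 * PP_FLAG_PAGE_FRAG. "rx_buf" and the 2048-byte size are hypothetical.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(pool, &offset, 2048, GFP_ATOMIC);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	rx_buf->page   = page;
 *	rx_buf->offset = offset;	// data/DMA for this buffer start here
 *
 *	// later, when this fragment is no longer used:
 *	page_pool_put_full_page(pool, page, true);
 */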
739
740 static void page_pool_empty_ring(struct page_pool *pool)
741 {
742         struct page *page;
743
744         /* Empty recycle ring */
745         while ((page = ptr_ring_consume_bh(&pool->ring))) {
746                 /* Verify the refcnt invariant of cached pages */
747                 if (!(page_ref_count(page) == 1))
748                         pr_crit("%s() page_pool refcnt %d violation\n",
749                                 __func__, page_ref_count(page));
750
751                 page_pool_return_page(pool, page);
752         }
753 }
754
755 static void page_pool_free(struct page_pool *pool)
756 {
757         if (pool->disconnect)
758                 pool->disconnect(pool);
759
760         ptr_ring_cleanup(&pool->ring, NULL);
761
762         if (pool->p.flags & PP_FLAG_DMA_MAP)
763                 put_device(pool->p.dev);
764
765 #ifdef CONFIG_PAGE_POOL_STATS
766         free_percpu(pool->recycle_stats);
767 #endif
768         kfree(pool);
769 }
770
771 static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
772 {
773         struct page *page;
774
775         if (pool->destroy_cnt)
776                 return;
777
778         /* Empty the alloc cache; assume the caller made sure this is
779          * no longer in use, and page_pool_alloc_pages() cannot be
780          * called concurrently.
781          */
782         while (pool->alloc.count) {
783                 page = pool->alloc.cache[--pool->alloc.count];
784                 page_pool_return_page(pool, page);
785         }
786 }
787
788 static void page_pool_scrub(struct page_pool *pool)
789 {
790         page_pool_empty_alloc_cache_once(pool);
791         pool->destroy_cnt++;
792
793         /* No more consumers should exist, but producers could still
794          * be in-flight.
795          */
796         page_pool_empty_ring(pool);
797 }
798
799 static int page_pool_release(struct page_pool *pool)
800 {
801         int inflight;
802
803         page_pool_scrub(pool);
804         inflight = page_pool_inflight(pool);
805         if (!inflight)
806                 page_pool_free(pool);
807
808         return inflight;
809 }
810
811 static void page_pool_release_retry(struct work_struct *wq)
812 {
813         struct delayed_work *dwq = to_delayed_work(wq);
814         struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
815         int inflight;
816
817         inflight = page_pool_release(pool);
818         if (!inflight)
819                 return;
820
821         /* Periodic warning */
822         if (time_after_eq(jiffies, pool->defer_warn)) {
823                 int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
824
825                 pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
826                         __func__, inflight, sec);
827                 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
828         }
829
830         /* Still not ready to be disconnected, retry later */
831         schedule_delayed_work(&pool->release_dw, DEFER_TIME);
832 }
833
834 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
835                            struct xdp_mem_info *mem)
836 {
837         refcount_inc(&pool->user_cnt);
838         pool->disconnect = disconnect;
839         pool->xdp_mem_id = mem->id;
840 }
841
842 void page_pool_unlink_napi(struct page_pool *pool)
843 {
844         if (!pool->p.napi)
845                 return;
846
847         /* To avoid races with recycling, and to avoid additional barriers,
848          * make sure pool and NAPI are unlinked while NAPI is disabled.
849          */
850         WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
851                 READ_ONCE(pool->p.napi->list_owner) != -1);
852
853         WRITE_ONCE(pool->p.napi, NULL);
854 }
855 EXPORT_SYMBOL(page_pool_unlink_napi);
856
857 void page_pool_destroy(struct page_pool *pool)
858 {
859         if (!pool)
860                 return;
861
862         if (!page_pool_put(pool))
863                 return;
864
865         page_pool_unlink_napi(pool);
866         page_pool_free_frag(pool);
867
868         if (!page_pool_release(pool))
869                 return;
870
871         pool->defer_start = jiffies;
872         pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
873
874         INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
875         schedule_delayed_work(&pool->release_dw, DEFER_TIME);
876 }
877 EXPORT_SYMBOL(page_pool_destroy);
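/* Illustration only, not part of this file: a typical teardown order in a
 * hypothetical driver close path ("priv", "my_free_rx_ring()" are
 * placeholders). Unregistering the XDP memory model drops the extra user
 * reference taken in page_pool_use_xdp_mem(), so the driver's own
 * page_pool_destroy() releases the last one.
 *
 *	napi_disable(&priv->napi);		// stop producers and consumers
 *	xdp_rxq_info_unreg(&priv->xdp_rxq);	// unregisters MEM_TYPE_PAGE_POOL
 *	my_free_rx_ring(priv);			// return all driver-held pages
 *	page_pool_destroy(priv->page_pool);	// may defer until inflight drops to 0
 */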
878
879 /* Caller must provide appropriate safe context, e.g. NAPI. */
880 void page_pool_update_nid(struct page_pool *pool, int new_nid)
881 {
882         struct page *page;
883
884         trace_page_pool_update_nid(pool, new_nid);
885         pool->p.nid = new_nid;
886
887         /* Flush pool alloc cache, as refill will check NUMA node */
888         while (pool->alloc.count) {
889                 page = pool->alloc.cache[--pool->alloc.count];
890                 page_pool_return_page(pool, page);
891         }
892 }
893 EXPORT_SYMBOL(page_pool_update_nid);
894
895 bool page_pool_return_skb_page(struct page *page, bool napi_safe)
896 {
897         struct napi_struct *napi;
898         struct page_pool *pp;
899         bool allow_direct;
900
901         page = compound_head(page);
902
903         /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
904          * in order to preserve any existing bits, such as bit 0 for the
905          * head page of a compound page and bit 1 for a pfmemalloc page, so
906          * mask those bits on the freeing side when doing the check below;
907          * page_is_pfmemalloc() is checked in __page_pool_put_page()
908          * to avoid recycling a pfmemalloc page.
909          */
910         if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
911                 return false;
912
913         pp = page->pp;
914
915         /* Allow direct recycle if we have reasons to believe that we are
916          * in the same context as the consumer would run, so there's
917          * no possible race.
918          */
919         napi = READ_ONCE(pp->p.napi);
920         allow_direct = napi_safe && napi &&
921                 READ_ONCE(napi->list_owner) == smp_processor_id();
922
923         /* The driver set this to the memory recycling info. Reset it on recycle.
924          * This will *not* work for NICs using a split-page memory model.
925          * The page will be returned to the pool here regardless of whether
926          * the 'flipped' fragment is still in use or not.
927          */
928         page_pool_put_full_page(pp, page, allow_direct);
929
930         return true;
931 }
932 EXPORT_SYMBOL(page_pool_return_skb_page);
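/* Illustration only, not part of this file: this path is reached from the skb
 * free routines once a driver has marked the skb for page_pool recycling.
 * "pool", "page" and "len" are hypothetical.
 *
 *	skb = build_skb(page_address(page), PAGE_SIZE);
 *	if (skb) {
 *		skb_reserve(skb, pool->p.offset);
 *		skb_put(skb, len);
 *		skb_mark_for_recycle(skb);	// sets skb->pp_recycle
 *	}
 *
 * On kfree_skb()/napi_consume_skb() the page is then handed back via this
 * function instead of a plain put_page().
 */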