// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *      Authors:        Alan Cox <[email protected]>
 *                      Florian La Roche <[email protected]>
 *
 *      Fixes:
 *              Alan Cox        :       Fixed the worst of the load
 *                                      balancer bugs.
 *              Dave Platt      :       Interrupt stacking fix.
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       Changed buffer format.
 *              Alan Cox        :       destructor hook for AF_UNIX etc.
 *              Linus Torvalds  :       Better skb_clone.
 *              Alan Cox        :       Added skb_copy.
 *              Alan Cox        :       Added all the changed routines Linus
 *                                      only put in the headers
 *              Ray VanTassle   :       Fixed --skb->lock in free
 *              Alan Cox        :       skb_copy copy arp field
 *              Andi Kleen      :       slabified it.
 *              Robert Olsson   :       Removed skb_head_pool
 *
 *      NOTE:
 *              The __skb_ routines should be called with interrupts
 *      disabled, or you better be *real* sure that the operation is atomic
 *      with respect to whatever list is being frobbed (e.g. via lock_sock()
 *      or via disabling bottom half handlers, etc).
 */

/*
 *      The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/page_pool.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>

#include "datagram.h"

struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

/**
 * skb_panic - private function for out-of-line support
 * @skb: buffer
 * @sz: size
 * @addr: address
 * @msg: skb_over_panic or skb_under_panic
 *
 * Out-of-line support for skb_put() and skb_push().
 * Called via the wrapper skb_over_panic() or skb_under_panic().
 * Keep out of line to prevent kernel bloat.
 * __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
                      const char msg[])
{
        pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
                 msg, addr, skb->len, sz, skb->head, skb->data,
                 (unsigned long)skb->tail, (unsigned long)skb->end,
                 skb->dev ? skb->dev->name : "<NULL>");
        BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE     64
#define NAPI_SKB_CACHE_BULK     16
#define NAPI_SKB_CACHE_HALF     (NAPI_SKB_CACHE_SIZE / 2)

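/* Per-CPU allocator state: netdev_alloc_cache backs page-fragment
 * allocations made from hard-IRQ or IRQ-disabled context, while
 * napi_alloc_cache pairs a page-fragment cache with a small stack of
 * recycled sk_buff heads for use from NAPI (softirq) context.
 */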
struct napi_alloc_cache {
        struct page_frag_cache page;
        unsigned int skb_count;
        void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask,
                                unsigned int align_mask)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

        return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask);
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
        fragsz = SKB_DATA_ALIGN(fragsz);

        return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
        struct page_frag_cache *nc;
        void *data;

        fragsz = SKB_DATA_ALIGN(fragsz);
        if (in_irq() || irqs_disabled()) {
                nc = this_cpu_ptr(&netdev_alloc_cache);
                data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
        } else {
                local_bh_disable();
                data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
                local_bh_enable();
        }
        return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

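/* Usage sketch (illustrative, not part of the original file): drivers
 * normally reach these per-CPU frag caches through the napi_alloc_frag()
 * and netdev_alloc_frag() wrappers in <linux/skbuff.h>, which call the
 * __*_align() variants above with an align_mask of ~0u. The
 * rx_alloc_scratch() name and 256-byte size below are hypothetical.
 *
 *      static void *rx_alloc_scratch(void)
 *      {
 *              void *buf = napi_alloc_frag(256);       (softirq context)
 *
 *              if (!buf)
 *                      return NULL;
 *              return buf;                     (later freed with skb_free_frag(buf))
 *      }
 */
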
static struct sk_buff *napi_skb_cache_get(void)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
        struct sk_buff *skb;

        if (unlikely(!nc->skb_count))
                nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
                                                      GFP_ATOMIC,
                                                      NAPI_SKB_CACHE_BULK,
                                                      nc->skb_cache);
        if (unlikely(!nc->skb_count))
                return NULL;

        skb = nc->skb_cache[--nc->skb_count];
        kasan_unpoison_object_data(skbuff_head_cache, skb);

        return skb;
}

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
                               unsigned int frag_size)
{
        struct skb_shared_info *shinfo;
        unsigned int size = frag_size ? : ksize(data);

        size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        /* Assumes caller memset cleared SKB */
        skb->truesize = SKB_TRUESIZE(size);
        refcount_set(&skb->users, 1);
        skb->head = data;
        skb->data = data;
        skb_reset_tail_pointer(skb);
        skb->end = skb->tail + size;
        skb->mac_header = (typeof(skb->mac_header))~0U;
        skb->transport_header = (typeof(skb->transport_header))~0U;

        /* make sure we initialize shinfo sequentially */
        shinfo = skb_shinfo(skb);
        memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
        atomic_set(&shinfo->dataref, 1);

        skb_set_kcov_handle(skb, kcov_common_handle());
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc().
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 * Before IO, driver allocates only data buffer where NIC put incoming frame
 * Driver should add room at head (NET_SKB_PAD) and
 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 * After IO, driver calls build_skb(), to allocate sk_buff and populate it
 * before giving packet to stack.
 * RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb;

        skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
        if (unlikely(!skb))
                return NULL;

        memset(skb, 0, offsetof(struct sk_buff, tail));
        __build_skb_around(skb, data, frag_size);

        return skb;
}

/* build_skb() is a wrapper over __build_skb() that specifically
 * takes care of skb->head and skb->pfmemalloc.
 * This means that if @frag_size is not zero, then @data must be backed
 * by a page fragment, not kmalloc() or vmalloc()
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = __build_skb(data, frag_size);

        if (skb && frag_size) {
                skb->head_frag = 1;
                if (page_is_pfmemalloc(virt_to_head_page(data)))
                        skb->pfmemalloc = 1;
        }
        return skb;
}
EXPORT_SYMBOL(build_skb);

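/* Usage sketch (illustrative, not from the original file): the RX pattern
 * the __build_skb() note above describes. The buffer is assumed to have
 * been carved out of a page with NET_SKB_PAD headroom and
 * SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) tailroom reserved;
 * rx_to_skb() and the truesize choice are hypothetical.
 *
 *      static struct sk_buff *rx_to_skb(void *buf, unsigned int pkt_len,
 *                                       unsigned int truesize)
 *      {
 *              struct sk_buff *skb = build_skb(buf, truesize);
 *
 *              if (unlikely(!skb))
 *                      return NULL;
 *              skb_reserve(skb, NET_SKB_PAD);
 *              skb_put(skb, pkt_len);
 *              return skb;
 *      }
 */
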
/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size)
{
        if (unlikely(!skb))
                return NULL;

        __build_skb_around(skb, data, frag_size);

        if (frag_size) {
                skb->head_frag = 1;
                if (page_is_pfmemalloc(virt_to_head_page(data)))
                        skb->pfmemalloc = 1;
        }
        return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb;

        skb = napi_skb_cache_get();
        if (unlikely(!skb))
                return NULL;

        memset(skb, 0, offsetof(struct sk_buff, tail));
        __build_skb_around(skb, data, frag_size);

        return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = __napi_build_skb(data, frag_size);

        if (likely(skb) && frag_size) {
                skb->head_frag = 1;
                skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
        }

        return skb;
}
EXPORT_SYMBOL(napi_build_skb);

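/* Usage sketch (illustrative): napi_build_skb() is the variant a NAPI poll
 * loop (or an XDP_PASS conversion path) would use so that the sk_buff head
 * comes from the per-CPU NAPI cache above. xdp_to_skb() and the frame
 * layout here are hypothetical.
 *
 *      static struct sk_buff *xdp_to_skb(void *hard_start, unsigned int headroom,
 *                                        unsigned int data_len, unsigned int truesize)
 *      {
 *              struct sk_buff *skb = napi_build_skb(hard_start, truesize);
 *
 *              if (unlikely(!skb))
 *                      return NULL;
 *              skb_reserve(skb, headroom);
 *              skb_put(skb, data_len);
 *              return skb;
 *      }
 */
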
/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
                             bool *pfmemalloc)
{
        void *obj;
        bool ret_pfmemalloc = false;

        /*
         * Try a regular allocation, when that fails and we're not entitled
         * to the reserves, fail.
         */
        obj = kmalloc_node_track_caller(size,
                                        flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
                                        node);
        if (obj || !(gfp_pfmemalloc_allowed(flags)))
                goto out;

        /* Try again but now we are using pfmemalloc reserves */
        ret_pfmemalloc = true;
        obj = kmalloc_node_track_caller(size, flags, node);

out:
        if (pfmemalloc)
                *pfmemalloc = ret_pfmemalloc;

        return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *      instead of head cache and allocate a cloned (child) skb.
 *      If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *      allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
                            int flags, int node)
{
        struct kmem_cache *cache;
        struct sk_buff *skb;
        u8 *data;
        bool pfmemalloc;

        cache = (flags & SKB_ALLOC_FCLONE)
                ? skbuff_fclone_cache : skbuff_head_cache;

        if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
                gfp_mask |= __GFP_MEMALLOC;

        /* Get the HEAD */
        if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
            likely(node == NUMA_NO_NODE || node == numa_mem_id()))
                skb = napi_skb_cache_get();
        else
                skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
        if (unlikely(!skb))
                return NULL;
        prefetchw(skb);

        /* We do our best to align skb_shared_info on a separate cache
         * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
         * aligned memory blocks, unless SLUB/SLAB debug is enabled.
         * Both skb->head and skb_shared_info are cache line aligned.
         */
        size = SKB_DATA_ALIGN(size);
        size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
        if (unlikely(!data))
                goto nodata;
        /* kmalloc(size) might give us more room than requested.
         * Put skb_shared_info exactly at the end of allocated zone,
         * to allow max possible filling before reallocation.
         */
        size = SKB_WITH_OVERHEAD(ksize(data));
        prefetchw(data + size);

        /*
         * Only clear those fields we need to clear, not those that we will
         * actually initialise below. Hence, don't put any more fields after
         * the tail pointer in struct sk_buff!
         */
        memset(skb, 0, offsetof(struct sk_buff, tail));
        __build_skb_around(skb, data, 0);
        skb->pfmemalloc = pfmemalloc;

        if (flags & SKB_ALLOC_FCLONE) {
                struct sk_buff_fclones *fclones;

                fclones = container_of(skb, struct sk_buff_fclones, skb1);

                skb->fclone = SKB_FCLONE_ORIG;
                refcount_set(&fclones->fclone_ref, 1);

                fclones->skb2.fclone = SKB_FCLONE_CLONE;
        }

        return skb;

nodata:
        kmem_cache_free(cache, skb);
        return NULL;
}
EXPORT_SYMBOL(__alloc_skb);

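/* Usage sketch (illustrative): most callers go through the alloc_skb()
 * wrapper in <linux/skbuff.h>, which is __alloc_skb() with flags == 0 and
 * NUMA_NO_NODE. A typical TX-side allocation reserves its own headroom and
 * then fills the linear area; build_small_tx_skb() and the sizes below are
 * hypothetical.
 *
 *      static struct sk_buff *build_small_tx_skb(unsigned int payload_len)
 *      {
 *              struct sk_buff *skb;
 *
 *              skb = alloc_skb(MAX_HEADER + payload_len, GFP_ATOMIC);
 *              if (!skb)
 *                      return NULL;
 *              skb_reserve(skb, MAX_HEADER);
 *              skb_put(skb, payload_len);
 *              return skb;
 *      }
 */
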
/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
                                   gfp_t gfp_mask)
{
        struct page_frag_cache *nc;
        struct sk_buff *skb;
        bool pfmemalloc;
        void *data;

        len += NET_SKB_PAD;

        /* If requested length is either too small or too big,
         * we use kmalloc() for skb->head allocation.
         */
        if (len <= SKB_WITH_OVERHEAD(1024) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
                if (!skb)
                        goto skb_fail;
                goto skb_success;
        }

        len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        len = SKB_DATA_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        if (in_irq() || irqs_disabled()) {
                nc = this_cpu_ptr(&netdev_alloc_cache);
                data = page_frag_alloc(nc, len, gfp_mask);
                pfmemalloc = nc->pfmemalloc;
        } else {
                local_bh_disable();
                nc = this_cpu_ptr(&napi_alloc_cache.page);
                data = page_frag_alloc(nc, len, gfp_mask);
                pfmemalloc = nc->pfmemalloc;
                local_bh_enable();
        }

        if (unlikely(!data))
                return NULL;

        skb = __build_skb(data, len);
        if (unlikely(!skb)) {
                skb_free_frag(data);
                return NULL;
        }

        if (pfmemalloc)
                skb->pfmemalloc = 1;
        skb->head_frag = 1;

skb_success:
        skb_reserve(skb, NET_SKB_PAD);
        skb->dev = dev;

skb_fail:
        return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

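/* Usage sketch (illustrative): the common driver entry point is the
 * netdev_alloc_skb() wrapper (gfp_mask == GFP_ATOMIC), which returns with
 * the NET_SKB_PAD headroom described above already reserved and skb->dev
 * set. The refill function and RX_BUF_LEN below are hypothetical.
 *
 *      static struct sk_buff *my_rx_refill_one(struct net_device *dev)
 *      {
 *              struct sk_buff *skb = netdev_alloc_skb(dev, RX_BUF_LEN);
 *
 *              if (!skb)
 *                      return NULL;
 *              return skb;             (map skb->data for DMA here)
 *      }
 */
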
/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
                                 gfp_t gfp_mask)
{
        struct napi_alloc_cache *nc;
        struct sk_buff *skb;
        void *data;

        len += NET_SKB_PAD + NET_IP_ALIGN;

        /* If requested length is either too small or too big,
         * we use kmalloc() for skb->head allocation.
         */
        if (len <= SKB_WITH_OVERHEAD(1024) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
                                  NUMA_NO_NODE);
                if (!skb)
                        goto skb_fail;
                goto skb_success;
        }

        nc = this_cpu_ptr(&napi_alloc_cache);
        len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        len = SKB_DATA_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        data = page_frag_alloc(&nc->page, len, gfp_mask);
        if (unlikely(!data))
                return NULL;

        skb = __napi_build_skb(data, len);
        if (unlikely(!skb)) {
                skb_free_frag(data);
                return NULL;
        }

        if (nc->page.pfmemalloc)
                skb->pfmemalloc = 1;
        skb->head_frag = 1;

skb_success:
        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
        skb->dev = napi->dev;

skb_fail:
        return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);

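/* Usage sketch (illustrative): in a NAPI poll handler, napi_alloc_skb()
 * (the GFP_ATOMIC wrapper of the function above) is the preferred way to
 * allocate an skb to copy a small frame into; NET_SKB_PAD and NET_IP_ALIGN
 * headroom are already reserved on return. The copy-break pattern and
 * names below are hypothetical.
 *
 *      static struct sk_buff *copybreak_rx(struct napi_struct *napi,
 *                                          const void *frame, unsigned int len)
 *      {
 *              struct sk_buff *skb = napi_alloc_skb(napi, len);
 *
 *              if (unlikely(!skb))
 *                      return NULL;
 *              skb_put_data(skb, frame, len);
 *              return skb;
 *      }
 */
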
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
                     int size, unsigned int truesize)
{
        skb_fill_page_desc(skb, i, page, off, size);
        skb->len += size;
        skb->data_len += size;
        skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize)
{
        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

        skb_frag_size_add(frag, size);
        skb->len += size;
        skb->data_len += size;
        skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

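/* Usage sketch (illustrative): a multi-buffer RX path attaches each
 * received page fragment to the skb with skb_add_rx_frag(); truesize is
 * the full buffer size the fragment consumes (e.g. the ring buffer
 * stride), not just @size. The names below are hypothetical.
 *
 *      static void rx_attach_frag(struct sk_buff *skb, struct page *page,
 *                                 unsigned int off, unsigned int len,
 *                                 unsigned int buf_stride)
 *      {
 *              skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
 *                              off, len, buf_stride);
 *      }
 */
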
static void skb_drop_list(struct sk_buff **listp)
{
        kfree_skb_list(*listp);
        *listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
        skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
        struct sk_buff *list;

        skb_walk_frags(skb, list)
                skb_get(list);
}

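/* skb->head is backed either by kmalloc() (head_frag == 0) or by a page
 * fragment (head_frag == 1). In the latter case, skb_pp_recycle() gives
 * page_pool a chance to recycle the page when the driver set
 * skb->pp_recycle (e.g. via skb_mark_for_recycle()); only if that fails is
 * the fragment freed back to the page allocator.
 */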
static void skb_free_head(struct sk_buff *skb)
{
        unsigned char *head = skb->head;

        if (skb->head_frag) {
                if (skb_pp_recycle(skb, head))
                        return;
                skb_free_frag(head);
        } else {
                kfree(head);
        }
}

static void skb_release_data(struct sk_buff *skb)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        int i;

        if (skb->cloned &&
            atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
                              &shinfo->dataref))
                return;

        skb_zcopy_clear(skb, true);

        for (i = 0; i < shinfo->nr_frags; i++)
                __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

        if (shinfo->frag_list)
                kfree_skb_list(shinfo->frag_list);

        skb_free_head(skb);
}

/*
 * Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
        struct sk_buff_fclones *fclones;

        switch (skb->fclone) {
        case SKB_FCLONE_UNAVAILABLE:
                kmem_cache_free(skbuff_head_cache, skb);
                return;

        case SKB_FCLONE_ORIG:
                fclones = container_of(skb, struct sk_buff_fclones, skb1);

                /* We usually free the clone (TX completion) before original skb
                 * This test would have no chance to be true for the clone,
                 * while here, branch prediction will be good.
                 */
                if (refcount_read(&fclones->fclone_ref) == 1)
                        goto fastpath;
                break;

        default: /* SKB_FCLONE_CLONE */
                fclones = container_of(skb, struct sk_buff_fclones, skb2);
                break;
        }
        if (!refcount_dec_and_test(&fclones->fclone_ref))
                return;
fastpath:
        kmem_cache_free(skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
        skb_dst_drop(skb);
        if (skb->destructor) {
                WARN_ON(in_irq());
                skb->destructor(skb);
        }
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        nf_conntrack_put(skb_nfct(skb));
#endif
        skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
        skb_release_head_state(skb);
        if (likely(skb->head))
                skb_release_data(skb);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
        skb_release_all(skb);
        kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
