/*
 * Routines having to do with the 'struct sk_buff' memory handlers.
 *
 * Authors:	Alan Cox <[email protected]>
 *		Florian La Roche <[email protected]>
 *
 * Fixes:
 *	Alan Cox	:	Fixed the worst of the load
 *				balancer bugs.
 *	Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman:	Timestamp fixes.
 *	Alan Cox	:	Changed buffer format.
 *	Alan Cox	:	destructor hook for AF_UNIX etc.
 *	Linus Torvalds	:	Better skb_clone.
 *	Alan Cox	:	Added skb_copy.
 *	Alan Cox	:	Added all the changed routines Linus
 *				only put in the headers
 *	Ray VanTassle	:	Fixed --skb->lock in free
 *	Alan Cox	:	skb_copy copy arp field
 *	Andi Kleen	:	slabified it.
 *	Robert Olsson	:	Removed skb_head_pool
 *
 * NOTE:
 *	The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>

#include <asm/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>

struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;

/**
 *	skb_panic - private function for out-of-line support
 *	@skb:	buffer
 *	@sz:	size
 *	@addr:	address
 *	@msg:	skb_over_panic or skb_under_panic
 *
 *	Out-of-line support for skb_put() and skb_push().
 *	Called via the wrapper skb_over_panic() or skb_under_panic().
 *	Keep out of line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
	__kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)

static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
			       unsigned long ip, bool *pfmemalloc)
{
	void *obj;
	bool ret_pfmemalloc = false;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/*	Allocate a new skbuff. We do this ourselves so we can fill in a few
 *	'private' fields and also do memory statistics to find all the
 *	[BEEP] leaks.
 *
 */

struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
{
	struct sk_buff *skb;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(skbuff_head_cache,
				    gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->head = NULL;
	skb->truesize = sizeof(struct sk_buff);
	atomic_set(&skb->users, 1);

	skb->mac_header = (typeof(skb->mac_header))~0U;
out:
	return skb;
}

/**
 *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *		instead of head cache and allocate a cloned (child) skb.
 *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *		allocations in case the data is required for writeback
 *	@node: numa node to allocate memory on
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 *	tail room of at least size bytes. The object has a reference count
 *	of one. The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	u8 *data;
	bool pfmemalloc;

	cache = (flags & SKB_ALLOC_FCLONE)
		? skbuff_fclone_cache : skbuff_head_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
	if (!data)
		goto nodata;
	/* kmalloc(size) might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	size = SKB_WITH_OVERHEAD(ksize(data));
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	/* Account for allocated memory : skb + skb->head */
	skb->truesize = SKB_TRUESIZE(size);
	skb->pfmemalloc = pfmemalloc;
	atomic_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);
	kmemcheck_annotate_variable(shinfo->destructor_arg);

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
		skb->fclone = SKB_FCLONE_ORIG;
		atomic_set(&fclones->fclone_ref, 1);

		fclones->skb2.fclone = SKB_FCLONE_CLONE;
		fclones->skb2.pfmemalloc = pfmemalloc;
	}
out:
	return skb;
nodata:
	kmem_cache_free(cache, skb);
	skb = NULL;
	goto out;
}
EXPORT_SYMBOL(__alloc_skb);
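
/* Example (illustrative sketch, not part of the original file): most callers
 * use the alloc_skb() wrapper around __alloc_skb() and then partition the
 * buffer with skb_reserve() and skb_put().  header_len, data_len and payload
 * are hypothetical values chosen by the caller:
 *
 *	struct sk_buff *skb = alloc_skb(header_len + data_len, GFP_ATOMIC);
 *
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, header_len);
 *	memcpy(skb_put(skb, data_len), payload, data_len);
 */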

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc()
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 * Before IO, the driver allocates only the data buffer where the NIC puts
 * the incoming frame.
 * The driver should add room at head (NET_SKB_PAD) and
 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)).
 * After IO, the driver calls build_skb() to allocate the sk_buff and
 * populate it before giving the packet to the stack.
 * RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	unsigned int size = frag_size ? : ksize(data);

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (!skb)
		return NULL;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->truesize = SKB_TRUESIZE(size);
	atomic_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);
	kmemcheck_annotate_variable(shinfo->destructor_arg);

	return skb;
}

/* build_skb() is wrapper over __build_skb(), that specifically
 * takes care of skb->head and skb->pfmemalloc
 * This means that if @frag_size is not zero, then @data must be backed
 * by a page fragment, not kmalloc() or vmalloc()
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (virt_to_head_page(data)->pfmemalloc)
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);
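
/* Example (sketch only, hypothetical driver code): a receive path can let the
 * NIC DMA into a page fragment and build the sk_buff only after the frame has
 * arrived.  fragsz and frame_len are assumptions made for the illustration:
 *
 *	unsigned int fragsz = SKB_DATA_ALIGN(NET_SKB_PAD + frame_len) +
 *			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	void *buf = netdev_alloc_frag(fragsz);
 *
 *	... program buf + NET_SKB_PAD as the DMA target, wait for the frame ...
 *
 *	struct sk_buff *skb = build_skb(buf, fragsz);
 *
 *	if (skb) {
 *		skb_reserve(skb, NET_SKB_PAD);
 *		skb_put(skb, frame_len);
 *	}
 */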

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);

static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	unsigned long flags;
	void *data;

	local_irq_save(flags);
	nc = this_cpu_ptr(&netdev_alloc_cache);
	data = __alloc_page_frag(nc, fragsz, gfp_mask);
	local_irq_restore(flags);
	return data;
}

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
 */
void *netdev_alloc_frag(unsigned int fragsz)
{
	return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
}
EXPORT_SYMBOL(netdev_alloc_frag);

static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
	struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	return __alloc_page_frag(nc, fragsz, gfp_mask);
}

void *napi_alloc_frag(unsigned int fragsz)
{
	return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
}
EXPORT_SYMBOL(napi_alloc_frag);

/**
 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *	@dev: network device to receive on
 *	@len: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has NET_SKB_PAD headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	unsigned long flags;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
	    (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	local_irq_save(flags);

	nc = this_cpu_ptr(&netdev_alloc_cache);
	data = __alloc_page_frag(nc, len, gfp_mask);
	pfmemalloc = nc->pfmemalloc;

	local_irq_restore(flags);

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	/* use OR instead of assignment to avoid clearing of bits in mask */
	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
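
/* Example (sketch, not taken from a real driver): the common pattern is the
 * netdev_alloc_skb() wrapper, which supplies GFP_ATOMIC for use in softirq
 * context.  rx_buf_len is a hypothetical ring buffer size:
 *
 *	struct sk_buff *skb = netdev_alloc_skb(dev, rx_buf_len);
 *
 *	if (!skb)
 *		return -ENOMEM;
 *	... map skb->data for DMA and post it to the RX ring ...
 */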

/**
 *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 *	@napi: napi instance this buffer was allocated for
 *	@len: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
 *	attempt to allocate the head from a special reserved region used
 *	only for NAPI Rx allocation.  By doing this we can save several
 *	CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;
	void *data;

	len += NET_SKB_PAD + NET_IP_ALIGN;

	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
	    (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	data = __alloc_page_frag(nc, len, gfp_mask);
	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	/* use OR instead of assignment to avoid clearing of bits in mask */
	if (nc->pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);
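
/* Example (sketch, assuming a hypothetical poll routine): inside NAPI context
 * the napi_alloc_skb() wrapper is preferred, since it can use the per-cpu
 * napi_alloc_cache without disabling IRQs.  priv, rx_desc_data and pkt_len
 * are assumptions for the illustration:
 *
 *	skb = napi_alloc_skb(&priv->napi, pkt_len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	memcpy(skb_put(skb, pkt_len), rx_desc_data, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	napi_gro_receive(&priv->napi, skb);
 */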

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);
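
/* Example (sketch only): a driver that receives into pages can attach the
 * page to an skb as a paged fragment instead of copying.  page, offset, len
 * and truesize are assumed to come from a hypothetical RX descriptor:
 *
 *	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
 *			len, truesize);
 *
 * truesize should reflect the real memory cost of the fragment (typically
 * the buffer size used for the ring), not just len.
 */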

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag)
		skb_free_frag(head);
	else
		kfree(head);
}

static void skb_release_data(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb->cloned &&
	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			      &shinfo->dataref))
		return;

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i]);
	/*
	 * If the skb buffer is from userspace, we need to notify the caller
	 * that the lower device's DMA has completed;
	 */
	if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
		struct ubuf_info *uarg;

		uarg = shinfo->destructor_arg;
		if (uarg->callback)
			uarg->callback(uarg, true);
	}

	if (shinfo->frag_list)
		kfree_skb_list(shinfo->frag_list);

	skb_free_head(skb);
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (atomic_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!atomic_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(skbuff_fclone_cache, fclones);
}

static void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
#ifdef CONFIG_XFRM
	secpath_put(skb->sp);
#endif
	if (skb->destructor) {
		WARN_ON(in_irq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb->nfct);
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	nf_bridge_put(skb->nf_bridge);
#endif
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/**
 *	kfree_skb - free an sk_buff
 *	@skb: buffer to free
 *
 *	Drop a reference to the buffer and free it if the usage count has
 *	hit zero.
 */
void kfree_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	trace_kfree_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);

void kfree_skb_list(struct sk_buff *segs)
{
	while (segs) {
		struct sk_buff *next = segs->next;

		kfree_skb(segs);
		segs = next;
	}
}
EXPORT_SYMBOL(kfree_skb_list);

/**
 *	skb_tx_error - report an sk_buff xmit error
 *	@skb: buffer that triggered an error
 *
 *	Report xmit error if a device callback is tracking this skb.
 *	skb must be freed afterwards.
 */
void skb_tx_error(struct sk_buff *skb)
{
	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
		struct ubuf_info *uarg;

		uarg = skb_shinfo(skb)->destructor_arg;
		if (uarg->callback)
			uarg->callback(uarg, false);
		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
	}
}
EXPORT_SYMBOL(skb_tx_error);

/**
 *	consume_skb - free an skbuff
 *	@skb: buffer to free
 *
 *	Drop a ref to the buffer and free it if the usage count has hit zero
 *	Functions identically to kfree_skb, but kfree_skb assumes that the frame
 *	is being dropped after a failure and notes that
 */
void consume_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	trace_consume_skb(skb);
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
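
/* Usage note with a small sketch (not part of the original file): kfree_skb()
 * is for dropping a packet (it fires the kfree_skb tracepoint used by drop
 * monitors), while consume_skb() marks the normal end of life of a
 * successfully processed buffer, e.g. in a hypothetical TX completion path:
 *
 *	if (unlikely(transmit_failed))
 *		kfree_skb(skb);
 *	else
 *		consume_skb(skb);
 */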

/* Make sure a field is enclosed inside headers_start/headers_end section */
#define CHECK_SKB_FIELD(field) \
	BUILD_BUG_ON(offsetof(struct sk_buff, field) <		\
		     offsetof(struct sk_buff, headers_start));	\
	BUILD_BUG_ON(offsetof(struct sk_buff, field) >		\
		     offsetof(struct sk_buff, headers_end));	\

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp		= old->tstamp;
	/* We do not copy old->sk */
	new->dev		= old->dev;
	memcpy(new->cb, old->cb, sizeof(old->cb));
	skb_dst_copy(new, old);
#ifdef CONFIG_XFRM
	new->sp			= secpath_get(old->sp);
#endif
	__nf_copy(new, old, false);

	/* Note : this field could be in headers_start/headers_end section
	 * It is not yet because we do not want to have a 16 bit hole
	 */
	new->queue_mapping = old->queue_mapping;

	memcpy(&new->headers_start, &old->headers_start,
	       offsetof(struct sk_buff, headers_end) -
	       offsetof(struct sk_buff, headers_start));
	CHECK_SKB_FIELD(protocol);
	CHECK_SKB_FIELD(csum);
	CHECK_SKB_FIELD(hash);
	CHECK_SKB_FIELD(priority);
	CHECK_SKB_FIELD(skb_iif);
	CHECK_SKB_FIELD(vlan_proto);
	CHECK_SKB_FIELD(vlan_tci);
	CHECK_SKB_FIELD(transport_header);
	CHECK_SKB_FIELD(network_header);
	CHECK_SKB_FIELD(mac_header);
	CHECK_SKB_FIELD(inner_protocol);
	CHECK_SKB_FIELD(inner_transport_header);
	CHECK_SKB_FIELD(inner_network_header);
	CHECK_SKB_FIELD(inner_mac_header);
	CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
	CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
	CHECK_SKB_FIELD(napi_id);
#endif
#ifdef CONFIG_XPS
	CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
	CHECK_SKB_FIELD(tc_index);
#ifdef CONFIG_NET_CLS_ACT
	CHECK_SKB_FIELD(tc_verd);
#endif
#endif

}

/*
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
	__copy_skb_header(n, skb);

	C(len);
	C(data_len);
	C(mac_len);
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
	n->cloned = 1;
	n->nohdr = 0;
	n->destructor = NULL;
	C(tail);
	C(end);
	C(head);
	C(head_frag);
	C(data);
	C(truesize);
	atomic_set(&n->users, 1);

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
#undef C
}

/**
 *	skb_morph	-	morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	This is identical to skb_clone except that the target skb is
 *	supplied by the user.
 *
 *	The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	skb_release_all(dst);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

/**
 *	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
 *	@skb: the skb to modify
 *	@gfp_mask: allocation priority
 *
 *	This must be called on SKBTX_DEV_ZEROCOPY skb.
 *	It will copy all frags into kernel and drop the reference
 *	to userspace pages.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 *
 *	Returns 0 on success or a negative error code on failure
 *	to allocate kernel memory to copy to.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
	int i;
	int num_frags = skb_shinfo(skb)->nr_frags;
	struct page *page, *head = NULL;
	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;

	for (i = 0; i < num_frags; i++) {
		u8 *vaddr;
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];

		page = alloc_page(gfp_mask);
		if (!page) {
			while (head) {
				struct page *next = (struct page *)page_private(head);
				put_page(head);
				head = next;
			}
			return -ENOMEM;
		}
		vaddr = kmap_atomic(skb_frag_page(f));
		memcpy(page_address(page),
		       vaddr + f->page_offset, skb_frag_size(f));
		kunmap_atomic(vaddr);
		set_page_private(page, (unsigned long)head);
		head = page;
	}

	/* skb frags release userspace buffers */
	for (i = 0; i < num_frags; i++)
		skb_frag_unref(skb, i);

	uarg->callback(uarg, false);

	/* skb frags point to kernel buffers */
	for (i = num_frags - 1; i >= 0; i--) {
		__skb_fill_page_desc(skb, i, head, 0,
				     skb_shinfo(skb)->frags[i].size);
		head = (struct page *)page_private(head);
	}

	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
	return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);

/**
 *	skb_clone	-	duplicate an sk_buff
 *	@skb: buffer to clone
 *	@gfp_mask: allocation priority
 *
 *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
 *	copies share the same packet data but not structure. The new
 *	buffer has a reference count of 1. If the allocation fails the
 *	function returns %NULL otherwise the new buffer is returned.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 */

struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff_fclones *fclones = container_of(skb,
						       struct sk_buff_fclones,
						       skb1);
	struct sk_buff *n;

	if (skb_orphan_frags(skb, gfp_mask))
		return NULL;

	if (skb->fclone == SKB_FCLONE_ORIG &&
	    atomic_read(&fclones->fclone_ref) == 1) {
		n = &fclones->skb2;
		atomic_set(&fclones->fclone_ref, 2);
	} else {
		if (skb_pfmemalloc(skb))
			gfp_mask |= __GFP_MEMALLOC;

		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
		if (!n)
			return NULL;

		kmemcheck_annotate_bitfield(n, flags1);
		n->fclone = SKB_FCLONE_UNAVAILABLE;
	}

	return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);
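
/* Example (illustrative sketch): cloning is the cheap way to hand the same
 * packet data to two consumers, e.g. queueing a copy to a hypothetical tap
 * while still transmitting the original:
 *
 *	struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
 *
 *	if (clone)
 *		deliver_to_tap(clone);
 *	dev_queue_xmit(skb);
 *
 * deliver_to_tap() is a placeholder; both skbs share the same payload, so
 * neither consumer may modify the data without copying first.
 */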

static void skb_headers_offset_update(struct sk_buff *skb, int off)
{
	/* Only adjust this if it actually is csum_start rather than csum */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		skb->csum_start += off;
	/* {transport,network,mac}_header and tail are relative to skb->head */
	skb->transport_header += off;
	skb->network_header   += off;
	if (skb_mac_header_was_set(skb))
		skb->mac_header += off;
	skb->inner_transport_header += off;
	skb->inner_network_header += off;
	skb->inner_mac_header += off;
}

static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	__copy_skb_header(new, old);

	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}

static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
	if (skb_pfmemalloc(skb))
		return SKB_ALLOC_RX;
	return 0;
}

/**
 *	skb_copy	-	create private copy of an sk_buff
 *	@skb: buffer to copy
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data. This is used when the
 *	caller wishes to modify the data and needs a private copy of the
 *	data to alter. Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
 *
 *	As by-product this function converts non-linear &sk_buff to linear
 *	one, so that &sk_buff becomes completely private and caller is allowed
 *	to modify all the data of returned buffer. This means that this
 *	function is not recommended for use in circumstances when only
 *	header is going to be modified. Use pskb_copy() instead.
 */

struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
	int headerlen = skb_headroom(skb);
	unsigned int size = skb_end_offset(skb) + skb->data_len;
	struct sk_buff *n = __alloc_skb(size, gfp_mask,
					skb_alloc_rx_flag(skb), NUMA_NO_NODE);

	if (!n)
		return NULL;

	/* Set the data pointer */
	skb_reserve(n, headerlen);
	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
		BUG();

	copy_skb_header(n, skb);
	return n;
}
EXPORT_SYMBOL(skb_copy);
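
/* Example (sketch): skb_copy() is the heavyweight option and is only needed
 * when the payload itself must be rewritten and the skb may be shared:
 *
 *	struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
 *
 *	if (!nskb)
 *		return -ENOMEM;
 *	... modify nskb->data freely, the original skb is untouched ...
 *
 * When only the headers change, pskb_copy() (below) avoids copying the
 * paged data.
 */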

/**
 *	__pskb_copy_fclone	-  create copy of an sk_buff with private head.
 *	@skb: buffer to copy
 *	@headroom: headroom of new skb
 *	@gfp_mask: allocation priority
 *	@fclone: if true allocate the copy of the skb from the fclone
 *	cache instead of the head cache; it is recommended to set this
 *	to true for the cases where the copy will likely be cloned
 *
 *	Make a copy of both an &sk_buff and part of its data, located
 *	in header. Fragmented data remain shared. This is used when
 *	the caller wishes to modify only header of &sk_buff and needs
 *	private copy of the header to alter. Returns %NULL on failure
 *	or the pointer to the buffer on success.
 *	The returned buffer has a reference count of 1.
 */

struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
				   gfp_t gfp_mask, bool fclone)
{
	unsigned int size = skb_headlen(skb) + headroom;
	int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
	struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);

	if (!n)
		goto out;

	/* Set the data pointer */
	skb_reserve(n, headroom);
	/* Set the tail pointer and length */
	skb_put(n, skb_headlen(skb));
	/* Copy the bytes */
	skb_copy_from_linear_data(skb, n->data, n->len);

	n->truesize += skb->data_len;
	n->data_len  = skb->data_len;
	n->len	     = skb->len;

	if (skb_shinfo(skb)->nr_frags) {
		int i;

		if (skb_orphan_frags(skb, gfp_mask)) {
			kfree_skb(n);
			n = NULL;
			goto out;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
			skb_frag_ref(skb, i);
		}
		skb_shinfo(n)->nr_frags = i;
	}

	if (skb_has_frag_list(skb)) {
		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
		skb_clone_fraglist(n);
	}

	copy_skb_header(n, skb);
out:
	return n;
}
EXPORT_SYMBOL(__pskb_copy_fclone);

/**
 *	pskb_expand_head - reallocate header of &sk_buff
 *	@skb: buffer to reallocate
 *	@nhead: room to add at head
 *	@ntail: room to add at tail
 *	@gfp_mask: allocation priority
 *
 *	Expands (or creates identical copy, if @nhead and @ntail are zero)
 *	header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
 *	reference count of 1. Returns zero in the case of success or error,
 *	if expansion failed. In the last case, &sk_buff is not changed.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
 */

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
		     gfp_t gfp_mask)
{
	int i;
	u8 *data;
	int size = nhead + skb_end_offset(skb) + ntail;
	long off;

	BUG_ON(nhead < 0);

	if (skb_shared(skb))
		BUG();

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		goto nodata;
	size = SKB_WITH_OVERHEAD(ksize(data));

	/* Copy only real data... and, alas, header. This should be
	 * optimized for the cases when header is void.
	 */
	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

	/*
	 * if shinfo is shared we must drop the old head gracefully, but if it
	 * is not we can just drop the old head and let the existing refcount
	 * be since all we did is relocate the values
	 */
	if (skb_cloned(skb)) {
		/* copy this zero copy skb frags */
		if (skb_orphan_frags(skb, gfp_mask))
			goto nofrags;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);

		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);

		skb_release_data(skb);
	} else {
		skb_free_head(skb);
	}
	off = (data + nhead) - skb->head;

	skb->head     = data;
	skb->head_frag = 0;
	skb->data    += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end      = size;
	off           = nhead;
#else
	skb->end      = skb->head + size;
#endif
	skb->tail	      += off;
	skb_headers_offset_update(skb, nhead);
	skb->cloned   = 0;
	skb->hdr_len  = 0;
	skb->nohdr    = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
	return 0;

nofrags:
	kfree(data);
nodata:
	return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);
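
/* Example (sketch, hypothetical encapsulation path): before pushing a new
 * header on an skb that may be cloned or lack headroom, the head must be
 * reallocated; hdr_len is an assumption made for the illustration:
 *
 *	if (skb_cow_head(skb, hdr_len))
 *		goto drop;
 *	hdr = skb_push(skb, hdr_len);
 *
 * skb_cow_head() ends up in pskb_expand_head() only when needed; calling
 * pskb_expand_head(skb, hdr_len, 0, GFP_ATOMIC) directly behaves similarly
 * for a non-shared skb with insufficient headroom.
 */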

/* Make private copy of skb with writable head and some headroom */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
	struct sk_buff *skb2;
	int delta = headroom - skb_headroom(skb);

	if (delta <= 0)
		skb2 = pskb_copy(skb, GFP_ATOMIC);
	else {
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
					     GFP_ATOMIC)) {
			kfree_skb(skb2);
			skb2 = NULL;
		}
	}
	return skb2;
}
EXPORT_SYMBOL(skb_realloc_headroom);

/**
 *	skb_copy_expand	-	copy and expand sk_buff
 *	@skb: buffer to copy
 *	@newheadroom: new free bytes at head
 *	@newtailroom: new free bytes at tail
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data and while doing so
 *	allocate additional space.
 *
 *	This is used when the caller wishes to modify the data and needs a
 *	private copy of the data to alter as well as more space for new fields.
 *	Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
 *
 *	You must pass %GFP_ATOMIC as the allocation priority if this function
 *	is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
				int newheadroom, int newtailroom,
				gfp_t gfp_mask)
{
	/*
	 *	Allocate the copy buffer
	 */
	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
					gfp_mask, skb_alloc_rx_flag(skb),
					NUMA_NO_NODE);
	int oldheadroom = skb_headroom(skb);
	int head_copy_len, head_copy_off;

	if (!n)
		return NULL;

	skb_reserve(n, newheadroom);

	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	head_copy_len = oldheadroom;
	head_copy_off = 0;
	if (newheadroom <= head_copy_len)
		head_copy_len = newheadroom;
	else
		head_copy_off = newheadroom - head_copy_len;

	/* Copy the linear header and data. */
	if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
			  skb->len + head_copy_len))
		BUG();

	copy_skb_header(n, skb);

	skb_headers_offset_update(n, newheadroom - oldheadroom);

	return n;
}
EXPORT_SYMBOL(skb_copy_expand);

/**
 *	skb_pad			-	zero pad the tail of an skb
 *	@skb: buffer to pad
 *	@pad: space to pad
 *
 *	Ensure that a buffer is followed by a padding area that is zero
 *	filled. Used by network drivers which may DMA or transfer data
 *	beyond the buffer end onto the wire.
 *
 *	May return error in out of memory cases. The skb is freed on error.
 */

int skb_pad(struct sk_buff *skb, int pad)
{
	int err;
	int ntail;

	/* If the skbuff is non linear tailroom is always zero.. */
	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
		memset(skb->data+skb->len, 0, pad);
		return 0;
	}

	ntail = skb->data_len + pad - (skb->end - skb->tail);
	if (likely(skb_cloned(skb) || ntail > 0)) {
		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
		if (unlikely(err))
			goto free_skb;
	}

	/* FIXME: The use of this function with non-linear skb's really needs
	 * to be audited.
	 */
	err = skb_linearize(skb);
	if (unlikely(err))
		goto free_skb;

	memset(skb->data + skb->len, 0, pad);
	return 0;

free_skb:
	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL(skb_pad);
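
/* Example (sketch): a driver whose hardware cannot pad short frames might do
 * the padding in software before mapping the buffer for DMA; ETH_ZLEN is the
 * minimum Ethernet frame length:
 *
 *	if (skb->len < ETH_ZLEN) {
 *		unsigned int pad = ETH_ZLEN - skb->len;
 *
 *		if (skb_pad(skb, pad))
 *			return NETDEV_TX_OK;
 *		skb_put(skb, pad);
 *	}
 *
 * Note that skb_pad() already freed the skb in the error case, and that the
 * skb_padto() helper wraps a similar pattern.
 */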

/**
 *	pskb_put - add data to the tail of a potentially fragmented buffer
 *	@skb: start of the buffer to use
 *	@tail: tail fragment of the buffer to use
 *	@len: amount of data to add
 *
 *	This function extends the used data area of the potentially
 *	fragmented buffer. @tail must be the last fragment of @skb -- or
 *	@skb itself. If this would exceed the total buffer size the kernel
 *	will panic. A pointer to the first byte of the extra data is
 *	returned.
 */

unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
{
	if (tail != skb) {
		skb->data_len += len;
		skb->len += len;
	}
	return skb_put(tail, len);
}
EXPORT_SYMBOL_GPL(pskb_put);

/**
 *	skb_put - add data to a buffer
 *	@skb: buffer to use
 *	@len: amount of data to add
 *
 *	This function extends the used data area of the buffer. If this would
 *	exceed the total buffer size the kernel will panic. A pointer to the
 *	first byte of the extra data is returned.
 */
unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
{
	unsigned char *tmp = skb_tail_pointer(skb);
	SKB_LINEAR_ASSERT(skb);
	skb->tail += len;
	skb->len  += len;
	if (unlikely(skb->tail > skb->end))
		skb_over_panic(skb, len, __builtin_return_address(0));
	return tmp;
}
EXPORT_SYMBOL(skb_put);
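
/* Quick reference (editorial sketch, not part of the original file): the four
 * pointer helpers adjust skb->data and skb->tail inside the fixed head..end
 * buffer:
 *
 *	skb_reserve(skb, n)	moves data and tail forward (adds headroom)
 *	skb_put(skb, n)		grows the data area at the tail
 *	skb_push(skb, n)	grows the data area at the head
 *	skb_pull(skb, n)	shrinks the data area from the head
 *
 * skb_put() and skb_push() panic on overflow, so callers size the allocation
 * or check headroom/tailroom beforehand.
 */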

/**
 *	skb_push - add data to the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to add
 *
 *	This function extends the used data area of the buffer at the buffer
 *	start. If this would exceed the total buffer headroom the kernel will
 *	panic. A pointer to the first byte of the extra data is returned.
 */
unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
{
	skb->data -= len;
	skb->len  += len;
	if (unlikely(skb->data<skb->head))
		skb_under_panic(skb, len, __builtin_return_address(0));
	return skb->data;
}
EXPORT_SYMBOL(skb_push);

/**
 *	skb_pull - remove data from the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to remove
 *
 *	This function removes data from the start of a buffer, returning
 *	the memory to the headroom. A pointer to the next data in the buffer
 *	is returned. Once the data has been pulled future pushes will overwrite
 *	the old data.
 */
unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
{
	return skb_pull_inline(skb, len);
}
EXPORT_SYMBOL(skb_pull);

/**
 *	skb_trim - remove end from a buffer
 *	@skb: buffer to alter
 *	@len: new length
 *
 *	Cut the length of a buffer down by removing data from the tail. If
 *	the buffer is already under the length specified it is not modified.
 *	The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
	if (skb->len > len)
		__skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

/* Trims skb to length len. It can change skb pointers.
 */

int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
	struct sk_buff **fragp;
	struct sk_buff *frag;
	int offset = skb_headlen(skb);
	int nfrags = skb_shinfo(skb)->nr_frags;
	int i;
	int err;

	if (skb_cloned(skb) &&
	    unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
		return err;

	i = 0;
	if (offset >= len)
		goto drop_pages;

	for (; i < nfrags; i++) {
		int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (end < len) {
			offset = end;
			continue;
		}

		skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);

drop_pages:
		skb_shinfo(skb)->nr_frags = i;

		for (; i < nfrags; i++)
			skb_frag_unref(skb, i);

		if (skb_has_frag_list(skb))
			skb_drop_fraglist(skb);
		goto done;
	}

	for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
	     fragp = &frag->next) {
		int end = offset + frag->len;

		if (skb_shared(frag)) {
			struct sk_buff *nfrag;

			nfrag = skb_clone(frag, GFP_ATOMIC);
			if (unlikely(!nfrag))
				return -ENOMEM;

			nfrag->next = frag->next;
			consume_skb(frag);
			frag = nfrag;
			*fragp = frag;
		}

		if (end < len) {
			offset = end;
			continue;
		}

		if (end > len &&
		    unlikely((err = pskb_trim(frag, len - offset))))
			return err;

		if (frag->next)
			skb_drop_list(&frag->next);
		break;
	}

done:
	if (len > skb_headlen(skb)) {
		skb->data_len -= skb->len - len;
		skb->len       = len;
	} else {
		skb->len       = len;
		skb->data_len  = 0;
		skb_set_tail_pointer(skb, len);
	}

	return 0;
}
EXPORT_SYMBOL(___pskb_trim);

/**
 *	__pskb_pull_tail - advance tail of skb header
 *	@skb: buffer to reallocate
 *	@delta: number of bytes to advance tail
 *
 *	The function makes a sense only on a fragmented &sk_buff,
 *	it expands header moving its tail forward and copying necessary
 *	data from fragmented part.
 *
 *	&sk_buff MUST have reference count of 1.
 *
 *	Returns %NULL (and &sk_buff does not change) if pull failed
 *	or value of new tail of skb in the case of success.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
	/* If skb has not enough free space at tail, get new one
	 * plus 128 bytes for future expansions. If we have enough
	 * room at tail, reallocate without expansion only if skb is cloned.
	 */
	int i, k, eat = (skb->tail + delta) - skb->end;

	if (eat > 0 || skb_cloned(skb)) {
		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
				     GFP_ATOMIC))
			return NULL;
	}

	if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
		BUG();

	/* Optimization: no fragments, no reasons to preestimate
	 * size of pulled pages. Superb.
	 */
	if (!skb_has_frag_list(skb))
		goto pull_pages;

	/* Estimate size of pulled pages. */
	eat = delta;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (size >= eat)
			goto pull_pages;
		eat -= size;
	}
	/* If we need to update the frag list, we are in trouble.
	 * Certainly, it is possible to add an offset to skb data,
	 * but taking into account that pulling is expected to
	 * be a very rare operation, it is worth fighting against
	 * further bloating of the skb head and crucifying ourselves here instead.
	 * Pure masochism, indeed. 8)8)
	 */
1566 | if (eat) { | |
1567 | struct sk_buff *list = skb_shinfo(skb)->frag_list; | |
1568 | struct sk_buff *clone = NULL; | |
1569 | struct sk_buff *insp = NULL; | |
1570 | ||
1571 | do { | |
1572 | BUG_ON(!list); | |
1573 | ||
1574 | if (list->len <= eat) { | |
1575 | /* Eaten as whole. */ | |
1576 | eat -= list->len; | |
1577 | list = list->next; | |
1578 | insp = list; | |
1579 | } else { | |
1580 | /* Eaten partially. */ | |
1581 | ||
1582 | if (skb_shared(list)) { | |
1583 | /* Sucks! We need to fork list. :-( */ | |
1584 | clone = skb_clone(list, GFP_ATOMIC); | |
1585 | if (!clone) | |
1586 | return NULL; | |
1587 | insp = list->next; | |
1588 | list = clone; | |
1589 | } else { | |
1590 | /* This may be pulled without | |
1591 | * problems. */ | |
1592 | insp = list; | |
1593 | } | |
1594 | if (!pskb_pull(list, eat)) { | |
1595 | kfree_skb(clone); | |
1596 | return NULL; | |
1597 | } | |
1598 | break; | |
1599 | } | |
1600 | } while (eat); | |
1601 | ||
1602 | /* Free pulled out fragments. */ | |
1603 | while ((list = skb_shinfo(skb)->frag_list) != insp) { | |
1604 | skb_shinfo(skb)->frag_list = list->next; | |
1605 | kfree_skb(list); | |
1606 | } | |
1607 | /* And insert new clone at head. */ | |
1608 | if (clone) { | |
1609 | clone->next = list; | |
1610 | skb_shinfo(skb)->frag_list = clone; | |
1611 | } | |
1612 | } | |
1613 | /* Success! Now we may commit changes to skb data. */ | |
1614 | ||
1615 | pull_pages: | |
1616 | eat = delta; | |
1617 | k = 0; | |
1618 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { | |
1619 | int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); | |
1620 | ||
1621 | if (size <= eat) { | |
1622 | skb_frag_unref(skb, i); | |
1623 | eat -= size; | |
1624 | } else { | |
1625 | skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; | |
1626 | if (eat) { | |
1627 | skb_shinfo(skb)->frags[k].page_offset += eat; | |
1628 | skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); | |
1629 | eat = 0; | |
1630 | } | |
1631 | k++; | |
1632 | } | |
1633 | } | |
1634 | skb_shinfo(skb)->nr_frags = k; | |
1635 | ||
1636 | skb->tail += delta; | |
1637 | skb->data_len -= delta; | |
1638 | ||
1639 | return skb_tail_pointer(skb); | |
1640 | } | |
1641 | EXPORT_SYMBOL(__pskb_pull_tail); | |
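/*
 * Illustrative usage sketch (not part of the original source; the helper
 * name and thlen parameter are assumptions for the example).  Callers
 * normally reach __pskb_pull_tail() indirectly via pskb_may_pull(), which
 * falls back to this routine only when the requested bytes are not yet in
 * the linear area; all header pointers must be reloaded afterwards:
 *
 *	static int example_linearize_headers(struct sk_buff *skb,
 *					     unsigned int thlen)
 *	{
 *		if (!pskb_may_pull(skb, thlen))
 *			return -EINVAL;
 *		return 0;
 *	}
 */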
1642 | ||
1643 | /** | |
1644 | * skb_copy_bits - copy bits from skb to kernel buffer | |
1645 | * @skb: source skb | |
1646 | * @offset: offset in source | |
1647 | * @to: destination buffer | |
1648 | * @len: number of bytes to copy | |
1649 | * | |
1650 | * Copy the specified number of bytes from the source skb to the | |
1651 | * destination buffer. | |
1652 | * | |
1653 | * CAUTION: | 
1654 | * If its prototype is ever changed, | |
1655 | * check arch/{*}/net/{*}.S files, | |
1656 | * since it is called from BPF assembly code. | |
1657 | */ | |
1658 | int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) | |
1659 | { | |
1660 | int start = skb_headlen(skb); | |
1661 | struct sk_buff *frag_iter; | |
1662 | int i, copy; | |
1663 | ||
1664 | if (offset > (int)skb->len - len) | |
1665 | goto fault; | |
1666 | ||
1667 | /* Copy header. */ | |
1668 | if ((copy = start - offset) > 0) { | |
1669 | if (copy > len) | |
1670 | copy = len; | |
1671 | skb_copy_from_linear_data_offset(skb, offset, to, copy); | |
1672 | if ((len -= copy) == 0) | |
1673 | return 0; | |
1674 | offset += copy; | |
1675 | to += copy; | |
1676 | } | |
1677 | ||
1678 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { | |
1679 | int end; | |
1680 | skb_frag_t *f = &skb_shinfo(skb)->frags[i]; | |
1681 | ||
1682 | WARN_ON(start > offset + len); | |
1683 | ||
1684 | end = start + skb_frag_size(f); | |
1685 | if ((copy = end - offset) > 0) { | |
1686 | u8 *vaddr; | |
1687 | ||
1688 | if (copy > len) | |
1689 | copy = len; | |
1690 | ||
1691 | vaddr = kmap_atomic(skb_frag_page(f)); | |
1692 | memcpy(to, | |
1693 | vaddr + f->page_offset + offset - start, | |
1694 | copy); | |
1695 | kunmap_atomic(vaddr); | |
1696 | ||
1697 | if ((len -= copy) == 0) | |
1698 | return 0; | |
1699 | offset += copy; | |
1700 | to += copy; | |
1701 | } | |
1702 | start = end; | |
1703 | } | |
1704 | ||
1705 | skb_walk_frags(skb, frag_iter) { | |
1706 | int end; | |
1707 | ||
1708 | WARN_ON(start > offset + len); | |
1709 | ||
1710 | end = start + frag_iter->len; | |
1711 | if ((copy = end - offset) > 0) { | |
1712 | if (copy > len) | |
1713 | copy = len; | |
1714 | if (skb_copy_bits(frag_iter, offset - start, to, copy)) | |
1715 | goto fault; | |
1716 | if ((len -= copy) == 0) | |
1717 | return 0; | |
1718 | offset += copy; | |
1719 | to += copy; | |
1720 | } | |
1721 | start = end; | |
1722 | } | |
1723 | ||
1724 | if (!len) | |
1725 | return 0; | |
1726 | ||
1727 | fault: | |
1728 | return -EFAULT; | |
1729 | } | |
1730 | EXPORT_SYMBOL(skb_copy_bits); | |
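/*
 * Illustrative usage sketch (not part of the original source): copy a
 * transport header into a local buffer regardless of whether it lives in
 * the linear area, a page fragment or the frag list.  The surrounding
 * function is hypothetical.
 *
 *	static int example_peek_udp(const struct sk_buff *skb,
 *				    struct udphdr *uh)
 *	{
 *		if (skb_copy_bits(skb, skb_transport_offset(skb),
 *				  uh, sizeof(*uh)))
 *			return -EINVAL;
 *		return 0;
 *	}
 */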
1731 | ||
1732 | /* | |
1733 | * Callback from splice_to_pipe(); used to release some pages | 
1734 | * at the end of the spd in case we errored out while filling the pipe. | 
1735 | */ | |
1736 | static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) | |
1737 | { | |
1738 | put_page(spd->pages[i]); | |
1739 | } | |
1740 | ||
1741 | static struct page *linear_to_page(struct page *page, unsigned int *len, | |
1742 | unsigned int *offset, | |
1743 | struct sock *sk) | |
1744 | { | |
1745 | struct page_frag *pfrag = sk_page_frag(sk); | |
1746 | ||
1747 | if (!sk_page_frag_refill(sk, pfrag)) | |
1748 | return NULL; | |
1749 | ||
1750 | *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); | |
1751 | ||
1752 | memcpy(page_address(pfrag->page) + pfrag->offset, | |
1753 | page_address(page) + *offset, *len); | |
1754 | *offset = pfrag->offset; | |
1755 | pfrag->offset += *len; | |
1756 | ||
1757 | return pfrag->page; | |
1758 | } | |
1759 | ||
1760 | static bool spd_can_coalesce(const struct splice_pipe_desc *spd, | |
1761 | struct page *page, | |
1762 | unsigned int offset) | |
1763 | { | |
1764 | return spd->nr_pages && | |
1765 | spd->pages[spd->nr_pages - 1] == page && | |
1766 | (spd->partial[spd->nr_pages - 1].offset + | |
1767 | spd->partial[spd->nr_pages - 1].len == offset); | |
1768 | } | |
1769 | ||
1770 | /* | |
1771 | * Fill page/offset/length into spd, if it can hold more pages. | |
1772 | */ | |
1773 | static bool spd_fill_page(struct splice_pipe_desc *spd, | |
1774 | struct pipe_inode_info *pipe, struct page *page, | |
1775 | unsigned int *len, unsigned int offset, | |
1776 | bool linear, | |
1777 | struct sock *sk) | |
1778 | { | |
1779 | if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) | |
1780 | return true; | |
1781 | ||
1782 | if (linear) { | |
1783 | page = linear_to_page(page, len, &offset, sk); | |
1784 | if (!page) | |
1785 | return true; | |
1786 | } | |
1787 | if (spd_can_coalesce(spd, page, offset)) { | |
1788 | spd->partial[spd->nr_pages - 1].len += *len; | |
1789 | return false; | |
1790 | } | |
1791 | get_page(page); | |
1792 | spd->pages[spd->nr_pages] = page; | |
1793 | spd->partial[spd->nr_pages].len = *len; | |
1794 | spd->partial[spd->nr_pages].offset = offset; | |
1795 | spd->nr_pages++; | |
1796 | ||
1797 | return false; | |
1798 | } | |
1799 | ||
1800 | static bool __splice_segment(struct page *page, unsigned int poff, | |
1801 | unsigned int plen, unsigned int *off, | |
1802 | unsigned int *len, | |
1803 | struct splice_pipe_desc *spd, bool linear, | |
1804 | struct sock *sk, | |
1805 | struct pipe_inode_info *pipe) | |
1806 | { | |
1807 | if (!*len) | |
1808 | return true; | |
1809 | ||
1810 | /* skip this segment if already processed */ | |
1811 | if (*off >= plen) { | |
1812 | *off -= plen; | |
1813 | return false; | |
1814 | } | |
1815 | ||
1816 | /* ignore any bits we already processed */ | |
1817 | poff += *off; | |
1818 | plen -= *off; | |
1819 | *off = 0; | |
1820 | ||
1821 | do { | |
1822 | unsigned int flen = min(*len, plen); | |
1823 | ||
1824 | if (spd_fill_page(spd, pipe, page, &flen, poff, | |
1825 | linear, sk)) | |
1826 | return true; | |
1827 | poff += flen; | |
1828 | plen -= flen; | |
1829 | *len -= flen; | |
1830 | } while (*len && plen); | |
1831 | ||
1832 | return false; | |
1833 | } | |
1834 | ||
1835 | /* | |
1836 | * Map linear and fragment data from the skb to spd. It reports true if the | |
1837 | * pipe is full or if we already spliced the requested length. | |
1838 | */ | |
1839 | static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, | |
1840 | unsigned int *offset, unsigned int *len, | |
1841 | struct splice_pipe_desc *spd, struct sock *sk) | |
1842 | { | |
1843 | int seg; | |
1844 | ||
1845 | /* map the linear part: | 
1846 | * If skb->head_frag is set, this 'linear' part is backed by a | |
1847 | * fragment, and if the head is not shared with any clones then | |
1848 | * we can avoid a copy since we own the head portion of this page. | |
1849 | */ | |
1850 | if (__splice_segment(virt_to_page(skb->data), | |
1851 | (unsigned long) skb->data & (PAGE_SIZE - 1), | |
1852 | skb_headlen(skb), | |
1853 | offset, len, spd, | |
1854 | skb_head_is_locked(skb), | |
1855 | sk, pipe)) | |
1856 | return true; | |
1857 | ||
1858 | /* | |
1859 | * then map the fragments | |
1860 | */ | |
1861 | for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { | |
1862 | const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; | |
1863 | ||
1864 | if (__splice_segment(skb_frag_page(f), | |
1865 | f->page_offset, skb_frag_size(f), | |
1866 | offset, len, spd, false, sk, pipe)) | |
1867 | return true; | |
1868 | } | |
1869 | ||
1870 | return false; | |
1871 | } | |
1872 | ||
1873 | ssize_t skb_socket_splice(struct sock *sk, | |
1874 | struct pipe_inode_info *pipe, | |
1875 | struct splice_pipe_desc *spd) | |
1876 | { | |
1877 | int ret; | |
1878 | ||
1879 | /* Drop the socket lock, otherwise we have reverse | |
1880 | * locking dependencies between sk_lock and i_mutex | |
1881 | * here as compared to sendfile(). We enter here | |
1882 | * with the socket lock held, and splice_to_pipe() will | |
1883 | * grab the pipe inode lock. For sendfile() emulation, | |
1884 | * we call into ->sendpage() with the i_mutex lock held | |
1885 | * and networking will grab the socket lock. | |
1886 | */ | |
1887 | release_sock(sk); | |
1888 | ret = splice_to_pipe(pipe, spd); | |
1889 | lock_sock(sk); | |
1890 | ||
1891 | return ret; | |
1892 | } | |
1893 | ||
1894 | /* | |
1895 | * Map data from the skb to a pipe. Should handle both the linear part, | |
1896 | * the fragments, and the frag list. It does NOT handle frag lists within | |
1897 | * the frag list, if such a thing exists. We'd probably need to recurse to | |
1898 | * handle that cleanly. | |
1899 | */ | |
1900 | int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, | |
1901 | struct pipe_inode_info *pipe, unsigned int tlen, | |
1902 | unsigned int flags, | |
1903 | ssize_t (*splice_cb)(struct sock *, | |
1904 | struct pipe_inode_info *, | |
1905 | struct splice_pipe_desc *)) | |
1906 | { | |
1907 | struct partial_page partial[MAX_SKB_FRAGS]; | |
1908 | struct page *pages[MAX_SKB_FRAGS]; | |
1909 | struct splice_pipe_desc spd = { | |
1910 | .pages = pages, | |
1911 | .partial = partial, | |
1912 | .nr_pages_max = MAX_SKB_FRAGS, | |
1913 | .flags = flags, | |
1914 | .ops = &nosteal_pipe_buf_ops, | |
1915 | .spd_release = sock_spd_release, | |
1916 | }; | |
1917 | struct sk_buff *frag_iter; | |
1918 | int ret = 0; | |
1919 | ||
1920 | /* | |
1921 | * __skb_splice_bits() only fails if the output has no room left, | |
1922 | * so no point in going over the frag_list for the error case. | |
1923 | */ | |
1924 | if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) | |
1925 | goto done; | |
1926 | else if (!tlen) | |
1927 | goto done; | |
1928 | ||
1929 | /* | |
1930 | * now see if we have a frag_list to map | |
1931 | */ | |
1932 | skb_walk_frags(skb, frag_iter) { | |
1933 | if (!tlen) | |
1934 | break; | |
1935 | if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) | |
1936 | break; | |
1937 | } | |
1938 | ||
1939 | done: | |
1940 | if (spd.nr_pages) | |
1941 | ret = splice_cb(sk, pipe, &spd); | |
1942 | ||
1943 | return ret; | |
1944 | } | |
1945 | EXPORT_SYMBOL_GPL(skb_splice_bits); | |
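/*
 * Illustrative usage sketch (not part of the original source): a protocol's
 * ->splice_read() implementation would typically call skb_splice_bits() with
 * skb_socket_splice() as the callback, so that the socket lock is dropped
 * around splice_to_pipe().  The wrapper below is a hypothetical example.
 *
 *	static int example_splice(struct sock *sk, struct sk_buff *skb,
 *				  unsigned int offset,
 *				  struct pipe_inode_info *pipe,
 *				  unsigned int len, unsigned int flags)
 *	{
 *		return skb_splice_bits(skb, sk, offset, pipe, len, flags,
 *				       skb_socket_splice);
 *	}
 */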
1946 | ||
1947 | /** | |
1948 | * skb_store_bits - store bits from kernel buffer to skb | |
1949 | * @skb: destination buffer | |
1950 | * @offset: offset in destination | |
1951 | * @from: source buffer | |
1952 | * @len: number of bytes to copy | |
1953 | * | |
1954 | * Copy the specified number of bytes from the source buffer to the | |
1955 | * destination skb. This function handles all the messy bits of | |
1956 | * traversing fragment lists and such. | |
1957 | */ | |
1958 | ||
1959 | int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) | |
1960 | { | |
1961 | int start = skb_headlen(skb); | |
1962 | struct sk_buff *frag_iter; | |
1963 | int i, copy; | |
1964 | ||
1965 | if (offset > (int)skb->len - len) | |
1966 | goto fault; | |
1967 | ||
1968 | if ((copy = start - offset) > 0) { | |
1969 | if (copy > len) | |
1970 | copy = len; | |
1971 | skb_copy_to_linear_data_offset(skb, offset, from, copy); | |
1972 | if ((len -= copy) == 0) | |
1973 | return 0; | |
1974 | offset += copy; | |
1975 | from += copy; | |
1976 | } | |
1977 | ||
1978 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { | |
1979 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; | |
1980 | int end; | |
1981 | ||
1982 | WARN_ON(start > offset + len); | |
1983 | ||
1984 | end = start + skb_frag_size(frag); | |
1985 | if ((copy = end - offset) > 0) { | |
1986 | u8 *vaddr; | |
1987 | ||
1988 | if (copy > len) | |
1989 | copy = len; | |
1990 | ||
1991 | vaddr = kmap_atomic(skb_frag_page(frag)); | |
1992 | memcpy(vaddr + frag->page_offset + offset - start, | |
1993 | from, copy); | |
1994 | kunmap_atomic(vaddr); | |
1995 | ||
1996 | if ((len -= copy) == 0) | |
1997 | return 0; | |
1998 | offset += copy; | |
1999 | from += copy; | |
2000 | } | |
2001 | start = end; | |
2002 | } | |
2003 | ||
2004 | skb_walk_frags(skb, frag_iter) { | |
2005 | int end; | |
2006 | ||
2007 | WARN_ON(start > offset + len); | |
2008 | ||
2009 | end = start + frag_iter->len; | |
2010 | if ((copy = end - offset) > 0) { | |
2011 | if (copy > len) | |
2012 | copy = len; | |
2013 | if (skb_store_bits(frag_iter, offset - start, | |
2014 | from, copy)) | |
2015 | goto fault; | |
2016 | if ((len -= copy) == 0) | |
2017 | return 0; | |
2018 | offset += copy; | |
2019 | from += copy; | |
2020 | } | |
2021 | start = end; | |
2022 | } | |
2023 | if (!len) | |
2024 | return 0; | |
2025 | ||
2026 | fault: | |
2027 | return -EFAULT; | |
2028 | } | |
2029 | EXPORT_SYMBOL(skb_store_bits); | |
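/*
 * Illustrative usage sketch (not part of the original source): overwrite a
 * range of packet bytes in place, e.g. when mangling payload.  The caller
 * must have made the affected area writable first; buf, offset and buflen
 * are assumptions for the example.
 *
 *	if (skb_store_bits(skb, offset, buf, buflen))
 *		return -EFAULT;
 */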
2030 | ||
2031 | /* Checksum skb data. */ | |
2032 | __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, | |
2033 | __wsum csum, const struct skb_checksum_ops *ops) | |
2034 | { | |
2035 | int start = skb_headlen(skb); | |
2036 | int i, copy = start - offset; | |
2037 | struct sk_buff *frag_iter; | |
2038 | int pos = 0; | |
2039 | ||
2040 | /* Checksum header. */ | |
2041 | if (copy > 0) { | |
2042 | if (copy > len) | |
2043 | copy = len; | |
2044 | csum = ops->update(skb->data + offset, copy, csum); | |
2045 | if ((len -= copy) == 0) | |
2046 | return csum; | |
2047 | offset += copy; | |
2048 | pos = copy; | |
2049 | } | |
2050 | ||
2051 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { | |
2052 | int end; | |
2053 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; | |
2054 | ||
2055 | WARN_ON(start > offset + len); | |
2056 | ||
2057 | end = start + skb_frag_size(frag); | |
2058 | if ((copy = end - offset) > 0) { | |
2059 | __wsum csum2; | |
2060 | u8 *vaddr; | |
2061 | ||
2062 | if (copy > len) | |
2063 | copy = len; | |
2064 | vaddr = kmap_atomic(skb_frag_page(frag)); | |
2065 | csum2 = ops->update(vaddr + frag->page_offset + | |
2066 | offset - start, copy, 0); | |
2067 | kunmap_atomic(vaddr); | |
2068 | csum = ops->combine(csum, csum2, pos, copy); | |
2069 | if (!(len -= copy)) | |
2070 | return csum; | |
2071 | offset += copy; | |
2072 | pos += copy; | |
2073 | } | |
2074 | start = end; | |
2075 | } | |
2076 | ||
2077 | skb_walk_frags(skb, frag_iter) { | |
2078 | int end; | |
2079 | ||
2080 | WARN_ON(start > offset + len); | |
2081 | ||
2082 | end = start + frag_iter->len; | |
2083 | if ((copy = end - offset) > 0) { | |
2084 | __wsum csum2; | |
2085 | if (copy > len) | |
2086 | copy = len; | |
2087 | csum2 = __skb_checksum(frag_iter, offset - start, | |
2088 | copy, 0, ops); | |
2089 | csum = ops->combine(csum, csum2, pos, copy); | |
2090 | if ((len -= copy) == 0) | |
2091 | return csum; | |
2092 | offset += copy; | |
2093 | pos += copy; | |
2094 | } | |
2095 | start = end; | |
2096 | } | |
2097 | BUG_ON(len); | |
2098 | ||
2099 | return csum; | |
2100 | } | |
2101 | EXPORT_SYMBOL(__skb_checksum); | |
2102 | ||
2103 | __wsum skb_checksum(const struct sk_buff *skb, int offset, | |
2104 | int len, __wsum csum) | |
2105 | { | |
2106 | const struct skb_checksum_ops ops = { | |
2107 | .update = csum_partial_ext, | |
2108 | .combine = csum_block_add_ext, | |
2109 | }; | |
2110 | ||
2111 | return __skb_checksum(skb, offset, len, csum, &ops); | |
2112 | } | |
2113 | EXPORT_SYMBOL(skb_checksum); | |
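/*
 * Illustrative usage sketch (not part of the original source): compute the
 * ones' complement sum over the whole packet and fold it into the 16-bit
 * form used on the wire.
 *
 *	__wsum csum = skb_checksum(skb, 0, skb->len, 0);
 *	__sum16 folded = csum_fold(csum);
 */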
2114 | ||
2115 | /* Both of the above in one bottle. */ | 
2116 | ||
2117 | __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, | |
2118 | u8 *to, int len, __wsum csum) | |
2119 | { | |
2120 | int start = skb_headlen(skb); | |
2121 | int i, copy = start - offset; | |
2122 | struct sk_buff *frag_iter; | |
2123 | int pos = 0; | |
2124 | ||
2125 | /* Copy header. */ | |
2126 | if (copy > 0) { | |
2127 | if (copy > len) | |
2128 | copy = len; | |
2129 | csum = csum_partial_copy_nocheck(skb->data + offset, to, | |
2130 | copy, csum); | |
2131 | if ((len -= copy) == 0) | |
2132 | return csum; | |
2133 | offset += copy; | |
2134 | to += copy; | |
2135 | pos = copy; | |
2136 | } | |
2137 | ||
2138 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { | |
2139 | int end; | |
2140 | ||
2141 | WARN_ON(start > offset + len); | |
2142 | ||
2143 | end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); | |
2144 | if ((copy = end - offset) > 0) { | |
2145 | __wsum csum2; | |
2146 | u8 *vaddr; | |
2147 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; | |
2148 | ||
2149 | if (copy > len) | |
2150 | copy = len; | |
2151 | vaddr = kmap_atomic(skb_frag_page(frag)); | |
2152 | csum2 = csum_partial_copy_nocheck(vaddr + | |
2153 | frag->page_offset + | |
2154 | offset - start, to, | |
2155 | copy, 0); | |
2156 | kunmap_atomic(vaddr); | |
2157 | csum = csum_block_add(csum, csum2, pos); | |
2158 | if (!(len -= copy)) | |
2159 | return csum; | |
2160 | offset += copy; | |
2161 | to += copy; | |
2162 | pos += copy; | |
2163 | } | |
2164 | start = end; | |
2165 | } | |
2166 | ||
2167 | skb_walk_frags(skb, frag_iter) { | |
2168 | __wsum csum2; | |
2169 | int end; | |
2170 | ||
2171 | WARN_ON(start > offset + len); | |
2172 | ||
2173 | end = start + frag_iter->len; | |
2174 | if ((copy = end - offset) > 0) { | |
2175 | if (copy > len) | |
2176 | copy = len; | |
2177 | csum2 = skb_copy_and_csum_bits(frag_iter, | |
2178 | offset - start, | |
2179 | to, copy, 0); | |
2180 | csum = csum_block_add(csum, csum2, pos); | |
2181 | if ((len -= copy) == 0) | |
2182 | return csum; | |
2183 | offset += copy; | |
2184 | to += copy; | |
2185 | pos += copy; | |
2186 | } | |
2187 | start = end; | |
2188 | } | |
2189 | BUG_ON(len); | |
2190 | return csum; | |
2191 | } | |
2192 | EXPORT_SYMBOL(skb_copy_and_csum_bits); | |
2193 | ||
2194 | /** | |
2195 | * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() | |
2196 | * @from: source buffer | |
2197 | * | |
2198 | * Calculates the amount of linear headroom needed in the 'to' skb passed | |
2199 | * into skb_zerocopy(). | |
2200 | */ | |
2201 | unsigned int | |
2202 | skb_zerocopy_headlen(const struct sk_buff *from) | |
2203 | { | |
2204 | unsigned int hlen = 0; | |
2205 | ||
2206 | if (!from->head_frag || | |
2207 | skb_headlen(from) < L1_CACHE_BYTES || | |
2208 | skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) | |
2209 | hlen = skb_headlen(from); | |
2210 | ||
2211 | if (skb_has_frag_list(from)) | |
2212 | hlen = from->len; | |
2213 | ||
2214 | return hlen; | |
2215 | } | |
2216 | EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); | |
2217 | ||
2218 | /** | |
2219 | * skb_zerocopy - Zero copy skb to skb | |
2220 | * @to: destination buffer | |
2221 | * @from: source buffer | |
2222 | * @len: number of bytes to copy from source buffer | |
2223 | * @hlen: size of linear headroom in destination buffer | |
2224 | * | |
2225 | * Copies up to `len` bytes from `from` to `to` by creating references | |
2226 | * to the frags in the source buffer. | |
2227 | * | |
2228 | * The `hlen` as calculated by skb_zerocopy_headlen() specifies the | |
2229 | * headroom in the `to` buffer. | |
2230 | * | |
2231 | * Return value: | |
2232 | * 0: everything is OK | |
2233 | * -ENOMEM: couldn't orphan frags of @from due to lack of memory | |
2234 | * -EFAULT: skb_copy_bits() found some problem with skb geometry | |
2235 | */ | |
2236 | int | |
2237 | skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) | |
2238 | { | |
2239 | int i, j = 0; | |
2240 | int plen = 0; /* length of skb->head fragment */ | |
2241 | int ret; | |
2242 | struct page *page; | |
2243 | unsigned int offset; | |
2244 | ||
2245 | BUG_ON(!from->head_frag && !hlen); | |
2246 | ||
2247 | /* don't bother with small payloads */ | 
2248 | if (len <= skb_tailroom(to)) | |
2249 | return skb_copy_bits(from, 0, skb_put(to, len), len); | |
2250 | ||
2251 | if (hlen) { | |
2252 | ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); | |
2253 | if (unlikely(ret)) | |
2254 | return ret; | |
2255 | len -= hlen; | |
2256 | } else { | |
2257 | plen = min_t(int, skb_headlen(from), len); | |
2258 | if (plen) { | |
2259 | page = virt_to_head_page(from->head); | |
2260 | offset = from->data - (unsigned char *)page_address(page); | |
2261 | __skb_fill_page_desc(to, 0, page, offset, plen); | |
2262 | get_page(page); | |
2263 | j = 1; | |
2264 | len -= plen; | |
2265 | } | |
2266 | } | |
2267 | ||
2268 | to->truesize += len + plen; | |
2269 | to->len += len + plen; | |
2270 | to->data_len += len + plen; | |
2271 | ||
2272 | if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { | |
2273 | skb_tx_error(from); | |
2274 | return -ENOMEM; | |
2275 | } | |
2276 | ||
2277 | for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { | |
2278 | if (!len) | |
2279 | break; | |
2280 | skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; | |
2281 | skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); | |
2282 | len -= skb_shinfo(to)->frags[j].size; | |
2283 | skb_frag_ref(to, j); | |
2284 | j++; | |
2285 | } | |
2286 | skb_shinfo(to)->nr_frags = j; | |
2287 | ||
2288 | return 0; | |
2289 | } | |
2290 | EXPORT_SYMBOL_GPL(skb_zerocopy); | |
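/*
 * Illustrative usage sketch (not part of the original source): pair
 * skb_zerocopy_headlen() with skb_zerocopy() to build a new skb that
 * references the source frags instead of copying them.  The headroom
 * sizing and error handling are simplified assumptions.
 *
 *	unsigned int hlen = skb_zerocopy_headlen(from);
 *	struct sk_buff *to = alloc_skb(hlen, GFP_ATOMIC);
 *	int ret;
 *
 *	if (!to)
 *		return -ENOMEM;
 *	ret = skb_zerocopy(to, from, from->len, hlen);
 *	if (ret) {
 *		kfree_skb(to);
 *		return ret;
 *	}
 */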
2291 | ||
2292 | void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) | |
2293 | { | |
2294 | __wsum csum; | |
2295 | long csstart; | |
2296 | ||
2297 | if (skb->ip_summed == CHECKSUM_PARTIAL) | |
2298 | csstart = skb_checksum_start_offset(skb); | |
2299 | else | |
2300 | csstart = skb_headlen(skb); | |
2301 | ||
2302 | BUG_ON(csstart > skb_headlen(skb)); | |
2303 | ||
2304 | skb_copy_from_linear_data(skb, to, csstart); | |
2305 | ||
2306 | csum = 0; | |
2307 | if (csstart != skb->len) | |
2308 | csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, | |
2309 | skb->len - csstart, 0); | |
2310 | ||
2311 | if (skb->ip_summed == CHECKSUM_PARTIAL) { | |
2312 | long csstuff = csstart + skb->csum_offset; | |
2313 | ||
2314 | *((__sum16 *)(to + csstuff)) = csum_fold(csum); | |
2315 | } | |
2316 | } | |
2317 | EXPORT_SYMBOL(skb_copy_and_csum_dev); | |
2318 | ||
2319 | /** | |
2320 | * skb_dequeue - remove from the head of the queue | |
2321 | * @list: list to dequeue from | |
2322 | * | |
2323 | * Remove the head of the list. The list lock is taken so the function | |
2324 | * may be used safely with other locking list functions. The head item is | |
2325 | * returned or %NULL if the list is empty. | |
2326 | */ | |
2327 | ||
2328 | struct sk_buff *skb_dequeue(struct sk_buff_head *list) | |
2329 | { | |
2330 | unsigned long flags; | |
2331 | struct sk_buff *result; | |
2332 | ||
2333 | spin_lock_irqsave(&list->lock, flags); | |
2334 | result = __skb_dequeue(list); | |
2335 | spin_unlock_irqrestore(&list->lock, flags); | |
2336 | return result; | |
2337 | } | |
2338 | EXPORT_SYMBOL(skb_dequeue); | |
2339 | ||
2340 | /** | |
2341 | * skb_dequeue_tail - remove from the tail of the queue | |
2342 | * @list: list to dequeue from | |
2343 | * | |
2344 | * Remove the tail of the list. The list lock is taken so the function | |
2345 | * may be used safely with other locking list functions. The tail item is | |
2346 | * returned or %NULL if the list is empty. | |
2347 | */ | |
2348 | struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) | |
2349 | { | |
2350 | unsigned long flags; | |
2351 | struct sk_buff *result; | |
2352 | ||
2353 | spin_lock_irqsave(&list->lock, flags); | |
2354 | result = __skb_dequeue_tail(list); | |
2355 | spin_unlock_irqrestore(&list->lock, flags); | |
2356 | return result; | |
2357 | } | |
2358 | EXPORT_SYMBOL(skb_dequeue_tail); | |
2359 | ||
2360 | /** | |
2361 | * skb_queue_purge - empty a list | |
2362 | * @list: list to empty | |
2363 | * | |
2364 | * Delete all buffers on an &sk_buff list. Each buffer is removed from | |
2365 | * the list and one reference dropped. This function takes the list | |
2366 | * lock and is atomic with respect to other list locking functions. | |
2367 | */ | |
2368 | void skb_queue_purge(struct sk_buff_head *list) | |
2369 | { | |
2370 | struct sk_buff *skb; | |
2371 | while ((skb = skb_dequeue(list)) != NULL) | |
2372 | kfree_skb(skb); | |
2373 | } | |
2374 | EXPORT_SYMBOL(skb_queue_purge); | |
2375 | ||
2376 | /** | |
2377 | * skb_queue_head - queue a buffer at the list head | |
2378 | * @list: list to use | |
2379 | * @newsk: buffer to queue | |
2380 | * | |
2381 | * Queue a buffer at the start of the list. This function takes the | |
2382 | * list lock and can be used safely with other locking &sk_buff | 
2383 | * functions. | 
2384 | * | |
2385 | * A buffer cannot be placed on two lists at the same time. | |
2386 | */ | |
2387 | void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) | |
2388 | { | |
2389 | unsigned long flags; | |
2390 | ||
2391 | spin_lock_irqsave(&list->lock, flags); | |
2392 | __skb_queue_head(list, newsk); | |
2393 | spin_unlock_irqrestore(&list->lock, flags); | |
2394 | } | |
2395 | EXPORT_SYMBOL(skb_queue_head); | |
2396 | ||
2397 | /** | |
2398 | * skb_queue_tail - queue a buffer at the list tail | |
2399 | * @list: list to use | |
2400 | * @newsk: buffer to queue | |
2401 | * | |
2402 | * Queue a buffer at the tail of the list. This function takes the | |
2403 | * list lock and can be used safely with other locking &sk_buff | 
2404 | * functions. | 
2405 | * | |
2406 | * A buffer cannot be placed on two lists at the same time. | |
2407 | */ | |
2408 | void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) | |
2409 | { | |
2410 | unsigned long flags; | |
2411 | ||
2412 | spin_lock_irqsave(&list->lock, flags); | |
2413 | __skb_queue_tail(list, newsk); | |
2414 | spin_unlock_irqrestore(&list->lock, flags); | |
2415 | } | |
2416 | EXPORT_SYMBOL(skb_queue_tail); | |
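/*
 * Illustrative usage sketch (not part of the original source): the locked
 * queue helpers above are typically used in a simple producer/consumer
 * pattern, with the list lock taken internally on every operation.
 *
 *	struct sk_buff_head q;
 *
 *	skb_queue_head_init(&q);
 *
 *	skb_queue_tail(&q, skb);		(producer side)
 *
 *	while ((skb = skb_dequeue(&q)) != NULL)	(consumer side)
 *		kfree_skb(skb);
 */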
2417 | ||
2418 | /** | |
2419 | * skb_unlink - remove a buffer from a list | |
2420 | * @skb: buffer to remove | |
2421 | * @list: list to use | |
2422 | * | |
2423 | * Remove a packet from a list. The list locks are taken and this | |
2424 | * function is atomic with respect to other list locked calls. | 
2425 | * | |
2426 | * You must know what list the SKB is on. | |
2427 | */ | |
2428 | void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) | |
2429 | { | |
2430 | unsigned long flags; | |
2431 | ||
2432 | spin_lock_irqsave(&list->lock, flags); | |
2433 | __skb_unlink(skb, list); | |
2434 | spin_unlock_irqrestore(&list->lock, flags); | |
2435 | } | |
2436 | EXPORT_SYMBOL(skb_unlink); | |
2437 | ||
2438 | /** | |
2439 | * skb_append - append a buffer | |
2440 | * @old: buffer to insert after | |
2441 | * @newsk: buffer to insert | |
2442 | * @list: list to use | |
2443 | * | |
2444 | * Place a packet after a given packet in a list. The list locks are taken | |
2445 | * and this function is atomic with respect to other list locked calls. | |
2446 | * A buffer cannot be placed on two lists at the same time. | |
2447 | */ | |
2448 | void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) | |
2449 | { | |
2450 | unsigned long flags; | |
2451 | ||
2452 | spin_lock_irqsave(&list->lock, flags); | |
2453 | __skb_queue_after(list, old, newsk); | |
2454 | spin_unlock_irqrestore(&list->lock, flags); | |
2455 | } | |
2456 | EXPORT_SYMBOL(skb_append); | |
2457 | ||
2458 | /** | |
2459 | * skb_insert - insert a buffer | |
2460 | * @old: buffer to insert before | |
2461 | * @newsk: buffer to insert | |
2462 | * @list: list to use | |
2463 | * | |
2464 | * Place a packet before a given packet in a list. The list locks are | |
2465 | * taken and this function is atomic with respect to other list locked | |
2466 | * calls. | |
2467 | * | |
2468 | * A buffer cannot be placed on two lists at the same time. | |
2469 | */ | |
2470 | void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) | |
2471 | { | |
2472 | unsigned long flags; | |
2473 | ||
2474 | spin_lock_irqsave(&list->lock, flags); | |
2475 | __skb_insert(newsk, old->prev, old, list); | |
2476 | spin_unlock_irqrestore(&list->lock, flags); | |
2477 | } | |
2478 | EXPORT_SYMBOL(skb_insert); | |
2479 | ||
2480 | static inline void skb_split_inside_header(struct sk_buff *skb, | |
2481 | struct sk_buff* skb1, | |
2482 | const u32 len, const int pos) | |
2483 | { | |
2484 | int i; | |
2485 | ||
2486 | skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), | |
2487 | pos - len); | |
2488 | /* And move data appendix as is. */ | |
2489 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) | |
2490 | skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; | |
2491 | ||
2492 | skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; | |
2493 | skb_shinfo(skb)->nr_frags = 0; | |
2494 | skb1->data_len = skb->data_len; | |
2495 | skb1->len += skb1->data_len; | |
2496 | skb->data_len = 0; | |
2497 | skb->len = len; | |
2498 | skb_set_tail_pointer(skb, len); | |
2499 | } | |
2500 | ||
2501 | static inline void skb_split_no_header(struct sk_buff *skb, | |
2502 | struct sk_buff* skb1, | |
2503 | const u32 len, int pos) | |
2504 | { | |
2505 | int i, k = 0; | |
2506 | const int nfrags = skb_shinfo(skb)->nr_frags; | |
2507 | ||
2508 | skb_shinfo(skb)->nr_frags = 0; | |
2509 | skb1->len = skb1->data_len = skb->len - len; | |
2510 | skb->len = len; | |
2511 | skb->data_len = len - pos; | |
2512 | ||
2513 | for (i = 0; i < nfrags; i++) { | |
2514 | int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); | |
2515 | ||
2516 | if (pos + size > len) { | |
2517 | skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; | |
2518 | ||
2519 | if (pos < len) { | |
2520 | /* Split the frag. | 
2521 | * We have two variants in this case: | 
2522 | * 1. Move the whole frag to the second | 
2523 | * part, if possible. E.g. this | 
2524 | * approach is mandatory for TUX, | 
2525 | * where splitting is expensive. | 
2526 | * 2. Split accurately. This is what we do here. | 
2527 | */ | |
2528 | skb_frag_ref(skb, i); | |
2529 | skb_shinfo(skb1)->frags[0].page_offset += len - pos; | |
2530 | skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); | |
2531 | skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); | |
2532 | skb_shinfo(skb)->nr_frags++; | |
2533 | } | |
2534 | k++; | |
2535 | } else | |
2536 | skb_shinfo(skb)->nr_frags++; | |
2537 | pos += size; | |
2538 | } | |
2539 | skb_shinfo(skb1)->nr_frags = k; | |
2540 | } | |
2541 | ||
2542 | /** | |
2543 | * skb_split - Split fragmented skb to two parts at length len. | |
2544 | * @skb: the buffer to split | |
2545 | * @skb1: the buffer to receive the second part | |
2546 | * @len: new length for skb | |
2547 | */ | |
2548 | void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) | |
2549 | { | |
2550 | int pos = skb_headlen(skb); | |
2551 | ||
2552 | skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; | |
2553 | if (len < pos) /* Split line is inside header. */ | |
2554 | skb_split_inside_header(skb, skb1, len, pos); | |
2555 | else /* Second chunk has no header, nothing to copy. */ | |
2556 | skb_split_no_header(skb, skb1, len, pos); | |
2557 | } | |
2558 | EXPORT_SYMBOL(skb_split); | |
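/*
 * Illustrative usage sketch (not part of the original source): split a
 * buffer so that skb keeps the first mss bytes and a freshly allocated
 * buffer receives the rest, as a segmentation-style caller might do.  The
 * mss variable and error handling are assumptions for the example.
 *
 *	struct sk_buff *rest = alloc_skb(skb_headlen(skb), GFP_ATOMIC);
 *
 *	if (!rest)
 *		return -ENOMEM;
 *	skb_split(skb, rest, mss);
 */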
2559 | ||
2560 | /* Shifting from/to a cloned skb is a no-go. | |
2561 | * | |
2562 | * Caller cannot keep skb_shinfo related pointers past calling here! | |
2563 | */ | |
2564 | static int skb_prepare_for_shift(struct sk_buff *skb) | |
2565 | { | |
2566 | return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); | |
2567 | } | |
2568 | ||
2569 | /** | |
2570 | * skb_shift - Shifts paged data partially from skb to another | |
2571 | * @tgt: buffer into which tail data gets added | |
2572 | * @skb: buffer from which the paged data comes from | |
2573 | * @shiftlen: shift up to this many bytes | |
2574 | * | |
2575 | * Attempts to shift up to shiftlen worth of bytes, which may be less than | |
2576 | * the length of the skb, from skb to tgt. Returns the number of bytes shifted. | 
2577 | * It's up to the caller to free skb if everything was shifted. | 
2578 | * | |
2579 | * If @tgt runs out of frags, the whole operation is aborted. | |
2580 | * | |
2581 | * The skb cannot contain anything other than paged data, while tgt is | 
2582 | * allowed to have non-paged data as well. | 
2583 | * | |
2584 | * TODO: full sized shift could be optimized but that would need | |
2585 | * specialized skb free'er to handle frags without up-to-date nr_frags. | |
2586 | */ | |
2587 | int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) | |
2588 | { | |
2589 | int from, to, merge, todo; | |
2590 | struct skb_frag_struct *fragfrom, *fragto; | |
2591 | ||
2592 | BUG_ON(shiftlen > skb->len); | |
2593 | BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ | |
2594 | ||
2595 | todo = shiftlen; | |
2596 | from = 0; | |
2597 | to = skb_shinfo(tgt)->nr_frags; | |
2598 | fragfrom = &skb_shinfo(skb)->frags[from]; | |
2599 | ||
2600 | /* Actual merge is delayed until the point when we know we can | |
2601 | * commit all, so that we don't have to undo partial changes | |
2602 | */ | |
2603 | if (!to || | |
2604 | !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), | |
2605 | fragfrom->page_offset)) { | |
2606 | merge = -1; | |
2607 | } else { | |
2608 | merge = to - 1; | |
2609 | ||
2610 | todo -= skb_frag_size(fragfrom); | |
2611 | if (todo < 0) { | |
2612 | if (skb_prepare_for_shift(skb) || | |
2613 | skb_prepare_for_shift(tgt)) | |
2614 | return 0; | |
2615 | ||
2616 | /* All previous frag pointers might be stale! */ | |
2617 | fragfrom = &skb_shinfo(skb)->frags[from]; | |
2618 | fragto = &skb_shinfo(tgt)->frags[merge]; | |
2619 | ||
2620 | skb_frag_size_add(fragto, shiftlen); | |
2621 | skb_frag_size_sub(fragfrom, shiftlen); | |
2622 | fragfrom->page_offset += shiftlen; | |
2623 | ||
2624 | goto onlymerged; | |
2625 | } | |
2626 | ||
2627 | from++; | |
2628 | } | |
2629 | ||
2630 | /* Skip full, not-fitting skb to avoid expensive operations */ | |
2631 | if ((shiftlen == skb->len) && | |
2632 | (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) | |
2633 | return 0; | |
2634 | ||
2635 | if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) | |
2636 | return 0; | |
2637 | ||
2638 | while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { | |
2639 | if (to == MAX_SKB_FRAGS) | |
2640 | return 0; | |
2641 | ||
2642 | fragfrom = &skb_shinfo(skb)->frags[from]; | |
2643 | fragto = &skb_shinfo(tgt)->frags[to]; | |
2644 | ||
2645 | if (todo >= skb_frag_size(fragfrom)) { | |
2646 | *fragto = *fragfrom; | |
2647 | todo -= skb_frag_size(fragfrom); | |
2648 | from++; | |
2649 | to++; | |
2650 | ||
2651 | } else { | |
2652 | __skb_frag_ref(fragfrom); | |
2653 | fragto->page = fragfrom->page; | |
2654 | fragto->page_offset = fragfrom->page_offset; | |
2655 | skb_frag_size_set(fragto, todo); | |
2656 | ||
2657 | fragfrom->page_offset += todo; | |
2658 | skb_frag_size_sub(fragfrom, todo); | |
2659 | todo = 0; | |
2660 | ||
2661 | to++; | |
2662 | break; | |
2663 | } | |
2664 | } | |
2665 | ||
2666 | /* Ready to "commit" this state change to tgt */ | |
2667 | skb_shinfo(tgt)->nr_frags = to; | |
2668 | ||
2669 | if (merge >= 0) { | |
2670 | fragfrom = &skb_shinfo(skb)->frags[0]; | |
2671 | fragto = &skb_shinfo(tgt)->frags[merge]; | |
2672 | ||
2673 | skb_frag_size_add(fragto, skb_frag_size(fragfrom)); | |
2674 | __skb_frag_unref(fragfrom); | |
2675 | } | |
2676 | ||
2677 | /* Reposition in the original skb */ | |
2678 | to = 0; | |
2679 | while (from < skb_shinfo(skb)->nr_frags) | |
2680 | skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; | |
2681 | skb_shinfo(skb)->nr_frags = to; | |
2682 | ||
2683 | BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); | |
2684 | ||
2685 | onlymerged: | |
2686 | /* Most likely the tgt won't ever need its checksum anymore, skb on | |
2687 | * the other hand might need it if it needs to be resent | |
2688 | */ | |
2689 | tgt->ip_summed = CHECKSUM_PARTIAL; | |
2690 | skb->ip_summed = CHECKSUM_PARTIAL; | |
2691 | ||
2692 | /* Yak, is it really working this way? Some helper please? */ | |
2693 | skb->len -= shiftlen; | |
2694 | skb->data_len -= shiftlen; | |
2695 | skb->truesize -= shiftlen; | |
2696 | tgt->len += shiftlen; | |
2697 | tgt->data_len += shiftlen; | |
2698 | tgt->truesize += shiftlen; | |
2699 | ||
2700 | return shiftlen; | |
2701 | } | |
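/*
 * Illustrative usage sketch (not part of the original source): a
 * retransmit-collapsing caller (in the spirit of TCP's SACK shifting)
 * might try to move part of one purely paged skb into the previous one.
 * prev and shiftlen are assumptions for the example.
 *
 *	int shifted = skb_shift(prev, skb, shiftlen);
 *
 *	if (!shifted)
 *		return 0;
 *	if (skb->len == 0)
 *		(the caller unlinks and frees skb)
 */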
2702 | ||
2703 | /** | |
2704 | * skb_prepare_seq_read - Prepare a sequential read of skb data | |
2705 | * @skb: the buffer to read | |
2706 | * @from: lower offset of data to be read | |
2707 | * @to: upper offset of data to be read | |
2708 | * @st: state variable | |
2709 | * | |
2710 | * Initializes the specified state variable. Must be called before | |
2711 | * invoking skb_seq_read() for the first time. | |
2712 | */ | |
2713 | void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, | |
2714 | unsigned int to, struct skb_seq_state *st) | |
2715 | { | |
2716 | st->lower_offset = from; | |
2717 | st->upper_offset = to; | |
2718 | st->root_skb = st->cur_skb = skb; | |
2719 | st->frag_idx = st->stepped_offset = 0; | |
2720 | st->frag_data = NULL; | |
2721 | } | |
2722 | EXPORT_SYMBOL(skb_prepare_seq_read); | |
2723 | ||
2724 | /** | |
2725 | * skb_seq_read - Sequentially read skb data | |
2726 | * @consumed: number of bytes consumed by the caller so far | |
2727 | * @data: destination pointer for data to be returned | |
2728 | * @st: state variable | |
2729 | * | |
2730 | * Reads a block of skb data at @consumed relative to the | |
2731 | * lower offset specified to skb_prepare_seq_read(). Assigns | |
2732 | * the head of the data block to @data and returns the length | |
2733 | * of the block or 0 if the end of the skb data or the upper | |
2734 | * offset has been reached. | |
2735 | * | |
2736 | * The caller is not required to consume all of the data | |
2737 | * returned, i.e. @consumed is typically set to the number | |
2738 | * of bytes already consumed and the next call to | |
2739 | * skb_seq_read() will return the remaining part of the block. | |
2740 | * | |
2741 | * Note 1: The size of each block of data returned can be arbitrary; | 
2742 | * this limitation is the cost of zerocopy sequential | 
2743 | * reads of potentially non-linear data. | 
2744 | * | |
2745 | * Note 2: Fragment lists within fragments are not implemented | |
2746 | * at the moment, state->root_skb could be replaced with | |
2747 | * a stack for this purpose. | |
2748 | */ | |
2749 | unsigned int skb_seq_read(unsigned int consumed, const u8 **data, | |
2750 | struct skb_seq_state *st) | |
2751 | { | |
2752 | unsigned int block_limit, abs_offset = consumed + st->lower_offset; | |
2753 | skb_frag_t *frag; | |
2754 | ||
2755 | if (unlikely(abs_offset >= st->upper_offset)) { | |
2756 | if (st->frag_data) { | |
2757 | kunmap_atomic(st->frag_data); | |
2758 | st->frag_data = NULL; | |
2759 | } | |
2760 | return 0; | |
2761 | } | |
2762 | ||
2763 | next_skb: | |
2764 | block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; | |
2765 | ||
2766 | if (abs_offset < block_limit && !st->frag_data) { | |
2767 | *data = st->cur_skb->data + (abs_offset - st->stepped_offset); | |
2768 | return block_limit - abs_offset; | |
2769 | } | |
2770 | ||
2771 | if (st->frag_idx == 0 && !st->frag_data) | |
2772 | st->stepped_offset += skb_headlen(st->cur_skb); | |
2773 | ||
2774 | while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { | |
2775 | frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; | |
2776 | block_limit = skb_frag_size(frag) + st->stepped_offset; | |
2777 | ||
2778 | if (abs_offset < block_limit) { | |
2779 | if (!st->frag_data) | |
2780 | st->frag_data = kmap_atomic(skb_frag_page(frag)); | |
2781 | ||
2782 | *data = (u8 *) st->frag_data + frag->page_offset + | |
2783 | (abs_offset - st->stepped_offset); | |
2784 | ||
2785 | return block_limit - abs_offset; | |
2786 | } | |
2787 | ||
2788 | if (st->frag_data) { | |
2789 | kunmap_atomic(st->frag_data); | |
2790 | st->frag_data = NULL; | |
2791 | } | |
2792 | ||
2793 | st->frag_idx++; | |
2794 | st->stepped_offset += skb_frag_size(frag); | |
2795 | } | |
2796 | ||
2797 | if (st->frag_data) { | |
2798 | kunmap_atomic(st->frag_data); | |
2799 | st->frag_data = NULL; | |
2800 | } | |
2801 | ||
2802 | if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { | |
2803 | st->cur_skb = skb_shinfo(st->root_skb)->frag_list; | |
2804 | st->frag_idx = 0; | |
2805 | goto next_skb; | |
2806 | } else if (st->cur_skb->next) { | |
2807 | st->cur_skb = st->cur_skb->next; | |
2808 | st->frag_idx = 0; | |
2809 | goto next_skb; | |
2810 | } | |
2811 | ||
2812 | return 0; | |
2813 | } | |
2814 | EXPORT_SYMBOL(skb_seq_read); | |
2815 | ||
2816 | /** | |
2817 | * skb_abort_seq_read - Abort a sequential read of skb data | |
2818 | * @st: state variable | |
2819 | * | |
2820 | * Must be called if the sequential read was abandoned before | 
2821 | * skb_seq_read() returned 0. | 
2822 | */ | |
2823 | void skb_abort_seq_read(struct skb_seq_state *st) | |
2824 | { | |
2825 | if (st->frag_data) | |
2826 | kunmap_atomic(st->frag_data); | |
2827 | } | |
2828 | EXPORT_SYMBOL(skb_abort_seq_read); | |
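/*
 * Illustrative usage sketch (not part of the original source): a complete
 * sequential read over an skb with the three helpers above.  If the loop
 * were abandoned before skb_seq_read() returned 0, skb_abort_seq_read()
 * would have to be called instead.
 *
 *	struct skb_seq_state st;
 *	const u8 *data;
 *	unsigned int consumed = 0, len;
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		(process len bytes at data)
 *		consumed += len;
 *	}
 */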
2829 | ||
2830 | #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) | |
2831 | ||
2832 | static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, | |
2833 | struct ts_config *conf, | |
2834 | struct ts_state *state) | |
2835 | { | |
2836 | return skb_seq_read(offset, text, TS_SKB_CB(state)); | |
2837 | } | |
2838 | ||
2839 | static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) | |
2840 | { | |
2841 | skb_abort_seq_read(TS_SKB_CB(state)); | |
2842 | } | |
2843 | ||
2844 | /** | |
2845 | * skb_find_text - Find a text pattern in skb data | |
2846 | * @skb: the buffer to look in | |
2847 | * @from: search offset | |
2848 | * @to: search limit | |
2849 | * @config: textsearch configuration | |
2850 | * | |
2851 | * Finds a pattern in the skb data according to the specified | |
2852 | * textsearch configuration. Use textsearch_next() to retrieve | |
2853 | * subsequent occurrences of the pattern. Returns the offset | |
2854 | * to the first occurrence or UINT_MAX if no match was found. | |
2855 | */ | |
2856 | unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, | |
2857 | unsigned int to, struct ts_config *config) | |
2858 | { | |
2859 | struct ts_state state; | |
2860 | unsigned int ret; | |
2861 | ||
2862 | config->get_next_block = skb_ts_get_next_block; | |
2863 | config->finish = skb_ts_finish; | |
2864 | ||
2865 | skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); | |
2866 | ||
2867 | ret = textsearch_find(config, &state); | |
2868 | return (ret <= to - from ? ret : UINT_MAX); | |
2869 | } | |
2870 | EXPORT_SYMBOL(skb_find_text); | |
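/*
 * Illustrative usage sketch (not part of the original source): search the
 * whole packet for a literal pattern, as xt_string-style users do.  The
 * pattern and algorithm choice are assumptions for the example.
 *
 *	struct ts_config *conf;
 *	unsigned int pos;
 *
 *	conf = textsearch_prepare("kmp", "GET ", 4, GFP_KERNEL, TS_AUTOLOAD);
 *	if (IS_ERR(conf))
 *		return PTR_ERR(conf);
 *	pos = skb_find_text(skb, 0, skb->len, conf);
 *	textsearch_destroy(conf);
 *	(pos == UINT_MAX means no match)
 */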
2871 | ||
2872 | /** | |
2873 | * skb_append_datato_frags - append the user data to a skb | |
2874 | * @sk: sock structure | |
2875 | * @skb: skb structure to be appended with user data. | |
2876 | * @getfrag: call back function to be used for getting the user data | |
2877 | * @from: pointer to user message iov | |
2878 | * @length: length of the iov message | |
2879 | * | |
2880 | * Description: This procedure appends the user data to the fragment part | 
2881 | * of the skb. If any page allocation fails, this procedure returns -ENOMEM. | 
2882 | */ | |
2883 | int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, | |
2884 | int (*getfrag)(void *from, char *to, int offset, | |
2885 | int len, int odd, struct sk_buff *skb), | |
2886 | void *from, int length) | |
2887 | { | |
2888 | int frg_cnt = skb_shinfo(skb)->nr_frags; | |
2889 | int copy; | |
2890 | int offset = 0; | |
2891 | int ret; | |
2892 | struct page_frag *pfrag = ¤t->task_frag; | |
2893 | ||
2894 | do { | |
2895 | /* Return error if we don't have space for new frag */ | |
2896 | if (frg_cnt >= MAX_SKB_FRAGS) | |
2897 | return -EMSGSIZE; | |
2898 | ||
2899 | if (!sk_page_frag_refill(sk, pfrag)) | |
2900 | return -ENOMEM; | |
2901 | ||
2902 | /* copy the user data to page */ | |
2903 | copy = min_t(int, length, pfrag->size - pfrag->offset); | |
2904 | ||
2905 | ret = getfrag(from, page_address(pfrag->page) + pfrag->offset, | |
2906 | offset, copy, 0, skb); | |
2907 | if (ret < 0) | |
2908 | return -EFAULT; | |
2909 | ||
2910 | /* copy was successful so update the size parameters */ | |
2911 | skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset, | |
2912 | copy); | |
2913 | frg_cnt++; | |
2914 | pfrag->offset += copy; | |
2915 | get_page(pfrag->page); | |
2916 | ||
2917 | skb->truesize += copy; | |
2918 | atomic_add(copy, &sk->sk_wmem_alloc); | |
2919 | skb->len += copy; | |
2920 | skb->data_len += copy; | |
2921 | offset += copy; | |
2922 | length -= copy; | |
2923 | ||
2924 | } while (length > 0); | |
2925 | ||
2926 | return 0; | |
2927 | } | |
2928 | EXPORT_SYMBOL(skb_append_datato_frags); | |
2929 | ||
2930 | int skb_append_pagefrags(struct sk_buff *skb, struct page *page, | |
2931 | int offset, size_t size) | |
2932 | { | |
2933 | int i = skb_shinfo(skb)->nr_frags; | |
2934 | ||
2935 | if (skb_can_coalesce(skb, i, page, offset)) { | |
2936 | skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); | |
2937 | } else if (i < MAX_SKB_FRAGS) { | |
2938 | get_page(page); | |
2939 | skb_fill_page_desc(skb, i, page, offset, size); | |
2940 | } else { | |
2941 | return -EMSGSIZE; | |
2942 | } | |
2943 | ||
2944 | return 0; | |
2945 | } | |
2946 | EXPORT_SYMBOL_GPL(skb_append_pagefrags); | |
2947 | ||
2948 | /** | |
2949 | * skb_pull_rcsum - pull skb and update receive checksum | |
2950 | * @skb: buffer to update | |
2951 | * @len: length of data pulled | |
2952 | * | |
2953 | * This function performs an skb_pull on the packet and updates | |
2954 | * the CHECKSUM_COMPLETE checksum. It should be used on | |
2955 | * receive path processing instead of skb_pull unless you know | |
2956 | * that the checksum difference is zero (e.g., a valid IP header) | |
2957 | * or you are setting ip_summed to CHECKSUM_NONE. | |
2958 | */ | |
2959 | unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) | |
2960 | { | |
2961 | BUG_ON(len > skb->len); | |
2962 | skb->len -= len; | |
2963 | BUG_ON(skb->len < skb->data_len); | |
2964 | skb_postpull_rcsum(skb, skb->data, len); | |
2965 | return skb->data += len; | |
2966 | } | |
2967 | EXPORT_SYMBOL_GPL(skb_pull_rcsum); | |
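/*
 * Illustrative usage sketch (not part of the original source): strip an
 * encapsulation header on the receive path while keeping a
 * CHECKSUM_COMPLETE value consistent.  hdr_len and the drop label are
 * assumptions for the example.
 *
 *	if (!pskb_may_pull(skb, hdr_len))
 *		goto drop;
 *	skb_pull_rcsum(skb, hdr_len);
 */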
2968 | ||
2969 | /** | |
2970 | * skb_segment - Perform protocol segmentation on skb. | |
2971 | * @head_skb: buffer to segment | |
2972 | * @features: features for the output path (see dev->features) | |
2973 | * | |
2974 | * This function performs segmentation on the given skb. It returns | |
2975 | * a pointer to the first in a list of new skbs for the segments. | |
2976 | * In case of error it returns ERR_PTR(err). | |
2977 | */ | |
2978 | struct sk_buff *skb_segment(struct sk_buff *head_skb, | |
2979 | netdev_features_t features) | |
2980 | { | |
2981 | struct sk_buff *segs = NULL; | |
2982 | struct sk_buff *tail = NULL; | |
2983 | struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; | |
2984 | skb_frag_t *frag = skb_shinfo(head_skb)->frags; | |
2985 | unsigned int mss = skb_shinfo(head_skb)->gso_size; | |
2986 | unsigned int doffset = head_skb->data - skb_mac_header(head_skb); | |
2987 | struct sk_buff *frag_skb = head_skb; | |
2988 | unsigned int offset = doffset; | |
2989 | unsigned int tnl_hlen = skb_tnl_header_len(head_skb); | |
2990 | unsigned int headroom; | |
2991 | unsigned int len; | |
2992 | __be16 proto; | |
2993 | bool csum; | |
2994 | int sg = !!(features & NETIF_F_SG); | |
2995 | int nfrags = skb_shinfo(head_skb)->nr_frags; | |
2996 | int err = -ENOMEM; | |
2997 | int i = 0; | |
2998 | int pos; | |
2999 | int dummy; | |
3000 | ||
3001 | __skb_push(head_skb, doffset); | |
3002 | proto = skb_network_protocol(head_skb, &dummy); | |
3003 | if (unlikely(!proto)) | |
3004 | return ERR_PTR(-EINVAL); | |
3005 | ||
3006 | csum = !head_skb->encap_hdr_csum && | |
3007 | !!can_checksum_protocol(features, proto); | |
3008 | ||
3009 | headroom = skb_headroom(head_skb); | |
3010 | pos = skb_headlen(head_skb); | |
3011 | ||
3012 | do { | |
3013 | struct sk_buff *nskb; | |
3014 | skb_frag_t *nskb_frag; | |
3015 | int hsize; | |
3016 | int size; | |
3017 | ||
3018 | len = head_skb->len - offset; | |
3019 | if (len > mss) | |
3020 | len = mss; | |
3021 | ||
3022 | hsize = skb_headlen(head_skb) - offset; | |
3023 | if (hsize < 0) | |
3024 | hsize = 0; | |
3025 | if (hsize > len || !sg) | |
3026 | hsize = len; | |
3027 | ||
3028 | if (!hsize && i >= nfrags && skb_headlen(list_skb) && | |
3029 | (skb_headlen(list_skb) == len || sg)) { | |
3030 | BUG_ON(skb_headlen(list_skb) > len); | |
3031 | ||
3032 | i = 0; | |
3033 | nfrags = skb_shinfo(list_skb)->nr_frags; | |
3034 | frag = skb_shinfo(list_skb)->frags; | |
3035 | frag_skb = list_skb; | |
3036 | pos += skb_headlen(list_skb); | |
3037 | ||
3038 | while (pos < offset + len) { | |
3039 | BUG_ON(i >= nfrags); | |
3040 | ||
3041 | size = skb_frag_size(frag); | |
3042 | if (pos + size > offset + len) | |
3043 | break; | |
3044 | ||
3045 | i++; | |
3046 | pos += size; | |
3047 | frag++; | |
3048 | } | |
3049 | ||
3050 | nskb = skb_clone(list_skb, GFP_ATOMIC); | |
3051 | list_skb = list_skb->next; | |
3052 | ||
3053 | if (unlikely(!nskb)) | |
3054 | goto err; | |
3055 | ||
3056 | if (unlikely(pskb_trim(nskb, len))) { | |
3057 | kfree_skb(nskb); | |
3058 | goto err; | |
3059 | } | |
3060 | ||
3061 | hsize = skb_end_offset(nskb); | |
3062 | if (skb_cow_head(nskb, doffset + headroom)) { | |
3063 | kfree_skb(nskb); | |
3064 | goto err; | |
3065 | } | |
3066 | ||
3067 | nskb->truesize += skb_end_offset(nskb) - hsize; | |
3068 | skb_release_head_state(nskb); | |
3069 | __skb_push(nskb, doffset); | |
3070 | } else { | |
3071 | nskb = __alloc_skb(hsize + doffset + headroom, | |
3072 | GFP_ATOMIC, skb_alloc_rx_flag(head_skb), | |
3073 | NUMA_NO_NODE); | |
3074 | ||
3075 | if (unlikely(!nskb)) | |
3076 | goto err; | |
3077 | ||
3078 | skb_reserve(nskb, headroom); | |
3079 | __skb_put(nskb, doffset); | |
3080 | } | |
3081 | ||
3082 | if (segs) | |
3083 | tail->next = nskb; | |
3084 | else | |
3085 | segs = nskb; | |
3086 | tail = nskb; | |
3087 | ||
3088 | __copy_skb_header(nskb, head_skb); | |
3089 | ||
3090 | skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); | |
3091 | skb_reset_mac_len(nskb); | |
3092 | ||
3093 | skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, | |
3094 | nskb->data - tnl_hlen, | |
3095 | doffset + tnl_hlen); | |
3096 | ||
3097 | if (nskb->len == len + doffset) | |
3098 | goto perform_csum_check; | |
3099 | ||
3100 | if (!sg && !nskb->remcsum_offload) { | |
3101 | nskb->ip_summed = CHECKSUM_NONE; | |
3102 | nskb->csum = skb_copy_and_csum_bits(head_skb, offset, | |
3103 | skb_put(nskb, len), | |
3104 | len, 0); | |
3105 | SKB_GSO_CB(nskb)->csum_start = | |
3106 | skb_headroom(nskb) + doffset; | |
3107 | continue; | |
3108 | } | |
3109 | ||
3110 | nskb_frag = skb_shinfo(nskb)->frags; | |
3111 | ||
3112 | skb_copy_from_linear_data_offset(head_skb, offset, | |
3113 | skb_put(nskb, hsize), hsize); | |
3114 | ||
3115 | skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags & | |
3116 | SKBTX_SHARED_FRAG; | |
3117 | ||
3118 | while (pos < offset + len) { | |
3119 | if (i >= nfrags) { | |
3120 | BUG_ON(skb_headlen(list_skb)); | |
3121 | ||
3122 | i = 0; | |
3123 | nfrags = skb_shinfo(list_skb)->nr_frags; | |
3124 | frag = skb_shinfo(list_skb)->frags; | |
3125 | frag_skb = list_skb; | |
3126 | ||
3127 | BUG_ON(!nfrags); | |
3128 | ||
3129 | list_skb = list_skb->next; | |
3130 | } | |
3131 | ||
3132 | if (unlikely(skb_shinfo(nskb)->nr_frags >= | |
3133 | MAX_SKB_FRAGS)) { | |
3134 | net_warn_ratelimited( | |
3135 | "skb_segment: too many frags: %u %u\n", | |
3136 | pos, mss); | |
3137 | goto err; | |
3138 | } | |
3139 | ||
3140 | if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) | |
3141 | goto err; | |
3142 | ||
3143 | *nskb_frag = *frag; | |
3144 | __skb_frag_ref(nskb_frag); | |
3145 | size = skb_frag_size(nskb_frag); | |
3146 | ||
3147 | if (pos < offset) { | |
3148 | nskb_frag->page_offset += offset - pos; | |
3149 | skb_frag_size_sub(nskb_frag, offset - pos); | |
3150 | } | |
3151 | ||
3152 | skb_shinfo(nskb)->nr_frags++; | |
3153 | ||
3154 | if (pos + size <= offset + len) { | |
3155 | i++; | |
3156 | frag++; | |
3157 | pos += size; | |
3158 | } else { | |
3159 | skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); | |
3160 | goto skip_fraglist; | |
3161 | } | |
3162 | ||
3163 | nskb_frag++; | |
3164 | } | |
3165 | ||
3166 | skip_fraglist: | |
3167 | nskb->data_len = len - hsize; | |
3168 | nskb->len += nskb->data_len; | |
3169 | nskb->truesize += nskb->data_len; | |
3170 | ||
3171 | perform_csum_check: | |
3172 | if (!csum && !nskb->remcsum_offload) { | |
3173 | nskb->csum = skb_checksum(nskb, doffset, | |
3174 | nskb->len - doffset, 0); | |
3175 | nskb->ip_summed = CHECKSUM_NONE; | |
3176 | SKB_GSO_CB(nskb)->csum_start = | |
3177 | skb_headroom(nskb) + doffset; | |
3178 | } | |
3179 | } while ((offset += len) < head_skb->len); | |
3180 | ||
3181 | /* Some callers want to get the end of the list. | |
3182 | * Put it in segs->prev to avoid walking the list. | |
3183 | * (see validate_xmit_skb_list() for example) | |
3184 | */ | |
3185 | segs->prev = tail; | |
3186 | ||
3187 | /* The following permits correct backpressure for protocols | 
3188 | * using skb_set_owner_w(). | 
3189 | * The idea is to transfer ownership from head_skb to the last segment. | 
3190 | */ | |
3191 | if (head_skb->destructor == sock_wfree) { | |
3192 | swap(tail->truesize, head_skb->truesize); | |
3193 | swap(tail->destructor, head_skb->destructor); | |
3194 | swap(tail->sk, head_skb->sk); | |
3195 | } | |
3196 | return segs; | |
3197 | ||
3198 | err: | |
3199 | kfree_skb_list(segs); | |
3200 | return ERR_PTR(err); | |
3201 | } | |
3202 | EXPORT_SYMBOL_GPL(skb_segment); | |
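/*
 * Illustrative usage sketch (not part of the original source): callers such
 * as the GSO layer consume the returned list segment by segment.  The loop
 * body is a placeholder.
 *
 *	struct sk_buff *segs, *nskb;
 *
 *	segs = skb_segment(skb, features);
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	for (nskb = segs; nskb; nskb = nskb->next) {
 *		(each segment carries at most gso_size bytes of payload)
 *	}
 */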
3203 | ||
3204 | int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) | |
3205 | { | |
3206 | struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); | |
3207 | unsigned int offset = skb_gro_offset(skb); | |
3208 | unsigned int headlen = skb_headlen(skb); | |
3209 | unsigned int len = skb_gro_len(skb); | |
3210 | struct sk_buff *lp, *p = *head; | |
3211 | unsigned int delta_truesize; | |
3212 | ||
3213 | if (unlikely(p->len + len >= 65536)) | |
3214 | return -E2BIG; | |
3215 | ||
3216 | lp = NAPI_GRO_CB(p)->last; | |
3217 | pinfo = skb_shinfo(lp); | |
3218 | ||
3219 | if (headlen <= offset) { | |
3220 | skb_frag_t *frag; | |
3221 | skb_frag_t *frag2; | |
3222 | int i = skbinfo->nr_frags; | |
3223 | int nr_frags = pinfo->nr_frags + i; | |
3224 | ||
3225 | if (nr_frags > MAX_SKB_FRAGS) | |
3226 | goto merge; | |
3227 | ||
3228 | offset -= headlen; | |
3229 | pinfo->nr_frags = nr_frags; | |
3230 | skbinfo->nr_frags = 0; | |
3231 | ||
3232 | frag = pinfo->frags + nr_frags; | |
3233 | frag2 = skbinfo->frags + i; | |
3234 | do { | |
3235 | *--frag = *--frag2; | |
3236 | } while (--i); | |
3237 | ||
3238 | frag->page_offset += offset; | |
3239 | skb_frag_size_sub(frag, offset); | |
3240 | ||
3241 | /* all fragments truesize : remove (head size + sk_buff) */ | |
3242 | delta_truesize = skb->truesize - | |
3243 | SKB_TRUESIZE(skb_end_offset(skb)); | |
3244 | ||
3245 | skb->truesize -= skb->data_len; | |
3246 | skb->len -= skb->data_len; | |
3247 | skb->data_len = 0; | |
3248 | ||
3249 | NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; | |
3250 | goto done; | |
3251 | } else if (skb->head_frag) { | |
3252 | int nr_frags = pinfo->nr_frags; | |
3253 | skb_frag_t *frag = pinfo->frags + nr_frags; | |
3254 | struct page *page = virt_to_head_page(skb->head); | |
3255 | unsigned int first_size = headlen - offset; | |
3256 | unsigned int first_offset; | |
3257 | ||
3258 | if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) | |
3259 | goto merge; | |
3260 | ||
3261 | first_offset = skb->data - | |
3262 | (unsigned char *)page_address(page) + | |
3263 | offset; | |
3264 | ||
3265 | pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; | |
3266 | ||
3267 | frag->page.p = page; | |
3268 | frag->page_offset = first_offset; | |
3269 | skb_frag_size_set(frag, first_size); | |
3270 | ||
3271 | memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); | |
3272 | /* We don't need to clear skbinfo->nr_frags here */ | 
3273 | ||
3274 | delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); | |
3275 | NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; | |
3276 | goto done; | |
3277 | } | |
3278 | ||
3279 | merge: | |
3280 | delta_truesize = skb->truesize; | |
3281 | if (offset > headlen) { | |
3282 | unsigned int eat = offset - headlen; | |
3283 | ||
3284 | skbinfo->frags[0].page_offset += eat; | |
3285 | skb_frag_size_sub(&skbinfo->frags[0], eat); | |
3286 | skb->data_len -= eat; | |
3287 | skb->len -= eat; | |
3288 | offset = headlen; | |
3289 | } | |
3290 | ||
3291 | __skb_pull(skb, offset); | |
3292 | ||
3293 | if (NAPI_GRO_CB(p)->last == p) | |
3294 | skb_shinfo(p)->frag_list = skb; | |
3295 | else | |
3296 | NAPI_GRO_CB(p)->last->next = skb; | |
3297 | NAPI_GRO_CB(p)->last = skb; | |
3298 | __skb_header_release(skb); | |
3299 | lp = p; | |
3300 | ||
3301 | done: | |
3302 | NAPI_GRO_CB(p)->count++; | |
3303 | p->data_len += len; | |
3304 | p->truesize += delta_truesize; | |
3305 | p->len += len; | |
3306 | if (lp != p) { | |
3307 | lp->data_len += len; | |
3308 | lp->truesize += delta_truesize; | |
3309 | lp->len += len; | |
3310 | } | |
3311 | NAPI_GRO_CB(skb)->same_flow = 1; | |
3312 | return 0; | |
3313 | } | |
3314 | ||
3315 | void __init skb_init(void) | |
3316 | { | |
3317 | skbuff_head_cache = kmem_cache_create("skbuff_head_cache", | |
3318 | sizeof(struct sk_buff), | |
3319 | 0, | |
3320 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, | |
3321 | NULL); | |
3322 | skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", | |
3323 | sizeof(struct sk_buff_fclones), | |
3324 | 0, | |
3325 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, | |
3326 | NULL); | |
3327 | } | |
3328 | ||
3329 | /** | |
3330 | * skb_to_sgvec - Fill a scatter-gather list from a socket buffer | |
3331 | * @skb: Socket buffer containing the buffers to be mapped | |
3332 | * @sg: The scatter-gather list to map into | |
3333 | * @offset: The offset into the buffer's contents to start mapping | |
3334 | * @len: Length of buffer space to be mapped | |
3335 | * | |
3336 | * Fill the specified scatter-gather list with mappings/pointers into a | |
3337 | * region of the buffer space attached to a socket buffer. | |
3338 | */ | |
3339 | static int | |
3340 | __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) | |
3341 | { | |
3342 | int start = skb_headlen(skb); | |
3343 | int i, copy = start - offset; | |
3344 | struct sk_buff *frag_iter; | |
3345 | int elt = 0; | |
3346 | ||
3347 | if (copy > 0) { | |
3348 | if (copy > len) | |
3349 | copy = len; | |
3350 | sg_set_buf(sg, skb->data + offset, copy); | |
3351 | elt++; | |
3352 | if ((len -= copy) == 0) | |
3353 | return elt; | |
3354 | offset += copy; | |
3355 | } | |
3356 | ||
3357 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { | |
3358 | int end; | |
3359 | ||
3360 | WARN_ON(start > offset + len); | |
3361 | ||
3362 | end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); | |
3363 | if ((copy = end - offset) > 0) { | |
3364 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; | |
3365 | ||
3366 | if (copy > len) | |
3367 | copy = len; | |
3368 | sg_set_page(&sg[elt], skb_frag_page(frag), copy, | |
3369 | frag->page_offset+offset-start); | |
3370 | elt++; | |
3371 | if (!(len -= copy)) | |
3372 | return elt; | |
3373 | offset += copy; | |
3374 | } | |
3375 | start = end; | |
3376 | } | |
3377 | ||
3378 | skb_walk_frags(skb, frag_iter) { | |
3379 | int end; | |
3380 | ||
3381 | WARN_ON(start > offset + len); | |
3382 | ||
3383 | end = start + frag_iter->len; | |
3384 | if ((copy = end - offset) > 0) { | |
3385 | if (copy > len) | |
3386 | copy = len; | |
3387 | elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, | |
3388 | copy); | |
3389 | if ((len -= copy) == 0) | |
3390 | return elt; | |
3391 | offset += copy; | |
3392 | } | |
3393 | start = end; | |
3394 | } | |
3395 | BUG_ON(len); | |
3396 | return elt; | |
3397 | } | |
3398 | ||
3399 | /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the | |
3400 | * given sglist without marking the sg which contains the last skb data as the end. | |
3401 | * So the caller can manipulate the sg list at will when appending new data after | |
3402 | * the first call, without calling sg_unmark_end to expand the sg list. | |
3403 | * | |
3404 | * Scenario to use skb_to_sgvec_nomark: | |
3405 | * 1. sg_init_table | |
3406 | * 2. skb_to_sgvec_nomark(payload1) | |
3407 | * 3. skb_to_sgvec_nomark(payload2) | |
3408 | * | |
3409 | * This is equivalent to: | |
3410 | * 1. sg_init_table | |
3411 | * 2. skb_to_sgvec(payload1) | |
3412 | * 3. sg_unmark_end | |
3413 | * 4. skb_to_sgvec(payload2) | |
3414 | * | |
3415 | * When mapping multiple payloads conditionally, skb_to_sgvec_nomark | |
3416 | * is preferable. | |
3417 | */ | |
3418 | int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, | |
3419 | int offset, int len) | |
3420 | { | |
3421 | return __skb_to_sgvec(skb, sg, offset, len); | |
3422 | } | |
3423 | EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); | |
3424 | ||
3425 | int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) | |
3426 | { | |
3427 | int nsg = __skb_to_sgvec(skb, sg, offset, len); | |
3428 | ||
3429 | sg_mark_end(&sg[nsg - 1]); | |
3430 | ||
3431 | return nsg; | |
3432 | } | |
3433 | EXPORT_SYMBOL_GPL(skb_to_sgvec); | |
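/* Illustrative sketch (not part of the original file): mapping two skbs into
 * one scatterlist, following the skb_to_sgvec_nomark scenario described in the
 * comment above. The function name and the max_sg bound are hypothetical, and
 * the block is guarded out so it does not affect the build.
 */
#if 0	/* example only */
static int example_map_two_payloads(struct sk_buff *payload1,
				    struct sk_buff *payload2,
				    struct scatterlist *sg, int max_sg)
{
	int nsg;

	sg_init_table(sg, max_sg);

	/* Neither call marks its last entry as the end of the chain... */
	nsg = skb_to_sgvec_nomark(payload1, sg, 0, payload1->len);
	nsg += skb_to_sgvec_nomark(payload2, sg + nsg, 0, payload2->len);

	/* ...so terminate the chain once, when all data has been mapped. */
	sg_mark_end(&sg[nsg - 1]);
	return nsg;
}
#endif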
3434 | ||
3435 | /** | |
3436 | * skb_cow_data - Check that a socket buffer's data buffers are writable | |
3437 | * @skb: The socket buffer to check. | |
3438 | * @tailbits: Amount of trailing space to be added | |
3439 | * @trailer: Returned pointer to the skb where the @tailbits space begins | |
3440 | * | |
3441 | * Make sure that the data buffers attached to a socket buffer are | |
3442 | * writable. If they are not, private copies are made of the data buffers | |
3443 | * and the socket buffer is set to use these instead. | |
3444 | * | |
3445 | * If @tailbits is given, make sure that there is space to write @tailbits | |
3446 | * bytes of data beyond current end of socket buffer. @trailer will be | |
3447 | * set to point to the skb in which this space begins. | |
3448 | * | |
3449 | * The number of scatterlist elements required to completely map the | |
3450 | * COW'd and extended socket buffer will be returned. | |
3451 | */ | |
3452 | int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) | |
3453 | { | |
3454 | int copyflag; | |
3455 | int elt; | |
3456 | struct sk_buff *skb1, **skb_p; | |
3457 | ||
3458 | /* If skb is cloned or its head is paged, reallocate | |
3459 | * head pulling out all the pages (pages are considered not writable | |
3460 | * at the moment even if they are anonymous). | |
3461 | */ | |
3462 | if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && | |
3463 | __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) | |
3464 | return -ENOMEM; | |
3465 | ||
3466 | /* Easy case. Most packets will go this way. */ | |
3467 | if (!skb_has_frag_list(skb)) { | |
3468 | /* A little trouble: not enough space for the trailer. | |
3469 | * This should not happen when the stack is tuned to generate | |
3470 | * good frames. OK, on a miss we reallocate and reserve even more | |
3471 | * space; 128 bytes is fair. */ | |
3472 | ||
3473 | if (skb_tailroom(skb) < tailbits && | |
3474 | pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) | |
3475 | return -ENOMEM; | |
3476 | ||
3477 | /* Voila! */ | |
3478 | *trailer = skb; | |
3479 | return 1; | |
3480 | } | |
3481 | ||
3482 | /* Misery. We are in trouble; going to mince the fragments... */ | |
3483 | ||
3484 | elt = 1; | |
3485 | skb_p = &skb_shinfo(skb)->frag_list; | |
3486 | copyflag = 0; | |
3487 | ||
3488 | while ((skb1 = *skb_p) != NULL) { | |
3489 | int ntail = 0; | |
3490 | ||
3491 | /* The fragment was partially pulled by someone; | |
3492 | * this can happen on input. Copy it and everything | |
3493 | * after it. */ | |
3494 | ||
3495 | if (skb_shared(skb1)) | |
3496 | copyflag = 1; | |
3497 | ||
3498 | /* If the skb is the last, worry about trailer. */ | |
3499 | ||
3500 | if (skb1->next == NULL && tailbits) { | |
3501 | if (skb_shinfo(skb1)->nr_frags || | |
3502 | skb_has_frag_list(skb1) || | |
3503 | skb_tailroom(skb1) < tailbits) | |
3504 | ntail = tailbits + 128; | |
3505 | } | |
3506 | ||
3507 | if (copyflag || | |
3508 | skb_cloned(skb1) || | |
3509 | ntail || | |
3510 | skb_shinfo(skb1)->nr_frags || | |
3511 | skb_has_frag_list(skb1)) { | |
3512 | struct sk_buff *skb2; | |
3513 | ||
3514 | /* Worst case: we have to make a private copy. */ | |
3515 | if (ntail == 0) | |
3516 | skb2 = skb_copy(skb1, GFP_ATOMIC); | |
3517 | else | |
3518 | skb2 = skb_copy_expand(skb1, | |
3519 | skb_headroom(skb1), | |
3520 | ntail, | |
3521 | GFP_ATOMIC); | |
3522 | if (unlikely(skb2 == NULL)) | |
3523 | return -ENOMEM; | |
3524 | ||
3525 | if (skb1->sk) | |
3526 | skb_set_owner_w(skb2, skb1->sk); | |
3527 | ||
3528 | /* Still alive? | |
3529 | * OK, link the new skb and drop the old one. */ | |
3530 | ||
3531 | skb2->next = skb1->next; | |
3532 | *skb_p = skb2; | |
3533 | kfree_skb(skb1); | |
3534 | skb1 = skb2; | |
3535 | } | |
3536 | elt++; | |
3537 | *trailer = skb1; | |
3538 | skb_p = &skb1->next; | |
3539 | } | |
3540 | ||
3541 | return elt; | |
3542 | } | |
3543 | EXPORT_SYMBOL_GPL(skb_cow_data); | |
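/* Illustrative sketch (not part of the original file): the usual pairing of
 * skb_cow_data() and skb_to_sgvec(), in the spirit of what IPsec transforms
 * do. The function name, trailer_len parameter and the on-stack scatterlist
 * bound are hypothetical; real callers size the scatterlist from the returned
 * element count. Guarded out so it does not affect the build.
 */
#if 0	/* example only */
static int example_cow_and_map(struct sk_buff *skb, int trailer_len)
{
	struct scatterlist sg[MAX_SKB_FRAGS + 1];
	struct sk_buff *trailer;
	int nfrags;

	/* Make every buffer private and ensure trailer_len bytes fit at the end */
	nfrags = skb_cow_data(skb, trailer_len, &trailer);
	if (nfrags < 0)
		return nfrags;
	if (nfrags > (int)ARRAY_SIZE(sg))
		return -EMSGSIZE;

	/* The trailer bytes become part of the packet */
	pskb_put(skb, trailer, trailer_len);

	/* Map the (now writable) data; nfrags bounds the entries needed */
	sg_init_table(sg, nfrags);
	skb_to_sgvec(skb, sg, 0, skb->len);
	return 0;
}
#endif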
3544 | ||
3545 | static void sock_rmem_free(struct sk_buff *skb) | |
3546 | { | |
3547 | struct sock *sk = skb->sk; | |
3548 | ||
3549 | atomic_sub(skb->truesize, &sk->sk_rmem_alloc); | |
3550 | } | |
3551 | ||
3552 | /* | |
3553 | * Note: We don't mem charge error packets (no sk_forward_alloc changes) | |
3554 | */ | |
3555 | int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) | |
3556 | { | |
3557 | if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= | |
3558 | (unsigned int)sk->sk_rcvbuf) | |
3559 | return -ENOMEM; | |
3560 | ||
3561 | skb_orphan(skb); | |
3562 | skb->sk = sk; | |
3563 | skb->destructor = sock_rmem_free; | |
3564 | atomic_add(skb->truesize, &sk->sk_rmem_alloc); | |
3565 | ||
3566 | /* before exiting rcu section, make sure dst is refcounted */ | |
3567 | skb_dst_force(skb); | |
3568 | ||
3569 | skb_queue_tail(&sk->sk_error_queue, skb); | |
3570 | if (!sock_flag(sk, SOCK_DEAD)) | |
3571 | sk->sk_data_ready(sk); | |
3572 | return 0; | |
3573 | } | |
3574 | EXPORT_SYMBOL(sock_queue_err_skb); | |
3575 | ||
3576 | struct sk_buff *sock_dequeue_err_skb(struct sock *sk) | |
3577 | { | |
3578 | struct sk_buff_head *q = &sk->sk_error_queue; | |
3579 | struct sk_buff *skb, *skb_next; | |
3580 | unsigned long flags; | |
3581 | int err = 0; | |
3582 | ||
3583 | spin_lock_irqsave(&q->lock, flags); | |
3584 | skb = __skb_dequeue(q); | |
3585 | if (skb && (skb_next = skb_peek(q))) | |
3586 | err = SKB_EXT_ERR(skb_next)->ee.ee_errno; | |
3587 | spin_unlock_irqrestore(&q->lock, flags); | |
3588 | ||
3589 | sk->sk_err = err; | |
3590 | if (err) | |
3591 | sk->sk_error_report(sk); | |
3592 | ||
3593 | return skb; | |
3594 | } | |
3595 | EXPORT_SYMBOL(sock_dequeue_err_skb); | |
3596 | ||
3597 | /** | |
3598 | * skb_clone_sk - create clone of skb, and take reference to socket | |
3599 | * @skb: the skb to clone | |
3600 | * | |
3601 | * This function creates a clone of a buffer that holds a reference on | |
3602 | * sk_refcnt. Buffers created via this function are meant to be | |
3603 | * returned using sock_queue_err_skb, or freed via kfree_skb. | |
3604 | * | |
3605 | * When passing buffers allocated with this function to sock_queue_err_skb | |
3606 | * it is necessary to wrap the call with sock_hold/sock_put in order to | |
3607 | * prevent the socket from being released prior to being enqueued on | |
3608 | * the sk_error_queue. | |
3609 | */ | |
3610 | struct sk_buff *skb_clone_sk(struct sk_buff *skb) | |
3611 | { | |
3612 | struct sock *sk = skb->sk; | |
3613 | struct sk_buff *clone; | |
3614 | ||
3615 | if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) | |
3616 | return NULL; | |
3617 | ||
3618 | clone = skb_clone(skb, GFP_ATOMIC); | |
3619 | if (!clone) { | |
3620 | sock_put(sk); | |
3621 | return NULL; | |
3622 | } | |
3623 | ||
3624 | clone->sk = sk; | |
3625 | clone->destructor = sock_efree; | |
3626 | ||
3627 | return clone; | |
3628 | } | |
3629 | EXPORT_SYMBOL(skb_clone_sk); | |
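/* Illustrative sketch (not part of the original file): returning a clone made
 * by skb_clone_sk() through the error queue, with the sock_hold()/sock_put()
 * wrapping that the comment above asks for; this mirrors the pattern used by
 * the timestamp completion helpers later in this file. The function name is
 * hypothetical and the block is guarded out of the build.
 */
#if 0	/* example only */
static void example_complete_clone(struct sk_buff *clone)
{
	struct sock *sk = clone->sk;
	int err;

	/* Keep the socket alive across the queueing operation */
	sock_hold(sk);

	err = sock_queue_err_skb(sk, clone);
	if (err)
		kfree_skb(clone);

	sock_put(sk);
}
#endif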
3630 | ||
3631 | static void __skb_complete_tx_timestamp(struct sk_buff *skb, | |
3632 | struct sock *sk, | |
3633 | int tstype) | |
3634 | { | |
3635 | struct sock_exterr_skb *serr; | |
3636 | int err; | |
3637 | ||
3638 | serr = SKB_EXT_ERR(skb); | |
3639 | memset(serr, 0, sizeof(*serr)); | |
3640 | serr->ee.ee_errno = ENOMSG; | |
3641 | serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; | |
3642 | serr->ee.ee_info = tstype; | |
3643 | if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { | |
3644 | serr->ee.ee_data = skb_shinfo(skb)->tskey; | |
3645 | if (sk->sk_protocol == IPPROTO_TCP) | |
3646 | serr->ee.ee_data -= sk->sk_tskey; | |
3647 | } | |
3648 | ||
3649 | err = sock_queue_err_skb(sk, skb); | |
3650 | ||
3651 | if (err) | |
3652 | kfree_skb(skb); | |
3653 | } | |
3654 | ||
3655 | static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) | |
3656 | { | |
3657 | bool ret; | |
3658 | ||
3659 | if (likely(sysctl_tstamp_allow_data || tsonly)) | |
3660 | return true; | |
3661 | ||
3662 | read_lock_bh(&sk->sk_callback_lock); | |
3663 | ret = sk->sk_socket && sk->sk_socket->file && | |
3664 | file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); | |
3665 | read_unlock_bh(&sk->sk_callback_lock); | |
3666 | return ret; | |
3667 | } | |
3668 | ||
3669 | void skb_complete_tx_timestamp(struct sk_buff *skb, | |
3670 | struct skb_shared_hwtstamps *hwtstamps) | |
3671 | { | |
3672 | struct sock *sk = skb->sk; | |
3673 | ||
3674 | if (!skb_may_tx_timestamp(sk, false)) | |
3675 | return; | |
3676 | ||
3677 | /* take a reference to prevent skb_orphan() from freeing the socket */ | |
3678 | sock_hold(sk); | |
3679 | ||
3680 | *skb_hwtstamps(skb) = *hwtstamps; | |
3681 | __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); | |
3682 | ||
3683 | sock_put(sk); | |
3684 | } | |
3685 | EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); | |
3686 | ||
3687 | void __skb_tstamp_tx(struct sk_buff *orig_skb, | |
3688 | struct skb_shared_hwtstamps *hwtstamps, | |
3689 | struct sock *sk, int tstype) | |
3690 | { | |
3691 | struct sk_buff *skb; | |
3692 | bool tsonly; | |
3693 | ||
3694 | if (!sk) | |
3695 | return; | |
3696 | ||
3697 | tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; | |
3698 | if (!skb_may_tx_timestamp(sk, tsonly)) | |
3699 | return; | |
3700 | ||
3701 | if (tsonly) | |
3702 | skb = alloc_skb(0, GFP_ATOMIC); | |
3703 | else | |
3704 | skb = skb_clone(orig_skb, GFP_ATOMIC); | |
3705 | if (!skb) | |
3706 | return; | |
3707 | ||
3708 | if (tsonly) { | |
3709 | skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags; | |
3710 | skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; | |
3711 | } | |
3712 | ||
3713 | if (hwtstamps) | |
3714 | *skb_hwtstamps(skb) = *hwtstamps; | |
3715 | else | |
3716 | skb->tstamp = ktime_get_real(); | |
3717 | ||
3718 | __skb_complete_tx_timestamp(skb, sk, tstype); | |
3719 | } | |
3720 | EXPORT_SYMBOL_GPL(__skb_tstamp_tx); | |
3721 | ||
3722 | void skb_tstamp_tx(struct sk_buff *orig_skb, | |
3723 | struct skb_shared_hwtstamps *hwtstamps) | |
3724 | { | |
3725 | return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, | |
3726 | SCM_TSTAMP_SND); | |
3727 | } | |
3728 | EXPORT_SYMBOL_GPL(skb_tstamp_tx); | |
3729 | ||
3730 | void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) | |
3731 | { | |
3732 | struct sock *sk = skb->sk; | |
3733 | struct sock_exterr_skb *serr; | |
3734 | int err; | |
3735 | ||
3736 | skb->wifi_acked_valid = 1; | |
3737 | skb->wifi_acked = acked; | |
3738 | ||
3739 | serr = SKB_EXT_ERR(skb); | |
3740 | memset(serr, 0, sizeof(*serr)); | |
3741 | serr->ee.ee_errno = ENOMSG; | |
3742 | serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; | |
3743 | ||
3744 | /* take a reference to prevent skb_orphan() from freeing the socket */ | |
3745 | sock_hold(sk); | |
3746 | ||
3747 | err = sock_queue_err_skb(sk, skb); | |
3748 | if (err) | |
3749 | kfree_skb(skb); | |
3750 | ||
3751 | sock_put(sk); | |
3752 | } | |
3753 | EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); | |
3754 | ||
3755 | /** | |
3756 | * skb_partial_csum_set - set up and verify partial csum values for packet | |
3757 | * @skb: the skb to set | |
3758 | * @start: the number of bytes after skb->data to start checksumming. | |
3759 | * @off: the offset from start to place the checksum. | |
3760 | * | |
3761 | * For untrusted partially-checksummed packets, we need to make sure the values | |
3762 | * for skb->csum_start and skb->csum_offset are valid so we don't oops. | |
3763 | * | |
3764 | * This function checks and sets those values and skb->ip_summed: if this | |
3765 | * returns false you should drop the packet. | |
3766 | */ | |
3767 | bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) | |
3768 | { | |
3769 | if (unlikely(start > skb_headlen(skb)) || | |
3770 | unlikely((int)start + off > skb_headlen(skb) - 2)) { | |
3771 | net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n", | |
3772 | start, off, skb_headlen(skb)); | |
3773 | return false; | |
3774 | } | |
3775 | skb->ip_summed = CHECKSUM_PARTIAL; | |
3776 | skb->csum_start = skb_headroom(skb) + start; | |
3777 | skb->csum_offset = off; | |
3778 | skb_set_transport_header(skb, start); | |
3779 | return true; | |
3780 | } | |
3781 | EXPORT_SYMBOL_GPL(skb_partial_csum_set); | |
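/* Illustrative sketch (not part of the original file): how a driver receiving
 * partially-checksummed UDP-over-IPv4 packets from an untrusted source might
 * use skb_partial_csum_set(), echoing skb_checksum_setup_ip() below. It
 * assumes skb->data and the network header both point at the IP header; the
 * function name is hypothetical and the block is guarded out of the build.
 */
#if 0	/* example only */
static bool example_setup_udp_csum(struct sk_buff *skb)
{
	unsigned int thoff = ip_hdrlen(skb);	/* transport header offset */

	/* Checksumming starts at the UDP header and the result is stored in
	 * the 'check' field; the caller should drop the packet on failure.
	 */
	return skb_partial_csum_set(skb, thoff,
				    offsetof(struct udphdr, check));
}
#endif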
3782 | ||
3783 | static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, | |
3784 | unsigned int max) | |
3785 | { | |
3786 | if (skb_headlen(skb) >= len) | |
3787 | return 0; | |
3788 | ||
3789 | /* If we need to pull up then pull up to the max, so we | |
3790 | * won't need to do it again. | |
3791 | */ | |
3792 | if (max > skb->len) | |
3793 | max = skb->len; | |
3794 | ||
3795 | if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) | |
3796 | return -ENOMEM; | |
3797 | ||
3798 | if (skb_headlen(skb) < len) | |
3799 | return -EPROTO; | |
3800 | ||
3801 | return 0; | |
3802 | } | |
3803 | ||
3804 | #define MAX_TCP_HDR_LEN (15 * 4) | |
3805 | ||
3806 | static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, | |
3807 | typeof(IPPROTO_IP) proto, | |
3808 | unsigned int off) | |
3809 | { | |
3810 | switch (proto) { | |
3811 | int err; | |
3812 | ||
3813 | case IPPROTO_TCP: | |
3814 | err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), | |
3815 | off + MAX_TCP_HDR_LEN); | |
3816 | if (!err && !skb_partial_csum_set(skb, off, | |
3817 | offsetof(struct tcphdr, | |
3818 | check))) | |
3819 | err = -EPROTO; | |
3820 | return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; | |
3821 | ||
3822 | case IPPROTO_UDP: | |
3823 | err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), | |
3824 | off + sizeof(struct udphdr)); | |
3825 | if (!err && !skb_partial_csum_set(skb, off, | |
3826 | offsetof(struct udphdr, | |
3827 | check))) | |
3828 | err = -EPROTO; | |
3829 | return err ? ERR_PTR(err) : &udp_hdr(skb)->check; | |
3830 | } | |
3831 | ||
3832 | return ERR_PTR(-EPROTO); | |
3833 | } | |
3834 | ||
3835 | /* This value should be large enough to cover a tagged ethernet header plus | |
3836 | * maximally sized IP and TCP or UDP headers. | |
3837 | */ | |
3838 | #define MAX_IP_HDR_LEN 128 | |
3839 | ||
3840 | static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) | |
3841 | { | |
3842 | unsigned int off; | |
3843 | bool fragment; | |
3844 | __sum16 *csum; | |
3845 | int err; | |
3846 | ||
3847 | fragment = false; | |
3848 | ||
3849 | err = skb_maybe_pull_tail(skb, | |
3850 | sizeof(struct iphdr), | |
3851 | MAX_IP_HDR_LEN); | |
3852 | if (err < 0) | |
3853 | goto out; | |
3854 | ||
3855 | if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF)) | |
3856 | fragment = true; | |
3857 | ||
3858 | off = ip_hdrlen(skb); | |
3859 | ||
3860 | err = -EPROTO; | |
3861 | ||
3862 | if (fragment) | |
3863 | goto out; | |
3864 | ||
3865 | csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); | |
3866 | if (IS_ERR(csum)) | |
3867 | return PTR_ERR(csum); | |
3868 | ||
3869 | if (recalculate) | |
3870 | *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, | |
3871 | ip_hdr(skb)->daddr, | |
3872 | skb->len - off, | |
3873 | ip_hdr(skb)->protocol, 0); | |
3874 | err = 0; | |
3875 | ||
3876 | out: | |
3877 | return err; | |
3878 | } | |
3879 | ||
3880 | /* This value should be large enough to cover a tagged ethernet header plus | |
3881 | * an IPv6 header, all options, and a maximal TCP or UDP header. | |
3882 | */ | |
3883 | #define MAX_IPV6_HDR_LEN 256 | |
3884 | ||
3885 | #define OPT_HDR(type, skb, off) \ | |
3886 | (type *)(skb_network_header(skb) + (off)) | |
3887 | ||
3888 | static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) | |
3889 | { | |
3890 | int err; | |
3891 | u8 nexthdr; | |
3892 | unsigned int off; | |
3893 | unsigned int len; | |
3894 | bool fragment; | |
3895 | bool done; | |
3896 | __sum16 *csum; | |
3897 | ||
3898 | fragment = false; | |
3899 | done = false; | |
3900 | ||
3901 | off = sizeof(struct ipv6hdr); | |
3902 | ||
3903 | err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); | |
3904 | if (err < 0) | |
3905 | goto out; | |
3906 | ||
3907 | nexthdr = ipv6_hdr(skb)->nexthdr; | |
3908 | ||
3909 | len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); | |
3910 | while (off <= len && !done) { | |
3911 | switch (nexthdr) { | |
3912 | case IPPROTO_DSTOPTS: | |
3913 | case IPPROTO_HOPOPTS: | |
3914 | case IPPROTO_ROUTING: { | |
3915 | struct ipv6_opt_hdr *hp; | |
3916 | ||
3917 | err = skb_maybe_pull_tail(skb, | |
3918 | off + | |
3919 | sizeof(struct ipv6_opt_hdr), | |
3920 | MAX_IPV6_HDR_LEN); | |
3921 | if (err < 0) | |
3922 | goto out; | |
3923 | ||
3924 | hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); | |
3925 | nexthdr = hp->nexthdr; | |
3926 | off += ipv6_optlen(hp); | |
3927 | break; | |
3928 | } | |
3929 | case IPPROTO_AH: { | |
3930 | struct ip_auth_hdr *hp; | |
3931 | ||
3932 | err = skb_maybe_pull_tail(skb, | |
3933 | off + | |
3934 | sizeof(struct ip_auth_hdr), | |
3935 | MAX_IPV6_HDR_LEN); | |
3936 | if (err < 0) | |
3937 | goto out; | |
3938 | ||
3939 | hp = OPT_HDR(struct ip_auth_hdr, skb, off); | |
3940 | nexthdr = hp->nexthdr; | |
3941 | off += ipv6_authlen(hp); | |
3942 | break; | |
3943 | } | |
3944 | case IPPROTO_FRAGMENT: { | |
3945 | struct frag_hdr *hp; | |
3946 | ||
3947 | err = skb_maybe_pull_tail(skb, | |
3948 | off + | |
3949 | sizeof(struct frag_hdr), | |
3950 | MAX_IPV6_HDR_LEN); | |
3951 | if (err < 0) | |
3952 | goto out; | |
3953 | ||
3954 | hp = OPT_HDR(struct frag_hdr, skb, off); | |
3955 | ||
3956 | if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) | |
3957 | fragment = true; | |
3958 | ||
3959 | nexthdr = hp->nexthdr; | |
3960 | off += sizeof(struct frag_hdr); | |
3961 | break; | |
3962 | } | |
3963 | default: | |
3964 | done = true; | |
3965 | break; | |
3966 | } | |
3967 | } | |
3968 | ||
3969 | err = -EPROTO; | |
3970 | ||
3971 | if (!done || fragment) | |
3972 | goto out; | |
3973 | ||
3974 | csum = skb_checksum_setup_ip(skb, nexthdr, off); | |
3975 | if (IS_ERR(csum)) | |
3976 | return PTR_ERR(csum); | |
3977 | ||
3978 | if (recalculate) | |
3979 | *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, | |
3980 | &ipv6_hdr(skb)->daddr, | |
3981 | skb->len - off, nexthdr, 0); | |
3982 | err = 0; | |
3983 | ||
3984 | out: | |
3985 | return err; | |
3986 | } | |
3987 | ||
3988 | /** | |
3989 | * skb_checksum_setup - set up partial checksum offset | |
3990 | * @skb: the skb to set up | |
3991 | * @recalculate: if true the pseudo-header checksum will be recalculated | |
3992 | */ | |
3993 | int skb_checksum_setup(struct sk_buff *skb, bool recalculate) | |
3994 | { | |
3995 | int err; | |
3996 | ||
3997 | switch (skb->protocol) { | |
3998 | case htons(ETH_P_IP): | |
3999 | err = skb_checksum_setup_ipv4(skb, recalculate); | |
4000 | break; | |
4001 | ||
4002 | case htons(ETH_P_IPV6): | |
4003 | err = skb_checksum_setup_ipv6(skb, recalculate); | |
4004 | break; | |
4005 | ||
4006 | default: | |
4007 | err = -EPROTO; | |
4008 | break; | |
4009 | } | |
4010 | ||
4011 | return err; | |
4012 | } | |
4013 | EXPORT_SYMBOL(skb_checksum_setup); | |
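/* Illustrative sketch (not part of the original file): a driver receiving
 * packets from an untrusted backend (as xen-netback does) could validate the
 * claimed partial checksum like this. The function name and csum_blank flag
 * are hypothetical; guarded out of the build.
 */
#if 0	/* example only */
static int example_fixup_rx_csum(struct sk_buff *skb, bool csum_blank)
{
	/* If the sender left the checksum blank, the pseudo-header sum must
	 * be recalculated before the packet goes up the stack.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		return skb_checksum_setup(skb, csum_blank);

	return 0;
}
#endif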
4014 | ||
4015 | /** | |
4016 | * skb_checksum_maybe_trim - maybe trims the given skb | |
4017 | * @skb: the skb to check | |
4018 | * @transport_len: the data length beyond the network header | |
4019 | * | |
4020 | * Checks whether the given skb has data beyond the given transport length. | |
4021 | * If so, returns a cloned skb trimmed to this transport length. | |
4022 | * Otherwise returns the provided skb. Returns NULL in error cases | |
4023 | * (e.g. transport_len exceeds skb length or out-of-memory). | |
4024 | * | |
4025 | * Caller needs to set the skb transport header and release the returned skb. | |
4026 | * Provided skb is consumed. | |
4027 | */ | |
4028 | static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, | |
4029 | unsigned int transport_len) | |
4030 | { | |
4031 | struct sk_buff *skb_chk; | |
4032 | unsigned int len = skb_transport_offset(skb) + transport_len; | |
4033 | int ret; | |
4034 | ||
4035 | if (skb->len < len) { | |
4036 | kfree_skb(skb); | |
4037 | return NULL; | |
4038 | } else if (skb->len == len) { | |
4039 | return skb; | |
4040 | } | |
4041 | ||
4042 | skb_chk = skb_clone(skb, GFP_ATOMIC); | |
4043 | kfree_skb(skb); | |
4044 | ||
4045 | if (!skb_chk) | |
4046 | return NULL; | |
4047 | ||
4048 | ret = pskb_trim_rcsum(skb_chk, len); | |
4049 | if (ret) { | |
4050 | kfree_skb(skb_chk); | |
4051 | return NULL; | |
4052 | } | |
4053 | ||
4054 | return skb_chk; | |
4055 | } | |
4056 | ||
4057 | /** | |
4058 | * skb_checksum_trimmed - validate checksum of an skb | |
4059 | * @skb: the skb to check | |
4060 | * @transport_len: the data length beyond the network header | |
4061 | * @skb_chkf: checksum function to use | |
4062 | * | |
4063 | * Applies the given checksum function skb_chkf to the provided skb. | |
4064 | * Returns a checked and maybe trimmed skb. Returns NULL on error. | |
4065 | * | |
4066 | * If the skb has data beyond the given transport length, then a | |
4067 | * trimmed & cloned skb is checked and returned. | |
4068 | * | |
4069 | * Caller needs to set the skb transport header and release the returned skb. | |
4070 | * Provided skb is consumed. | |
4071 | */ | |
4072 | struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, | |
4073 | unsigned int transport_len, | |
4074 | __sum16(*skb_chkf)(struct sk_buff *skb)) | |
4075 | { | |
4076 | struct sk_buff *skb_chk; | |
4077 | unsigned int offset = skb_transport_offset(skb); | |
4078 | __sum16 ret; | |
4079 | ||
4080 | skb_chk = skb_checksum_maybe_trim(skb, transport_len); | |
4081 | if (!skb_chk) | |
4082 | return NULL; | |
4083 | ||
4084 | if (!pskb_may_pull(skb_chk, offset)) { | |
4085 | kfree_skb(skb_chk); | |
4086 | return NULL; | |
4087 | } | |
4088 | ||
4089 | __skb_pull(skb_chk, offset); | |
4090 | ret = skb_chkf(skb_chk); | |
4091 | __skb_push(skb_chk, offset); | |
4092 | ||
4093 | if (ret) { | |
4094 | kfree_skb(skb_chk); | |
4095 | return NULL; | |
4096 | } | |
4097 | ||
4098 | return skb_chk; | |
4099 | } | |
4100 | EXPORT_SYMBOL(skb_checksum_trimmed); | |
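/* Illustrative sketch (not part of the original file): validating an
 * IGMP-style checksum (the one's complement sum over the whole transport
 * payload must fold to zero) with skb_checksum_trimmed(). Both function
 * names are hypothetical; guarded out of the build.
 */
#if 0	/* example only */
static __sum16 example_simple_csum(struct sk_buff *skb)
{
	/* Called with skb->data pulled to the transport header;
	 * a non-zero result means the checksum is bad.
	 */
	return csum_fold(skb_checksum(skb, 0, skb->len, 0));
}

static struct sk_buff *example_check_transport(struct sk_buff *skb,
					       unsigned int transport_len)
{
	/* Consumes @skb; returns a checked (and possibly trimmed) skb,
	 * or NULL if the checksum is wrong or memory ran out.
	 */
	return skb_checksum_trimmed(skb, transport_len, example_simple_csum);
}
#endif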
4101 | ||
4102 | void __skb_warn_lro_forwarding(const struct sk_buff *skb) | |
4103 | { | |
4104 | net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", | |
4105 | skb->dev->name); | |
4106 | } | |
4107 | EXPORT_SYMBOL(__skb_warn_lro_forwarding); | |
4108 | ||
4109 | void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) | |
4110 | { | |
4111 | if (head_stolen) { | |
4112 | skb_release_head_state(skb); | |
4113 | kmem_cache_free(skbuff_head_cache, skb); | |
4114 | } else { | |
4115 | __kfree_skb(skb); | |
4116 | } | |
4117 | } | |
4118 | EXPORT_SYMBOL(kfree_skb_partial); | |
4119 | ||
4120 | /** | |
4121 | * skb_try_coalesce - try to merge skb to prior one | |
4122 | * @to: prior buffer | |
4123 | * @from: buffer to add | |
4124 | * @fragstolen: set to true if @from's head was stolen as a page fragment | |
4125 | * @delta_truesize: by how much @to's truesize grew | |
4126 | */ | |
4127 | bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, | |
4128 | bool *fragstolen, int *delta_truesize) | |
4129 | { | |
4130 | int i, delta, len = from->len; | |
4131 | ||
4132 | *fragstolen = false; | |
4133 | ||
4134 | if (skb_cloned(to)) | |
4135 | return false; | |
4136 | ||
4137 | if (len <= skb_tailroom(to)) { | |
4138 | if (len) | |
4139 | BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); | |
4140 | *delta_truesize = 0; | |
4141 | return true; | |
4142 | } | |
4143 | ||
4144 | if (skb_has_frag_list(to) || skb_has_frag_list(from)) | |
4145 | return false; | |
4146 | ||
4147 | if (skb_headlen(from) != 0) { | |
4148 | struct page *page; | |
4149 | unsigned int offset; | |
4150 | ||
4151 | if (skb_shinfo(to)->nr_frags + | |
4152 | skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) | |
4153 | return false; | |
4154 | ||
4155 | if (skb_head_is_locked(from)) | |
4156 | return false; | |
4157 | ||
4158 | delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); | |
4159 | ||
4160 | page = virt_to_head_page(from->head); | |
4161 | offset = from->data - (unsigned char *)page_address(page); | |
4162 | ||
4163 | skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, | |
4164 | page, offset, skb_headlen(from)); | |
4165 | *fragstolen = true; | |
4166 | } else { | |
4167 | if (skb_shinfo(to)->nr_frags + | |
4168 | skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) | |
4169 | return false; | |
4170 | ||
4171 | delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); | |
4172 | } | |
4173 | ||
4174 | WARN_ON_ONCE(delta < len); | |
4175 | ||
4176 | memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, | |
4177 | skb_shinfo(from)->frags, | |
4178 | skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); | |
4179 | skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; | |
4180 | ||
4181 | if (!skb_cloned(from)) | |
4182 | skb_shinfo(from)->nr_frags = 0; | |
4183 | ||
4184 | /* if the skb is not cloned this does nothing | |
4185 | * since we set nr_frags to 0. | |
4186 | */ | |
4187 | for (i = 0; i < skb_shinfo(from)->nr_frags; i++) | |
4188 | skb_frag_ref(from, i); | |
4189 | ||
4190 | to->truesize += delta; | |
4191 | to->len += len; | |
4192 | to->data_len += len; | |
4193 | ||
4194 | *delta_truesize = delta; | |
4195 | return true; | |
4196 | } | |
4197 | EXPORT_SYMBOL(skb_try_coalesce); | |
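/* Illustrative sketch (not part of the original file): how a receive path
 * might merge a new buffer into the tail of a queue, in the spirit of TCP's
 * tcp_try_coalesce(). The function name is hypothetical; guarded out of the
 * build.
 */
#if 0	/* example only */
static bool example_queue_coalesce(struct sock *sk, struct sk_buff *tail,
				   struct sk_buff *skb)
{
	bool fragstolen;
	int delta;

	if (!skb_try_coalesce(tail, skb, &fragstolen, &delta))
		return false;

	/* The data now lives in @tail: charge the growth to the socket and
	 * release @skb, keeping its head alive if the pages were stolen.
	 */
	atomic_add(delta, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, delta);
	kfree_skb_partial(skb, fragstolen);
	return true;
}
#endif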
4198 | ||
4199 | /** | |
4200 | * skb_scrub_packet - scrub an skb | |
4201 | * | |
4202 | * @skb: buffer to clean | |
4203 | * @xnet: packet is crossing netns | |
4204 | * | |
4205 | * skb_scrub_packet can be used after encapsulating or decapsulating a packet | |
4206 | * into/from a tunnel. Some information has to be cleared during these | |
4207 | * operations. | |
4208 | * skb_scrub_packet can also be used to clean an skb before injecting it into | |
4209 | * another namespace (@xnet == true). We have to clear all information in the | |
4210 | * skb that could impact namespace isolation. | |
4211 | */ | |
4212 | void skb_scrub_packet(struct sk_buff *skb, bool xnet) | |
4213 | { | |
4214 | skb->tstamp.tv64 = 0; | |
4215 | skb->pkt_type = PACKET_HOST; | |
4216 | skb->skb_iif = 0; | |
4217 | skb->ignore_df = 0; | |
4218 | skb_dst_drop(skb); | |
4219 | skb_sender_cpu_clear(skb); | |
4220 | secpath_reset(skb); | |
4221 | nf_reset(skb); | |
4222 | nf_reset_trace(skb); | |
4223 | ||
4224 | if (!xnet) | |
4225 | return; | |
4226 | ||
4227 | skb_orphan(skb); | |
4228 | skb->mark = 0; | |
4229 | } | |
4230 | EXPORT_SYMBOL_GPL(skb_scrub_packet); | |
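/* Illustrative sketch (not part of the original file): a tunnel receive path
 * scrubbing an inner packet before handing it to a device that may live in a
 * different network namespace. The function and device parameter names are
 * hypothetical; guarded out of the build.
 */
#if 0	/* example only */
static void example_tunnel_rx_scrub(struct sk_buff *skb,
				    struct net_device *tunnel_dev)
{
	/* Clear state that must not survive decapsulation, and additionally
	 * orphan the skb and clear its mark if we are crossing netns.
	 */
	skb_scrub_packet(skb, !net_eq(dev_net(skb->dev), dev_net(tunnel_dev)));
	skb->dev = tunnel_dev;
}
#endif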
4231 | ||
4232 | /** | |
4233 | * skb_gso_transport_seglen - Return length of individual segments of a gso packet | |
4234 | * | |
4235 | * @skb: GSO skb | |
4236 | * | |
4237 | * skb_gso_transport_seglen is used to determine the real size of the | |
4238 | * individual segments, including Layer4 headers (TCP/UDP). | |
4239 | * | |
4240 | * The MAC/L2 or network (IP, IPv6) headers are not accounted for. | |
4241 | */ | |
4242 | unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) | |
4243 | { | |
4244 | const struct skb_shared_info *shinfo = skb_shinfo(skb); | |
4245 | unsigned int thlen = 0; | |
4246 | ||
4247 | if (skb->encapsulation) { | |
4248 | thlen = skb_inner_transport_header(skb) - | |
4249 | skb_transport_header(skb); | |
4250 | ||
4251 | if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) | |
4252 | thlen += inner_tcp_hdrlen(skb); | |
4253 | } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { | |
4254 | thlen = tcp_hdrlen(skb); | |
4255 | } | |
4256 | /* UFO sets gso_size to the size of the fragmentation | |
4257 | * payload, i.e. the size of the L4 (UDP) header is already | |
4258 | * accounted for. | |
4259 | */ | |
4260 | return thlen + shinfo->gso_size; | |
4261 | } | |
4262 | EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); | |
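/* Illustrative sketch (not part of the original file): checking whether the
 * segments a GSO skb will be split into fit a given MTU, similar to the
 * forwarding-path checks built on this helper. The function name is
 * hypothetical; guarded out of the build.
 */
#if 0	/* example only */
static bool example_gso_fits_mtu(const struct sk_buff *skb, unsigned int mtu)
{
	/* Per-segment L4 header + payload, plus the network header bytes */
	unsigned int seglen = skb_network_header_len(skb) +
			      skb_gso_transport_seglen(skb);

	return seglen <= mtu;
}
#endif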
4263 | ||
4264 | static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) | |
4265 | { | |
4266 | if (skb_cow(skb, skb_headroom(skb)) < 0) { | |
4267 | kfree_skb(skb); | |
4268 | return NULL; | |
4269 | } | |
4270 | ||
4271 | memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); | |
4272 | skb->mac_header += VLAN_HLEN; | |
4273 | return skb; | |
4274 | } | |
4275 | ||
4276 | struct sk_buff *skb_vlan_untag(struct sk_buff *skb) | |
4277 | { | |
4278 | struct vlan_hdr *vhdr; | |
4279 | u16 vlan_tci; | |
4280 | ||
4281 | if (unlikely(skb_vlan_tag_present(skb))) { | |
4282 | /* vlan_tci is already set up, so leave this for another time */ | |
4283 | return skb; | |
4284 | } | |
4285 | ||
4286 | skb = skb_share_check(skb, GFP_ATOMIC); | |
4287 | if (unlikely(!skb)) | |
4288 | goto err_free; | |
4289 | ||
4290 | if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) | |
4291 | goto err_free; | |
4292 | ||
4293 | vhdr = (struct vlan_hdr *)skb->data; | |
4294 | vlan_tci = ntohs(vhdr->h_vlan_TCI); | |
4295 | __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); | |
4296 | ||
4297 | skb_pull_rcsum(skb, VLAN_HLEN); | |
4298 | vlan_set_encap_proto(skb, vhdr); | |
4299 | ||
4300 | skb = skb_reorder_vlan_header(skb); | |
4301 | if (unlikely(!skb)) | |
4302 | goto err_free; | |
4303 | ||
4304 | skb_reset_network_header(skb); | |
4305 | skb_reset_transport_header(skb); | |
4306 | skb_reset_mac_len(skb); | |
4307 | ||
4308 | return skb; | |
4309 | ||
4310 | err_free: | |
4311 | kfree_skb(skb); | |
4312 | return NULL; | |
4313 | } | |
4314 | EXPORT_SYMBOL(skb_vlan_untag); | |
4315 | ||
4316 | int skb_ensure_writable(struct sk_buff *skb, int write_len) | |
4317 | { | |
4318 | if (!pskb_may_pull(skb, write_len)) | |
4319 | return -ENOMEM; | |
4320 | ||
4321 | if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) | |
4322 | return 0; | |
4323 | ||
4324 | return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); | |
4325 | } | |
4326 | EXPORT_SYMBOL(skb_ensure_writable); | |
4327 | ||
4328 | /* remove VLAN header from packet and update csum accordingly. */ | |
4329 | static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) | |
4330 | { | |
4331 | struct vlan_hdr *vhdr; | |
4332 | unsigned int offset = skb->data - skb_mac_header(skb); | |
4333 | int err; | |
4334 | ||
4335 | __skb_push(skb, offset); | |
4336 | err = skb_ensure_writable(skb, VLAN_ETH_HLEN); | |
4337 | if (unlikely(err)) | |
4338 | goto pull; | |
4339 | ||
4340 | skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); | |
4341 | ||
4342 | vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); | |
4343 | *vlan_tci = ntohs(vhdr->h_vlan_TCI); | |
4344 | ||
4345 | memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); | |
4346 | __skb_pull(skb, VLAN_HLEN); | |
4347 | ||
4348 | vlan_set_encap_proto(skb, vhdr); | |
4349 | skb->mac_header += VLAN_HLEN; | |
4350 | ||
4351 | if (skb_network_offset(skb) < ETH_HLEN) | |
4352 | skb_set_network_header(skb, ETH_HLEN); | |
4353 | ||
4354 | skb_reset_mac_len(skb); | |
4355 | pull: | |
4356 | __skb_pull(skb, offset); | |
4357 | ||
4358 | return err; | |
4359 | } | |
4360 | ||
4361 | int skb_vlan_pop(struct sk_buff *skb) | |
4362 | { | |
4363 | u16 vlan_tci; | |
4364 | __be16 vlan_proto; | |
4365 | int err; | |
4366 | ||
4367 | if (likely(skb_vlan_tag_present(skb))) { | |
4368 | skb->vlan_tci = 0; | |
4369 | } else { | |
4370 | if (unlikely((skb->protocol != htons(ETH_P_8021Q) && | |
4371 | skb->protocol != htons(ETH_P_8021AD)) || | |
4372 | skb->len < VLAN_ETH_HLEN)) | |
4373 | return 0; | |
4374 | ||
4375 | err = __skb_vlan_pop(skb, &vlan_tci); | |
4376 | if (err) | |
4377 | return err; | |
4378 | } | |
4379 | /* move next vlan tag to hw accel tag */ | |
4380 | if (likely((skb->protocol != htons(ETH_P_8021Q) && | |
4381 | skb->protocol != htons(ETH_P_8021AD)) || | |
4382 | skb->len < VLAN_ETH_HLEN)) | |
4383 | return 0; | |
4384 | ||
4385 | vlan_proto = skb->protocol; | |
4386 | err = __skb_vlan_pop(skb, &vlan_tci); | |
4387 | if (unlikely(err)) | |
4388 | return err; | |
4389 | ||
4390 | __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); | |
4391 | return 0; | |
4392 | } | |
4393 | EXPORT_SYMBOL(skb_vlan_pop); | |
4394 | ||
4395 | int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) | |
4396 | { | |
4397 | if (skb_vlan_tag_present(skb)) { | |
4398 | unsigned int offset = skb->data - skb_mac_header(skb); | |
4399 | int err; | |
4400 | ||
4401 | /* __vlan_insert_tag expects skb->data to point to the mac header. | |
4402 | * So change skb->data before calling it and change it back to the | |
4403 | * original position later. | |
4404 | */ | |
4405 | __skb_push(skb, offset); | |
4406 | err = __vlan_insert_tag(skb, skb->vlan_proto, | |
4407 | skb_vlan_tag_get(skb)); | |
4408 | if (err) | |
4409 | return err; | |
4410 | skb->protocol = skb->vlan_proto; | |
4411 | skb->mac_len += VLAN_HLEN; | |
4412 | __skb_pull(skb, offset); | |
4413 | ||
4414 | if (skb->ip_summed == CHECKSUM_COMPLETE) | |
4415 | skb->csum = csum_add(skb->csum, csum_partial(skb->data | |
4416 | + (2 * ETH_ALEN), VLAN_HLEN, 0)); | |
4417 | } | |
4418 | __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); | |
4419 | return 0; | |
4420 | } | |
4421 | EXPORT_SYMBOL(skb_vlan_push); | |
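/* Illustrative sketch (not part of the original file): how a packet-modifying
 * action (in the spirit of openvswitch or act_vlan) might use the pop/push
 * pair above to replace a VLAN tag. The function name and new_vid parameter
 * are hypothetical; guarded out of the build.
 */
#if 0	/* example only */
static int example_retag_vlan(struct sk_buff *skb, u16 new_vid)
{
	int err;

	/* Drop whatever tag is present (accelerated or in the payload)... */
	err = skb_vlan_pop(skb);
	if (err)
		return err;

	/* ...and install the new one as an accelerated 802.1Q tag. */
	return skb_vlan_push(skb, htons(ETH_P_8021Q), new_vid);
}
#endif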
4422 | ||
4423 | /** | |
4424 | * alloc_skb_with_frags - allocate skb with page frags | |
4425 | * | |
4426 | * @header_len: size of linear part | |
4427 | * @data_len: needed length in frags | |
4428 | * @max_page_order: max page order desired. | |
4429 | * @errcode: pointer to error code if any | |
4430 | * @gfp_mask: allocation mask | |
4431 | * | |
4432 | * This can be used to allocate a paged skb, given a maximal order for frags. | |
4433 | */ | |
4434 | struct sk_buff *alloc_skb_with_frags(unsigned long header_len, | |
4435 | unsigned long data_len, | |
4436 | int max_page_order, | |
4437 | int *errcode, | |
4438 | gfp_t gfp_mask) | |
4439 | { | |
4440 | int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; | |
4441 | unsigned long chunk; | |
4442 | struct sk_buff *skb; | |
4443 | struct page *page; | |
4444 | gfp_t gfp_head; | |
4445 | int i; | |
4446 | ||
4447 | *errcode = -EMSGSIZE; | |
4448 | /* Note this test could be relaxed, if we succeed in allocating | |
4449 | * high order pages... | |
4450 | */ | |
4451 | if (npages > MAX_SKB_FRAGS) | |
4452 | return NULL; | |
4453 | ||
4454 | gfp_head = gfp_mask; | |
4455 | if (gfp_head & __GFP_WAIT) | |
4456 | gfp_head |= __GFP_REPEAT; | |
4457 | ||
4458 | *errcode = -ENOBUFS; | |
4459 | skb = alloc_skb(header_len, gfp_head); | |
4460 | if (!skb) | |
4461 | return NULL; | |
4462 | ||
4463 | skb->truesize += npages << PAGE_SHIFT; | |
4464 | ||
4465 | for (i = 0; npages > 0; i++) { | |
4466 | int order = max_page_order; | |
4467 | ||
4468 | while (order) { | |
4469 | if (npages >= 1 << order) { | |
4470 | page = alloc_pages((gfp_mask & ~__GFP_WAIT) | | |
4471 | __GFP_COMP | | |
4472 | __GFP_NOWARN | | |
4473 | __GFP_NORETRY, | |
4474 | order); | |
4475 | if (page) | |
4476 | goto fill_page; | |
4477 | /* Do not retry other high order allocations */ | |
4478 | order = 1; | |
4479 | max_page_order = 0; | |
4480 | } | |
4481 | order--; | |
4482 | } | |
4483 | page = alloc_page(gfp_mask); | |
4484 | if (!page) | |
4485 | goto failure; | |
4486 | fill_page: | |
4487 | chunk = min_t(unsigned long, data_len, | |
4488 | PAGE_SIZE << order); | |
4489 | skb_fill_page_desc(skb, i, page, 0, chunk); | |
4490 | data_len -= chunk; | |
4491 | npages -= 1 << order; | |
4492 | } | |
4493 | return skb; | |
4494 | ||
4495 | failure: | |
4496 | kfree_skb(skb); | |
4497 | return NULL; | |
4498 | } | |
4499 | EXPORT_SYMBOL(alloc_skb_with_frags); |
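/* Illustrative sketch (not part of the original file): allocating a mostly
 * paged skb for a large datagram, roughly the way sock_alloc_send_pskb() and
 * its callers use this helper. The function name, the PAGE_SIZE split of
 * linear vs. paged data and the PAGE_ALLOC_COSTLY_ORDER cap are illustrative
 * choices, not requirements; guarded out of the build.
 */
#if 0	/* example only */
static struct sk_buff *example_alloc_datagram(size_t size, gfp_t gfp)
{
	unsigned long header_len = min_t(size_t, size, PAGE_SIZE);
	unsigned long data_len = size - header_len;
	struct sk_buff *skb;
	int err;

	skb = alloc_skb_with_frags(header_len, data_len,
				   PAGE_ALLOC_COSTLY_ORDER, &err, gfp);
	if (!skb)
		return ERR_PTR(err);

	/* The frags are attached but not yet counted in the skb length;
	 * account for them the way datagram senders do before copying in
	 * the payload.
	 */
	skb_put(skb, header_len);
	skb->data_len = data_len;
	skb->len += data_len;
	return skb;
}
#endif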