]> Git Repo - linux.git/blame - net/packet/af_packet.c
Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[linux.git] / net / packet / af_packet.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <[email protected]>
10 * Alan Cox, <[email protected]>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <[email protected]>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
7c0f6ba6 76#include <linux/uaccess.h>
1da177e4
LT
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
b0138408 91#include <linux/percpu.h>
1da177e4
LT
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
47dceb8e 95#include <linux/bpf.h>
719c44d3 96#include <net/compat.h>
1da177e4 97
2787b04b
PE
98#include "internal.h"
99
1da177e4
LT
100/*
101 Assumptions:
102 - if device has no dev->hard_header routine, it adds and removes ll header
103 inside itself. In this case ll header is invisible outside of device,
104 but higher levels still should reserve dev->hard_header_len.
105 Some devices are enough clever to reallocate skb, when header
106 will not fit to reserved space (tunnel), another ones are silly
107 (PPP).
108 - packet socket receives packets with pulled ll header,
109 so that SOCK_RAW should push it back.
110
111On receive:
112-----------
113
114Incoming, dev->hard_header!=NULL
b0e380b1
ACM
115 mac_header -> ll header
116 data -> data
1da177e4
LT
117
118Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
119 mac_header -> ll header
120 data -> ll header
1da177e4
LT
121
122Incoming, dev->hard_header==NULL
b0e380b1
ACM
123 mac_header -> UNKNOWN position. It is very likely, that it points to ll
124 header. PPP makes it, that is wrong, because introduce
db0c58f9 125 assymetry between rx and tx paths.
b0e380b1 126 data -> data
1da177e4
LT
127
128Outgoing, dev->hard_header==NULL
b0e380b1
ACM
129 mac_header -> data. ll header is still not built!
130 data -> data
1da177e4
LT
131
132Resume
133 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
134
135
136On transmit:
137------------
138
139dev->hard_header != NULL
b0e380b1
ACM
140 mac_header -> ll header
141 data -> ll header
1da177e4
LT
142
143dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
144 mac_header -> data
145 data -> data
1da177e4
LT
146
147 We should set nh.raw on output to correct posistion,
148 packet classifier depends on it.
149 */
150
1da177e4
LT
151/* Private packet socket structures. */
152
0fb375fb
EB
153/* identical to struct packet_mreq except it has
154 * a longer address field.
155 */
40d4e3df 156struct packet_mreq_max {
0fb375fb
EB
157 int mr_ifindex;
158 unsigned short mr_type;
159 unsigned short mr_alen;
160 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 161};
a2efcfa0 162
184f489e
DB
163union tpacket_uhdr {
164 struct tpacket_hdr *h1;
165 struct tpacket2_hdr *h2;
166 struct tpacket3_hdr *h3;
167 void *raw;
168};
169
f6fb8f10 170static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
171 int closing, int tx_ring);
172
f6fb8f10 173#define V3_ALIGNMENT (8)
174
bc59ba39 175#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 176
177#define BLK_PLUS_PRIV(sz_of_priv) \
178 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f 188struct packet_sock;
77f65ebd
WB
189static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 191
f6fb8f10 192static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
194 int status);
195static void packet_increment_head(struct packet_ring_buffer *buff);
878cd3ba 196static int prb_curr_blk_in_use(struct tpacket_block_desc *);
bc59ba39 197static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 198 struct packet_sock *);
bc59ba39 199static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *, unsigned int status);
bc59ba39 201static int prb_queue_frozen(struct tpacket_kbdq_core *);
202static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
17bfd8c8 204static void prb_retire_rx_blk_timer_expired(struct timer_list *);
bc59ba39 205static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
bc59ba39 206static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
207static void prb_clear_rxhash(struct tpacket_kbdq_core *,
208 struct tpacket3_hdr *);
209static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
210 struct tpacket3_hdr *);
1da177e4 211static void packet_flush_mclist(struct sock *sk);
865b03f2 212static u16 packet_pick_tx_queue(struct sk_buff *skb);
1da177e4 213
ffbc6111 214struct packet_skb_cb {
ffbc6111
HX
215 union {
216 struct sockaddr_pkt pkt;
2472d761
EB
217 union {
218 /* Trick: alias skb original length with
219 * ll.sll_family and ll.protocol in order
220 * to save room.
221 */
222 unsigned int origlen;
223 struct sockaddr_ll ll;
224 };
ffbc6111
HX
225 } sa;
226};
227
d3869efe
DW
228#define vio_le() virtio_legacy_is_little_endian()
229
ffbc6111 230#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 231
bc59ba39 232#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 233#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 234 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 235#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 236 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 237#define GET_NEXT_PRB_BLK_NUM(x) \
238 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
239 ((x)->kactive_blk_num+1) : 0)
240
dc99f600
DM
241static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
242static void __fanout_link(struct sock *sk, struct packet_sock *po);
243
d346a3fa
DB
244static int packet_direct_xmit(struct sk_buff *skb)
245{
865b03f2 246 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
d346a3fa
DB
247}
248
66e56cd4
DB
249static struct net_device *packet_cached_dev_get(struct packet_sock *po)
250{
251 struct net_device *dev;
252
253 rcu_read_lock();
254 dev = rcu_dereference(po->cached_dev);
255 if (likely(dev))
256 dev_hold(dev);
257 rcu_read_unlock();
258
259 return dev;
260}
261
262static void packet_cached_dev_assign(struct packet_sock *po,
263 struct net_device *dev)
264{
265 rcu_assign_pointer(po->cached_dev, dev);
266}
267
268static void packet_cached_dev_reset(struct packet_sock *po)
269{
270 RCU_INIT_POINTER(po->cached_dev, NULL);
271}
272
d346a3fa
DB
273static bool packet_use_direct_xmit(const struct packet_sock *po)
274{
275 return po->xmit == packet_direct_xmit;
276}
277
8ec56fc3
AD
278static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
279 struct net_device *sb_dev)
d346a3fa 280{
8ec56fc3 281 return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
d346a3fa
DB
282}
283
865b03f2 284static u16 packet_pick_tx_queue(struct sk_buff *skb)
0fd5d57b 285{
865b03f2 286 struct net_device *dev = skb->dev;
0fd5d57b
DB
287 const struct net_device_ops *ops = dev->netdev_ops;
288 u16 queue_index;
289
290 if (ops->ndo_select_queue) {
291 queue_index = ops->ndo_select_queue(dev, skb, NULL,
292 __packet_pick_tx_queue);
293 queue_index = netdev_cap_txqueue(dev, queue_index);
294 } else {
8ec56fc3 295 queue_index = __packet_pick_tx_queue(dev, skb, NULL);
0fd5d57b
DB
296 }
297
865b03f2 298 return queue_index;
0fd5d57b
DB
299}
300
a6361f0c 301/* __register_prot_hook must be invoked through register_prot_hook
ce06b03e
DM
302 * or from a context in which asynchronous accesses to the packet
303 * socket is not possible (packet_create()).
304 */
a6361f0c 305static void __register_prot_hook(struct sock *sk)
ce06b03e
DM
306{
307 struct packet_sock *po = pkt_sk(sk);
e40526cb 308
ce06b03e 309 if (!po->running) {
66e56cd4 310 if (po->fanout)
dc99f600 311 __fanout_link(sk, po);
66e56cd4 312 else
dc99f600 313 dev_add_pack(&po->prot_hook);
e40526cb 314
ce06b03e
DM
315 sock_hold(sk);
316 po->running = 1;
317 }
318}
319
a6361f0c
WB
320static void register_prot_hook(struct sock *sk)
321{
322 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
323 __register_prot_hook(sk);
324}
325
326/* If the sync parameter is true, we will temporarily drop
ce06b03e
DM
327 * the po->bind_lock and do a synchronize_net to make sure no
328 * asynchronous packet processing paths still refer to the elements
329 * of po->prot_hook. If the sync parameter is false, it is the
330 * callers responsibility to take care of this.
331 */
332static void __unregister_prot_hook(struct sock *sk, bool sync)
333{
334 struct packet_sock *po = pkt_sk(sk);
335
a6361f0c
WB
336 lockdep_assert_held_once(&po->bind_lock);
337
ce06b03e 338 po->running = 0;
66e56cd4
DB
339
340 if (po->fanout)
dc99f600 341 __fanout_unlink(sk, po);
66e56cd4 342 else
dc99f600 343 __dev_remove_pack(&po->prot_hook);
e40526cb 344
ce06b03e
DM
345 __sock_put(sk);
346
347 if (sync) {
348 spin_unlock(&po->bind_lock);
349 synchronize_net();
350 spin_lock(&po->bind_lock);
351 }
352}
353
354static void unregister_prot_hook(struct sock *sk, bool sync)
355{
356 struct packet_sock *po = pkt_sk(sk);
357
358 if (po->running)
359 __unregister_prot_hook(sk, sync);
360}
361
6e58040b 362static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
363{
364 if (is_vmalloc_addr(addr))
365 return vmalloc_to_page(addr);
366 return virt_to_page(addr);
367}
368
69e3c75f 369static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 370{
184f489e 371 union tpacket_uhdr h;
1da177e4 372
69e3c75f 373 h.raw = frame;
bbd6ef87
PM
374 switch (po->tp_version) {
375 case TPACKET_V1:
69e3c75f 376 h.h1->tp_status = status;
0af55bb5 377 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
378 break;
379 case TPACKET_V2:
69e3c75f 380 h.h2->tp_status = status;
0af55bb5 381 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 382 break;
f6fb8f10 383 case TPACKET_V3:
7f953ab2
SV
384 h.h3->tp_status = status;
385 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
386 break;
69e3c75f 387 default:
f6fb8f10 388 WARN(1, "TPACKET version not supported.\n");
69e3c75f 389 BUG();
bbd6ef87 390 }
69e3c75f
JB
391
392 smp_wmb();
bbd6ef87
PM
393}
394
69e3c75f 395static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 396{
184f489e 397 union tpacket_uhdr h;
bbd6ef87 398
69e3c75f
JB
399 smp_rmb();
400
bbd6ef87
PM
401 h.raw = frame;
402 switch (po->tp_version) {
403 case TPACKET_V1:
0af55bb5 404 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 405 return h.h1->tp_status;
bbd6ef87 406 case TPACKET_V2:
0af55bb5 407 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 408 return h.h2->tp_status;
f6fb8f10 409 case TPACKET_V3:
7f953ab2
SV
410 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
411 return h.h3->tp_status;
69e3c75f 412 default:
f6fb8f10 413 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
414 BUG();
415 return 0;
bbd6ef87 416 }
1da177e4 417}
69e3c75f 418
b9c32fb2
DB
419static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
420 unsigned int flags)
7a51384c
DB
421{
422 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
423
68a360e8
WB
424 if (shhwtstamps &&
425 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
426 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
427 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
428
429 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 430 return TP_STATUS_TS_SOFTWARE;
7a51384c 431
b9c32fb2 432 return 0;
7a51384c
DB
433}
434
b9c32fb2
DB
435static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
436 struct sk_buff *skb)
2e31396f
WB
437{
438 union tpacket_uhdr h;
439 struct timespec ts;
b9c32fb2 440 __u32 ts_status;
2e31396f 441
b9c32fb2
DB
442 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
443 return 0;
2e31396f
WB
444
445 h.raw = frame;
446 switch (po->tp_version) {
447 case TPACKET_V1:
448 h.h1->tp_sec = ts.tv_sec;
449 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
450 break;
451 case TPACKET_V2:
452 h.h2->tp_sec = ts.tv_sec;
453 h.h2->tp_nsec = ts.tv_nsec;
454 break;
455 case TPACKET_V3:
57ea884b
DB
456 h.h3->tp_sec = ts.tv_sec;
457 h.h3->tp_nsec = ts.tv_nsec;
458 break;
2e31396f
WB
459 default:
460 WARN(1, "TPACKET version not supported.\n");
461 BUG();
462 }
463
464 /* one flush is safe, as both fields always lie on the same cacheline */
465 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
466 smp_wmb();
b9c32fb2
DB
467
468 return ts_status;
2e31396f
WB
469}
470
69e3c75f
JB
471static void *packet_lookup_frame(struct packet_sock *po,
472 struct packet_ring_buffer *rb,
473 unsigned int position,
474 int status)
475{
476 unsigned int pg_vec_pos, frame_offset;
184f489e 477 union tpacket_uhdr h;
69e3c75f
JB
478
479 pg_vec_pos = position / rb->frames_per_block;
480 frame_offset = position % rb->frames_per_block;
481
0e3125c7
NH
482 h.raw = rb->pg_vec[pg_vec_pos].buffer +
483 (frame_offset * rb->frame_size);
69e3c75f
JB
484
485 if (status != __packet_get_status(po, h.raw))
486 return NULL;
487
488 return h.raw;
489}
490
eea49cc9 491static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
492 struct packet_ring_buffer *rb,
493 int status)
494{
495 return packet_lookup_frame(po, rb, rb->head, status);
496}
497
bc59ba39 498static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 499{
500 del_timer_sync(&pkc->retire_blk_timer);
501}
502
503static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 504 struct sk_buff_head *rb_queue)
505{
bc59ba39 506 struct tpacket_kbdq_core *pkc;
f6fb8f10 507
73d0fcf2 508 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 509
ec6f809f 510 spin_lock_bh(&rb_queue->lock);
f6fb8f10 511 pkc->delete_blk_timer = 1;
ec6f809f 512 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 513
514 prb_del_retire_blk_timer(pkc);
515}
516
e8e85cc5 517static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 518{
bc59ba39 519 struct tpacket_kbdq_core *pkc;
f6fb8f10 520
e8e85cc5 521 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
17bfd8c8
KC
522 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
523 0);
524 pkc->retire_blk_timer.expires = jiffies;
f6fb8f10 525}
526
527static int prb_calc_retire_blk_tmo(struct packet_sock *po,
528 int blk_size_in_bytes)
529{
530 struct net_device *dev;
531 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 532 struct ethtool_link_ksettings ecmd;
4bc71cb9 533 int err;
f6fb8f10 534
4bc71cb9
JP
535 rtnl_lock();
536 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
537 if (unlikely(!dev)) {
538 rtnl_unlock();
f6fb8f10 539 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 540 }
7cad1bac 541 err = __ethtool_get_link_ksettings(dev, &ecmd);
4bc71cb9
JP
542 rtnl_unlock();
543 if (!err) {
4bc71cb9
JP
544 /*
545 * If the link speed is so slow you don't really
546 * need to worry about perf anyways
547 */
7cad1bac
DD
548 if (ecmd.base.speed < SPEED_1000 ||
549 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 550 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 551 } else {
552 msec = 1;
7cad1bac 553 div = ecmd.base.speed / 1000;
f6fb8f10 554 }
555 }
556
557 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
558
559 if (div)
560 mbits /= div;
561
562 tmo = mbits * msec;
563
564 if (div)
565 return tmo+1;
566 return tmo;
567}
568
bc59ba39 569static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 570 union tpacket_req_u *req_u)
571{
572 p1->feature_req_word = req_u->req3.tp_feature_req_word;
573}
574
575static void init_prb_bdqc(struct packet_sock *po,
576 struct packet_ring_buffer *rb,
577 struct pgv *pg_vec,
e8e85cc5 578 union tpacket_req_u *req_u)
f6fb8f10 579{
22781a5b 580 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 581 struct tpacket_block_desc *pbd;
f6fb8f10 582
583 memset(p1, 0x0, sizeof(*p1));
584
585 p1->knxt_seq_num = 1;
586 p1->pkbdq = pg_vec;
bc59ba39 587 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 588 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 589 p1->kblk_size = req_u->req3.tp_block_size;
590 p1->knum_blocks = req_u->req3.tp_block_nr;
591 p1->hdrlen = po->tp_hdrlen;
592 p1->version = po->tp_version;
593 p1->last_kactive_blk_num = 0;
ee80fbf3 594 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 595 if (req_u->req3.tp_retire_blk_tov)
596 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
597 else
598 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
599 req_u->req3.tp_block_size);
600 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
601 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
602
dc808110 603 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 604 prb_init_ft_ops(p1, req_u);
e8e85cc5 605 prb_setup_retire_blk_timer(po);
f6fb8f10 606 prb_open_block(p1, pbd);
607}
608
609/* Do NOT update the last_blk_num first.
610 * Assumes sk_buff_head lock is held.
611 */
bc59ba39 612static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 613{
614 mod_timer(&pkc->retire_blk_timer,
615 jiffies + pkc->tov_in_jiffies);
616 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
617}
618
619/*
620 * Timer logic:
621 * 1) We refresh the timer only when we open a block.
622 * By doing this we don't waste cycles refreshing the timer
623 * on packet-by-packet basis.
624 *
625 * With a 1MB block-size, on a 1Gbps line, it will take
626 * i) ~8 ms to fill a block + ii) memcpy etc.
627 * In this cut we are not accounting for the memcpy time.
628 *
629 * So, if the user sets the 'tmo' to 10ms then the timer
630 * will never fire while the block is still getting filled
631 * (which is what we want). However, the user could choose
632 * to close a block early and that's fine.
633 *
634 * But when the timer does fire, we check whether or not to refresh it.
635 * Since the tmo granularity is in msecs, it is not too expensive
636 * to refresh the timer, lets say every '8' msecs.
637 * Either the user can set the 'tmo' or we can derive it based on
638 * a) line-speed and b) block-size.
639 * prb_calc_retire_blk_tmo() calculates the tmo.
640 *
641 */
17bfd8c8 642static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
f6fb8f10 643{
17bfd8c8
KC
644 struct packet_sock *po =
645 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
22781a5b 646 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 647 unsigned int frozen;
bc59ba39 648 struct tpacket_block_desc *pbd;
f6fb8f10 649
650 spin_lock(&po->sk.sk_receive_queue.lock);
651
652 frozen = prb_queue_frozen(pkc);
653 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
654
655 if (unlikely(pkc->delete_blk_timer))
656 goto out;
657
658 /* We only need to plug the race when the block is partially filled.
659 * tpacket_rcv:
660 * lock(); increment BLOCK_NUM_PKTS; unlock()
661 * copy_bits() is in progress ...
662 * timer fires on other cpu:
663 * we can't retire the current block because copy_bits
664 * is in progress.
665 *
666 */
667 if (BLOCK_NUM_PKTS(pbd)) {
668 while (atomic_read(&pkc->blk_fill_in_prog)) {
669 /* Waiting for skb_copy_bits to finish... */
670 cpu_relax();
671 }
672 }
673
674 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
675 if (!frozen) {
41a50d62
AD
676 if (!BLOCK_NUM_PKTS(pbd)) {
677 /* An empty block. Just refresh the timer. */
678 goto refresh_timer;
679 }
f6fb8f10 680 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
681 if (!prb_dispatch_next_block(pkc, po))
682 goto refresh_timer;
683 else
684 goto out;
685 } else {
686 /* Case 1. Queue was frozen because user-space was
687 * lagging behind.
688 */
878cd3ba 689 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 690 /*
691 * Ok, user-space is still behind.
692 * So just refresh the timer.
693 */
694 goto refresh_timer;
695 } else {
696 /* Case 2. queue was frozen,user-space caught up,
697 * now the link went idle && the timer fired.
698 * We don't have a block to close.So we open this
699 * block and restart the timer.
700 * opening a block thaws the queue,restarts timer
701 * Thawing/timer-refresh is a side effect.
702 */
703 prb_open_block(pkc, pbd);
704 goto out;
705 }
706 }
707 }
708
709refresh_timer:
710 _prb_refresh_rx_retire_blk_timer(pkc);
711
712out:
713 spin_unlock(&po->sk.sk_receive_queue.lock);
714}
715
eea49cc9 716static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 717 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 718{
719 /* Flush everything minus the block header */
720
721#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
722 u8 *start, *end;
723
724 start = (u8 *)pbd1;
725
726 /* Skip the block header(we know header WILL fit in 4K) */
727 start += PAGE_SIZE;
728
729 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
730 for (; start < end; start += PAGE_SIZE)
731 flush_dcache_page(pgv_to_page(start));
732
733 smp_wmb();
734#endif
735
736 /* Now update the block status. */
737
738 BLOCK_STATUS(pbd1) = status;
739
740 /* Flush the block header */
741
742#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
743 start = (u8 *)pbd1;
744 flush_dcache_page(pgv_to_page(start));
745
746 smp_wmb();
747#endif
748}
749
750/*
751 * Side effect:
752 *
753 * 1) flush the block
754 * 2) Increment active_blk_num
755 *
756 * Note:We DONT refresh the timer on purpose.
757 * Because almost always the next block will be opened.
758 */
bc59ba39 759static void prb_close_block(struct tpacket_kbdq_core *pkc1,
760 struct tpacket_block_desc *pbd1,
f6fb8f10 761 struct packet_sock *po, unsigned int stat)
762{
763 __u32 status = TP_STATUS_USER | stat;
764
765 struct tpacket3_hdr *last_pkt;
bc59ba39 766 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 767 struct sock *sk = &po->sk;
f6fb8f10 768
ee80fbf3 769 if (po->stats.stats3.tp_drops)
f6fb8f10 770 status |= TP_STATUS_LOSING;
771
772 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
773 last_pkt->tp_next_offset = 0;
774
775 /* Get the ts of the last pkt */
776 if (BLOCK_NUM_PKTS(pbd1)) {
777 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
778 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
779 } else {
41a50d62
AD
780 /* Ok, we tmo'd - so get the current time.
781 *
782 * It shouldn't really happen as we don't close empty
783 * blocks. See prb_retire_rx_blk_timer_expired().
784 */
f6fb8f10 785 struct timespec ts;
786 getnstimeofday(&ts);
787 h1->ts_last_pkt.ts_sec = ts.tv_sec;
788 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
789 }
790
791 smp_wmb();
792
793 /* Flush the block */
794 prb_flush_block(pkc1, pbd1, status);
795
da413eec
DC
796 sk->sk_data_ready(sk);
797
f6fb8f10 798 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
799}
800
eea49cc9 801static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 802{
803 pkc->reset_pending_on_curr_blk = 0;
804}
805
806/*
807 * Side effect of opening a block:
808 *
809 * 1) prb_queue is thawed.
810 * 2) retire_blk_timer is refreshed.
811 *
812 */
bc59ba39 813static void prb_open_block(struct tpacket_kbdq_core *pkc1,
814 struct tpacket_block_desc *pbd1)
f6fb8f10 815{
816 struct timespec ts;
bc59ba39 817 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 818
819 smp_rmb();
820
8da3056c
DB
821 /* We could have just memset this but we will lose the
822 * flexibility of making the priv area sticky
823 */
f6fb8f10 824
8da3056c
DB
825 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
826 BLOCK_NUM_PKTS(pbd1) = 0;
827 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 828
8da3056c
DB
829 getnstimeofday(&ts);
830
831 h1->ts_first_pkt.ts_sec = ts.tv_sec;
832 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 833
8da3056c
DB
834 pkc1->pkblk_start = (char *)pbd1;
835 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
836
837 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
838 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
839
840 pbd1->version = pkc1->version;
841 pkc1->prev = pkc1->nxt_offset;
842 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
843
844 prb_thaw_queue(pkc1);
845 _prb_refresh_rx_retire_blk_timer(pkc1);
846
847 smp_wmb();
f6fb8f10 848}
849
850/*
851 * Queue freeze logic:
852 * 1) Assume tp_block_nr = 8 blocks.
853 * 2) At time 't0', user opens Rx ring.
854 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
855 * 4) user-space is either sleeping or processing block '0'.
856 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
857 * it will close block-7,loop around and try to fill block '0'.
858 * call-flow:
859 * __packet_lookup_frame_in_block
860 * prb_retire_current_block()
861 * prb_dispatch_next_block()
862 * |->(BLOCK_STATUS == USER) evaluates to true
863 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
864 * 6) Now there are two cases:
865 * 6.1) Link goes idle right after the queue is frozen.
866 * But remember, the last open_block() refreshed the timer.
867 * When this timer expires,it will refresh itself so that we can
868 * re-open block-0 in near future.
869 * 6.2) Link is busy and keeps on receiving packets. This is a simple
870 * case and __packet_lookup_frame_in_block will check if block-0
871 * is free and can now be re-used.
872 */
eea49cc9 873static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 874 struct packet_sock *po)
875{
876 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 877 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 878}
879
880#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
881
882/*
883 * If the next block is free then we will dispatch it
884 * and return a good offset.
885 * Else, we will freeze the queue.
886 * So, caller must check the return value.
887 */
bc59ba39 888static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 889 struct packet_sock *po)
890{
bc59ba39 891 struct tpacket_block_desc *pbd;
f6fb8f10 892
893 smp_rmb();
894
895 /* 1. Get current block num */
896 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
897
898 /* 2. If this block is currently in_use then freeze the queue */
899 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
900 prb_freeze_queue(pkc, po);
901 return NULL;
902 }
903
904 /*
905 * 3.
906 * open this block and return the offset where the first packet
907 * needs to get stored.
908 */
909 prb_open_block(pkc, pbd);
910 return (void *)pkc->nxt_offset;
911}
912
bc59ba39 913static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 914 struct packet_sock *po, unsigned int status)
915{
bc59ba39 916 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 917
918 /* retire/close the current block */
919 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
920 /*
921 * Plug the case where copy_bits() is in progress on
922 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
923 * have space to copy the pkt in the current block and
924 * called prb_retire_current_block()
925 *
926 * We don't need to worry about the TMO case because
927 * the timer-handler already handled this case.
928 */
929 if (!(status & TP_STATUS_BLK_TMO)) {
930 while (atomic_read(&pkc->blk_fill_in_prog)) {
931 /* Waiting for skb_copy_bits to finish... */
932 cpu_relax();
933 }
934 }
935 prb_close_block(pkc, pbd, po, status);
936 return;
937 }
f6fb8f10 938}
939
878cd3ba 940static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 941{
942 return TP_STATUS_USER & BLOCK_STATUS(pbd);
943}
944
eea49cc9 945static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 946{
947 return pkc->reset_pending_on_curr_blk;
948}
949
eea49cc9 950static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 951{
bc59ba39 952 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 953 atomic_dec(&pkc->blk_fill_in_prog);
954}
955
eea49cc9 956static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 957 struct tpacket3_hdr *ppd)
958{
3958afa1 959 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 960}
961
eea49cc9 962static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 963 struct tpacket3_hdr *ppd)
964{
965 ppd->hv1.tp_rxhash = 0;
966}
967
eea49cc9 968static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 969 struct tpacket3_hdr *ppd)
970{
df8a39de
JP
971 if (skb_vlan_tag_present(pkc->skb)) {
972 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
973 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
974 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 975 } else {
9e67030a 976 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 977 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 978 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 979 }
980}
981
bc59ba39 982static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 983 struct tpacket3_hdr *ppd)
984{
a0cdfcf3 985 ppd->hv1.tp_padding = 0;
f6fb8f10 986 prb_fill_vlan_info(pkc, ppd);
987
988 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
989 prb_fill_rxhash(pkc, ppd);
990 else
991 prb_clear_rxhash(pkc, ppd);
992}
993
eea49cc9 994static void prb_fill_curr_block(char *curr,
bc59ba39 995 struct tpacket_kbdq_core *pkc,
996 struct tpacket_block_desc *pbd,
f6fb8f10 997 unsigned int len)
998{
999 struct tpacket3_hdr *ppd;
1000
1001 ppd = (struct tpacket3_hdr *)curr;
1002 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1003 pkc->prev = curr;
1004 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1005 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1006 BLOCK_NUM_PKTS(pbd) += 1;
1007 atomic_inc(&pkc->blk_fill_in_prog);
1008 prb_run_all_ft_ops(pkc, ppd);
1009}
1010
1011/* Assumes caller has the sk->rx_queue.lock */
1012static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1013 struct sk_buff *skb,
1014 int status,
1015 unsigned int len
1016 )
1017{
bc59ba39 1018 struct tpacket_kbdq_core *pkc;
1019 struct tpacket_block_desc *pbd;
f6fb8f10 1020 char *curr, *end;
1021
e3192690 1022 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1023 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1024
1025 /* Queue is frozen when user space is lagging behind */
1026 if (prb_queue_frozen(pkc)) {
1027 /*
1028 * Check if that last block which caused the queue to freeze,
1029 * is still in_use by user-space.
1030 */
878cd3ba 1031 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1032 /* Can't record this packet */
1033 return NULL;
1034 } else {
1035 /*
1036 * Ok, the block was released by user-space.
1037 * Now let's open that block.
1038 * opening a block also thaws the queue.
1039 * Thawing is a side effect.
1040 */
1041 prb_open_block(pkc, pbd);
1042 }
1043 }
1044
1045 smp_mb();
1046 curr = pkc->nxt_offset;
1047 pkc->skb = skb;
e3192690 1048 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1049
1050 /* first try the current block */
1051 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1052 prb_fill_curr_block(curr, pkc, pbd, len);
1053 return (void *)curr;
1054 }
1055
1056 /* Ok, close the current block */
1057 prb_retire_current_block(pkc, po, 0);
1058
1059 /* Now, try to dispatch the next block */
1060 curr = (char *)prb_dispatch_next_block(pkc, po);
1061 if (curr) {
1062 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1063 prb_fill_curr_block(curr, pkc, pbd, len);
1064 return (void *)curr;
1065 }
1066
1067 /*
1068 * No free blocks are available.user_space hasn't caught up yet.
1069 * Queue was just frozen and now this packet will get dropped.
1070 */
1071 return NULL;
1072}
1073
eea49cc9 1074static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1075 struct sk_buff *skb,
1076 int status, unsigned int len)
1077{
1078 char *curr = NULL;
1079 switch (po->tp_version) {
1080 case TPACKET_V1:
1081 case TPACKET_V2:
1082 curr = packet_lookup_frame(po, &po->rx_ring,
1083 po->rx_ring.head, status);
1084 return curr;
1085 case TPACKET_V3:
1086 return __packet_lookup_frame_in_block(po, skb, status, len);
1087 default:
1088 WARN(1, "TPACKET version not supported\n");
1089 BUG();
99aa3473 1090 return NULL;
f6fb8f10 1091 }
1092}
1093
eea49cc9 1094static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1095 struct packet_ring_buffer *rb,
77f65ebd 1096 unsigned int idx,
f6fb8f10 1097 int status)
1098{
bc59ba39 1099 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1100 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1101
1102 if (status != BLOCK_STATUS(pbd))
1103 return NULL;
1104 return pbd;
1105}
1106
eea49cc9 1107static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1108{
1109 unsigned int prev;
1110 if (rb->prb_bdqc.kactive_blk_num)
1111 prev = rb->prb_bdqc.kactive_blk_num-1;
1112 else
1113 prev = rb->prb_bdqc.knum_blocks-1;
1114 return prev;
1115}
1116
1117/* Assumes caller has held the rx_queue.lock */
eea49cc9 1118static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1119 struct packet_ring_buffer *rb,
1120 int status)
1121{
1122 unsigned int previous = prb_previous_blk_num(rb);
1123 return prb_lookup_block(po, rb, previous, status);
1124}
1125
eea49cc9 1126static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1127 struct packet_ring_buffer *rb,
1128 int status)
1129{
1130 if (po->tp_version <= TPACKET_V2)
1131 return packet_previous_frame(po, rb, status);
1132
1133 return __prb_previous_block(po, rb, status);
1134}
1135
eea49cc9 1136static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1137 struct packet_ring_buffer *rb)
1138{
1139 switch (po->tp_version) {
1140 case TPACKET_V1:
1141 case TPACKET_V2:
1142 return packet_increment_head(rb);
1143 case TPACKET_V3:
1144 default:
1145 WARN(1, "TPACKET version not supported.\n");
1146 BUG();
1147 return;
1148 }
1149}
1150
eea49cc9 1151static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1152 struct packet_ring_buffer *rb,
1153 int status)
1154{
1155 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1156 return packet_lookup_frame(po, rb, previous, status);
1157}
1158
eea49cc9 1159static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1160{
1161 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1162}
1163
b0138408
DB
1164static void packet_inc_pending(struct packet_ring_buffer *rb)
1165{
1166 this_cpu_inc(*rb->pending_refcnt);
1167}
1168
1169static void packet_dec_pending(struct packet_ring_buffer *rb)
1170{
1171 this_cpu_dec(*rb->pending_refcnt);
1172}
1173
1174static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1175{
1176 unsigned int refcnt = 0;
1177 int cpu;
1178
1179 /* We don't use pending refcount in rx_ring. */
1180 if (rb->pending_refcnt == NULL)
1181 return 0;
1182
1183 for_each_possible_cpu(cpu)
1184 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1185
1186 return refcnt;
1187}
1188
1189static int packet_alloc_pending(struct packet_sock *po)
1190{
1191 po->rx_ring.pending_refcnt = NULL;
1192
1193 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1194 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1195 return -ENOBUFS;
1196
1197 return 0;
1198}
1199
1200static void packet_free_pending(struct packet_sock *po)
1201{
1202 free_percpu(po->tx_ring.pending_refcnt);
1203}
1204
9954729b
WB
1205#define ROOM_POW_OFF 2
1206#define ROOM_NONE 0x0
1207#define ROOM_LOW 0x1
1208#define ROOM_NORMAL 0x2
1209
1210static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1211{
9954729b
WB
1212 int idx, len;
1213
1214 len = po->rx_ring.frame_max + 1;
1215 idx = po->rx_ring.head;
1216 if (pow_off)
1217 idx += len >> pow_off;
1218 if (idx >= len)
1219 idx -= len;
1220 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1221}
1222
1223static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1224{
1225 int idx, len;
1226
1227 len = po->rx_ring.prb_bdqc.knum_blocks;
1228 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1229 if (pow_off)
1230 idx += len >> pow_off;
1231 if (idx >= len)
1232 idx -= len;
1233 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1234}
77f65ebd 1235
2ccdbaa6 1236static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1237{
1238 struct sock *sk = &po->sk;
1239 int ret = ROOM_NONE;
1240
1241 if (po->prot_hook.func != tpacket_rcv) {
1242 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1243 - (skb ? skb->truesize : 0);
9954729b
WB
1244 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1245 return ROOM_NORMAL;
1246 else if (avail > 0)
1247 return ROOM_LOW;
1248 else
1249 return ROOM_NONE;
1250 }
77f65ebd 1251
9954729b
WB
1252 if (po->tp_version == TPACKET_V3) {
1253 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1254 ret = ROOM_NORMAL;
1255 else if (__tpacket_v3_has_room(po, 0))
1256 ret = ROOM_LOW;
1257 } else {
1258 if (__tpacket_has_room(po, ROOM_POW_OFF))
1259 ret = ROOM_NORMAL;
1260 else if (__tpacket_has_room(po, 0))
1261 ret = ROOM_LOW;
1262 }
2ccdbaa6
WB
1263
1264 return ret;
1265}
1266
1267static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1268{
1269 int ret;
1270 bool has_room;
1271
54d7c01d
WB
1272 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1273 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1274 has_room = ret == ROOM_NORMAL;
1275 if (po->pressure == has_room)
54d7c01d
WB
1276 po->pressure = !has_room;
1277 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1278
9954729b 1279 return ret;
77f65ebd
WB
1280}
1281
1da177e4
LT
1282static void packet_sock_destruct(struct sock *sk)
1283{
ed85b565
RC
1284 skb_queue_purge(&sk->sk_error_queue);
1285
547b792c 1286 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1287 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1288
1289 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1290 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1291 return;
1292 }
1293
17ab56a2 1294 sk_refcnt_debug_dec(sk);
1da177e4
LT
1295}
1296
3b3a5b0a
WB
1297static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1298{
1299 u32 rxhash;
1300 int i, count = 0;
1301
1302 rxhash = skb_get_hash(skb);
1303 for (i = 0; i < ROLLOVER_HLEN; i++)
1304 if (po->rollover->history[i] == rxhash)
1305 count++;
1306
1307 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1308 return count > (ROLLOVER_HLEN >> 1);
1309}
1310
77f65ebd
WB
1311static unsigned int fanout_demux_hash(struct packet_fanout *f,
1312 struct sk_buff *skb,
1313 unsigned int num)
dc99f600 1314{
eb70db87 1315 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1316}
1317
77f65ebd
WB
1318static unsigned int fanout_demux_lb(struct packet_fanout *f,
1319 struct sk_buff *skb,
1320 unsigned int num)
dc99f600 1321{
468479e6 1322 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1323
468479e6 1324 return val % num;
77f65ebd
WB
1325}
1326
1327static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1328 struct sk_buff *skb,
1329 unsigned int num)
1330{
1331 return smp_processor_id() % num;
dc99f600
DM
1332}
1333
5df0ddfb
DB
1334static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1335 struct sk_buff *skb,
1336 unsigned int num)
1337{
f337db64 1338 return prandom_u32_max(num);
5df0ddfb
DB
1339}
1340
77f65ebd
WB
1341static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1342 struct sk_buff *skb,
ad377cab 1343 unsigned int idx, bool try_self,
77f65ebd 1344 unsigned int num)
95ec3eb4 1345{
4633c9e0 1346 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1347 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1348
0648ab70 1349 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1350
1351 if (try_self) {
1352 room = packet_rcv_has_room(po, skb);
1353 if (room == ROOM_NORMAL ||
1354 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1355 return idx;
4633c9e0 1356 po_skip = po;
3b3a5b0a 1357 }
ad377cab 1358
0648ab70 1359 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1360 do {
2ccdbaa6 1361 po_next = pkt_sk(f->arr[i]);
4633c9e0 1362 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1363 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1364 if (i != j)
0648ab70 1365 po->rollover->sock = i;
a9b63918
WB
1366 atomic_long_inc(&po->rollover->num);
1367 if (room == ROOM_LOW)
1368 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1369 return i;
1370 }
ad377cab 1371
77f65ebd
WB
1372 if (++i == num)
1373 i = 0;
1374 } while (i != j);
1375
a9b63918 1376 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1377 return idx;
1378}
1379
2d36097d
NH
1380static unsigned int fanout_demux_qm(struct packet_fanout *f,
1381 struct sk_buff *skb,
1382 unsigned int num)
1383{
1384 return skb_get_queue_mapping(skb) % num;
1385}
1386
47dceb8e
WB
1387static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1388 struct sk_buff *skb,
1389 unsigned int num)
1390{
1391 struct bpf_prog *prog;
1392 unsigned int ret = 0;
1393
1394 rcu_read_lock();
1395 prog = rcu_dereference(f->bpf_prog);
1396 if (prog)
ff936a04 1397 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1398 rcu_read_unlock();
1399
1400 return ret;
1401}
1402
77f65ebd
WB
1403static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1404{
1405 return f->flags & (flag >> 8);
95ec3eb4
DM
1406}
1407
95ec3eb4
DM
1408static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1409 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1410{
1411 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1412 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1413 struct net *net = read_pnet(&f->net);
dc99f600 1414 struct packet_sock *po;
77f65ebd 1415 unsigned int idx;
dc99f600 1416
19bcf9f2 1417 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1418 kfree_skb(skb);
1419 return 0;
1420 }
1421
3f34b24a 1422 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1423 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1424 if (!skb)
1425 return 0;
1426 }
95ec3eb4
DM
1427 switch (f->type) {
1428 case PACKET_FANOUT_HASH:
1429 default:
77f65ebd 1430 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1431 break;
1432 case PACKET_FANOUT_LB:
77f65ebd 1433 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1434 break;
1435 case PACKET_FANOUT_CPU:
77f65ebd
WB
1436 idx = fanout_demux_cpu(f, skb, num);
1437 break;
5df0ddfb
DB
1438 case PACKET_FANOUT_RND:
1439 idx = fanout_demux_rnd(f, skb, num);
1440 break;
2d36097d
NH
1441 case PACKET_FANOUT_QM:
1442 idx = fanout_demux_qm(f, skb, num);
1443 break;
77f65ebd 1444 case PACKET_FANOUT_ROLLOVER:
ad377cab 1445 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1446 break;
47dceb8e 1447 case PACKET_FANOUT_CBPF:
f2e52095 1448 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1449 idx = fanout_demux_bpf(f, skb, num);
1450 break;
dc99f600
DM
1451 }
1452
ad377cab
WB
1453 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1454 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1455
ad377cab 1456 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1457 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1458}
1459
fff3321d
PE
1460DEFINE_MUTEX(fanout_mutex);
1461EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1462static LIST_HEAD(fanout_list);
4a69a864 1463static u16 fanout_next_id;
dc99f600
DM
1464
1465static void __fanout_link(struct sock *sk, struct packet_sock *po)
1466{
1467 struct packet_fanout *f = po->fanout;
1468
1469 spin_lock(&f->lock);
1470 f->arr[f->num_members] = sk;
1471 smp_wmb();
1472 f->num_members++;
2bd624b4
AS
1473 if (f->num_members == 1)
1474 dev_add_pack(&f->prot_hook);
dc99f600
DM
1475 spin_unlock(&f->lock);
1476}
1477
1478static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1479{
1480 struct packet_fanout *f = po->fanout;
1481 int i;
1482
1483 spin_lock(&f->lock);
1484 for (i = 0; i < f->num_members; i++) {
1485 if (f->arr[i] == sk)
1486 break;
1487 }
1488 BUG_ON(i >= f->num_members);
1489 f->arr[i] = f->arr[f->num_members - 1];
1490 f->num_members--;
2bd624b4
AS
1491 if (f->num_members == 0)
1492 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1493 spin_unlock(&f->lock);
1494}
1495
d4dd8aee 1496static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1497{
161642e2
ED
1498 if (sk->sk_family != PF_PACKET)
1499 return false;
c0de08d0 1500
161642e2 1501 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1502}
1503
47dceb8e
WB
1504static void fanout_init_data(struct packet_fanout *f)
1505{
1506 switch (f->type) {
1507 case PACKET_FANOUT_LB:
1508 atomic_set(&f->rr_cur, 0);
1509 break;
1510 case PACKET_FANOUT_CBPF:
f2e52095 1511 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1512 RCU_INIT_POINTER(f->bpf_prog, NULL);
1513 break;
1514 }
1515}
1516
1517static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1518{
1519 struct bpf_prog *old;
1520
1521 spin_lock(&f->lock);
1522 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1523 rcu_assign_pointer(f->bpf_prog, new);
1524 spin_unlock(&f->lock);
1525
1526 if (old) {
1527 synchronize_net();
1528 bpf_prog_destroy(old);
1529 }
1530}
1531
1532static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1533 unsigned int len)
1534{
1535 struct bpf_prog *new;
1536 struct sock_fprog fprog;
1537 int ret;
1538
1539 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1540 return -EPERM;
1541 if (len != sizeof(fprog))
1542 return -EINVAL;
1543 if (copy_from_user(&fprog, data, len))
1544 return -EFAULT;
1545
bab18991 1546 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1547 if (ret)
1548 return ret;
1549
1550 __fanout_set_data_bpf(po->fanout, new);
1551 return 0;
1552}
1553
f2e52095
WB
1554static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1555 unsigned int len)
1556{
1557 struct bpf_prog *new;
1558 u32 fd;
1559
1560 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1561 return -EPERM;
1562 if (len != sizeof(fd))
1563 return -EINVAL;
1564 if (copy_from_user(&fd, data, len))
1565 return -EFAULT;
1566
113214be 1567 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1568 if (IS_ERR(new))
1569 return PTR_ERR(new);
f2e52095
WB
1570
1571 __fanout_set_data_bpf(po->fanout, new);
1572 return 0;
1573}
1574
47dceb8e
WB
1575static int fanout_set_data(struct packet_sock *po, char __user *data,
1576 unsigned int len)
1577{
1578 switch (po->fanout->type) {
1579 case PACKET_FANOUT_CBPF:
1580 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1581 case PACKET_FANOUT_EBPF:
1582 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1583 default:
1584 return -EINVAL;
07d53ae4 1585 }
47dceb8e
WB
1586}
1587
1588static void fanout_release_data(struct packet_fanout *f)
1589{
1590 switch (f->type) {
1591 case PACKET_FANOUT_CBPF:
f2e52095 1592 case PACKET_FANOUT_EBPF:
47dceb8e 1593 __fanout_set_data_bpf(f, NULL);
07d53ae4 1594 }
47dceb8e
WB
1595}
1596
4a69a864
MM
1597static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1598{
1599 struct packet_fanout *f;
1600
1601 list_for_each_entry(f, &fanout_list, list) {
1602 if (f->id == candidate_id &&
1603 read_pnet(&f->net) == sock_net(sk)) {
1604 return false;
1605 }
1606 }
1607 return true;
1608}
1609
1610static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1611{
1612 u16 id = fanout_next_id;
1613
1614 do {
1615 if (__fanout_id_is_free(sk, id)) {
1616 *new_id = id;
1617 fanout_next_id = id + 1;
1618 return true;
1619 }
1620
1621 id++;
1622 } while (id != fanout_next_id);
1623
1624 return false;
1625}
1626
7736d33f 1627static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1628{
d199fab6 1629 struct packet_rollover *rollover = NULL;
dc99f600
DM
1630 struct packet_sock *po = pkt_sk(sk);
1631 struct packet_fanout *f, *match;
7736d33f 1632 u8 type = type_flags & 0xff;
77f65ebd 1633 u8 flags = type_flags >> 8;
dc99f600
DM
1634 int err;
1635
1636 switch (type) {
77f65ebd
WB
1637 case PACKET_FANOUT_ROLLOVER:
1638 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1639 return -EINVAL;
dc99f600
DM
1640 case PACKET_FANOUT_HASH:
1641 case PACKET_FANOUT_LB:
95ec3eb4 1642 case PACKET_FANOUT_CPU:
5df0ddfb 1643 case PACKET_FANOUT_RND:
2d36097d 1644 case PACKET_FANOUT_QM:
47dceb8e 1645 case PACKET_FANOUT_CBPF:
f2e52095 1646 case PACKET_FANOUT_EBPF:
dc99f600
DM
1647 break;
1648 default:
1649 return -EINVAL;
1650 }
1651
d199fab6
ED
1652 mutex_lock(&fanout_mutex);
1653
d199fab6 1654 err = -EALREADY;
dc99f600 1655 if (po->fanout)
d199fab6 1656 goto out;
dc99f600 1657
4633c9e0
WB
1658 if (type == PACKET_FANOUT_ROLLOVER ||
1659 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1660 err = -ENOMEM;
1661 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1662 if (!rollover)
1663 goto out;
1664 atomic_long_set(&rollover->num, 0);
1665 atomic_long_set(&rollover->num_huge, 0);
1666 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1667 }
1668
4a69a864
MM
1669 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1670 if (id != 0) {
1671 err = -EINVAL;
1672 goto out;
1673 }
1674 if (!fanout_find_new_id(sk, &id)) {
1675 err = -ENOMEM;
1676 goto out;
1677 }
1678 /* ephemeral flag for the first socket in the group: drop it */
1679 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1680 }
1681
dc99f600
DM
1682 match = NULL;
1683 list_for_each_entry(f, &fanout_list, list) {
1684 if (f->id == id &&
1685 read_pnet(&f->net) == sock_net(sk)) {
1686 match = f;
1687 break;
1688 }
1689 }
afe62c68 1690 err = -EINVAL;
77f65ebd 1691 if (match && match->flags != flags)
afe62c68 1692 goto out;
dc99f600 1693 if (!match) {
afe62c68 1694 err = -ENOMEM;
dc99f600 1695 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1696 if (!match)
1697 goto out;
1698 write_pnet(&match->net, sock_net(sk));
1699 match->id = id;
1700 match->type = type;
77f65ebd 1701 match->flags = flags;
afe62c68
ED
1702 INIT_LIST_HEAD(&match->list);
1703 spin_lock_init(&match->lock);
fb5c2c17 1704 refcount_set(&match->sk_ref, 0);
47dceb8e 1705 fanout_init_data(match);
afe62c68
ED
1706 match->prot_hook.type = po->prot_hook.type;
1707 match->prot_hook.dev = po->prot_hook.dev;
1708 match->prot_hook.func = packet_rcv_fanout;
1709 match->prot_hook.af_packet_priv = match;
c0de08d0 1710 match->prot_hook.id_match = match_fanout_group;
afe62c68 1711 list_add(&match->list, &fanout_list);
dc99f600 1712 }
afe62c68 1713 err = -EINVAL;
008ba2a1
WB
1714
1715 spin_lock(&po->bind_lock);
1716 if (po->running &&
1717 match->type == type &&
afe62c68
ED
1718 match->prot_hook.type == po->prot_hook.type &&
1719 match->prot_hook.dev == po->prot_hook.dev) {
1720 err = -ENOSPC;
fb5c2c17 1721 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
afe62c68
ED
1722 __dev_remove_pack(&po->prot_hook);
1723 po->fanout = match;
57f015f5
MM
1724 po->rollover = rollover;
1725 rollover = NULL;
fb5c2c17 1726 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1727 __fanout_link(sk, po);
1728 err = 0;
dc99f600
DM
1729 }
1730 }
008ba2a1
WB
1731 spin_unlock(&po->bind_lock);
1732
1733 if (err && !refcount_read(&match->sk_ref)) {
1734 list_del(&match->list);
1735 kfree(match);
1736 }
1737
afe62c68 1738out:
57f015f5 1739 kfree(rollover);
d199fab6 1740 mutex_unlock(&fanout_mutex);
dc99f600
DM
1741 return err;
1742}
1743
2bd624b4
AS
1744/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1745 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1746 * It is the responsibility of the caller to call fanout_release_data() and
1747 * free the returned packet_fanout (after synchronize_net())
1748 */
1749static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1750{
1751 struct packet_sock *po = pkt_sk(sk);
1752 struct packet_fanout *f;
1753
fff3321d 1754 mutex_lock(&fanout_mutex);
d199fab6
ED
1755 f = po->fanout;
1756 if (f) {
1757 po->fanout = NULL;
1758
fb5c2c17 1759 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1760 list_del(&f->list);
2bd624b4
AS
1761 else
1762 f = NULL;
dc99f600
DM
1763 }
1764 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1765
1766 return f;
dc99f600 1767}
1da177e4 1768
3c70c132
DB
1769static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1770 struct sk_buff *skb)
1771{
1772 /* Earlier code assumed this would be a VLAN pkt, double-check
1773 * this now that we have the actual packet in hand. We can only
1774 * do this check on Ethernet devices.
1775 */
1776 if (unlikely(dev->type != ARPHRD_ETHER))
1777 return false;
1778
1779 skb_reset_mac_header(skb);
1780 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1781}
1782
90ddc4f0 1783static const struct proto_ops packet_ops;
1da177e4 1784
90ddc4f0 1785static const struct proto_ops packet_ops_spkt;
1da177e4 1786
40d4e3df
ED
1787static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1788 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1789{
1790 struct sock *sk;
1791 struct sockaddr_pkt *spkt;
1792
1793 /*
1794 * When we registered the protocol we saved the socket in the data
1795 * field for just this event.
1796 */
1797
1798 sk = pt->af_packet_priv;
1ce4f28b 1799
1da177e4
LT
1800 /*
1801 * Yank back the headers [hope the device set this
1802 * right or kerboom...]
1803 *
1804 * Incoming packets have ll header pulled,
1805 * push it back.
1806 *
98e399f8 1807 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1808 * so that this procedure is noop.
1809 */
1810
1811 if (skb->pkt_type == PACKET_LOOPBACK)
1812 goto out;
1813
09ad9bc7 1814 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1815 goto out;
1816
40d4e3df
ED
1817 skb = skb_share_check(skb, GFP_ATOMIC);
1818 if (skb == NULL)
1da177e4
LT
1819 goto oom;
1820
1821 /* drop any routing info */
adf30907 1822 skb_dst_drop(skb);
1da177e4 1823
84531c24
PO
1824 /* drop conntrack reference */
1825 nf_reset(skb);
1826
ffbc6111 1827 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1828
98e399f8 1829 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1830
1831 /*
1832 * The SOCK_PACKET socket receives _all_ frames.
1833 */
1834
1835 spkt->spkt_family = dev->type;
1836 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1837 spkt->spkt_protocol = skb->protocol;
1838
1839 /*
1840 * Charge the memory to the socket. This is done specifically
1841 * to prevent sockets using all the memory up.
1842 */
1843
40d4e3df 1844 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1845 return 0;
1846
1847out:
1848 kfree_skb(skb);
1849oom:
1850 return 0;
1851}
1852
1853
1854/*
1855 * Output a raw packet to a device layer. This bypasses all the other
1856 * protocol layers and you must therefore supply it with a complete frame
1857 */
1ce4f28b 1858
1b784140
YX
1859static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1860 size_t len)
1da177e4
LT
1861{
1862 struct sock *sk = sock->sk;
342dfc30 1863 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1864 struct sk_buff *skb = NULL;
1da177e4 1865 struct net_device *dev;
c14ac945 1866 struct sockcm_cookie sockc;
40d4e3df 1867 __be16 proto = 0;
1da177e4 1868 int err;
3bdc0eba 1869 int extra_len = 0;
1ce4f28b 1870
1da177e4 1871 /*
1ce4f28b 1872 * Get and verify the address.
1da177e4
LT
1873 */
1874
40d4e3df 1875 if (saddr) {
1da177e4 1876 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1877 return -EINVAL;
1878 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1879 proto = saddr->spkt_protocol;
1880 } else
1881 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1882
1883 /*
1ce4f28b 1884 * Find the device first to size check it
1da177e4
LT
1885 */
1886
de74e92a 1887 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1888retry:
654d1f8a
ED
1889 rcu_read_lock();
1890 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1891 err = -ENODEV;
1892 if (dev == NULL)
1893 goto out_unlock;
1ce4f28b 1894
d5e76b0a
DM
1895 err = -ENETDOWN;
1896 if (!(dev->flags & IFF_UP))
1897 goto out_unlock;
1898
1da177e4 1899 /*
40d4e3df
ED
1900 * You may not queue a frame bigger than the mtu. This is the lowest level
1901 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1902 */
1ce4f28b 1903
3bdc0eba
BG
1904 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1905 if (!netif_supports_nofcs(dev)) {
1906 err = -EPROTONOSUPPORT;
1907 goto out_unlock;
1908 }
1909 extra_len = 4; /* We're doing our own CRC */
1910 }
1911
1da177e4 1912 err = -EMSGSIZE;
3bdc0eba 1913 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1914 goto out_unlock;
1915
1a35ca80
ED
1916 if (!skb) {
1917 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1918 int tlen = dev->needed_tailroom;
1a35ca80
ED
1919 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1920
1921 rcu_read_unlock();
4ce40912 1922 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1923 if (skb == NULL)
1924 return -ENOBUFS;
1925 /* FIXME: Save some space for broken drivers that write a hard
1926 * header at transmission time by themselves. PPP is the notable
1927 * one here. This should really be fixed at the driver level.
1928 */
1929 skb_reserve(skb, reserved);
1930 skb_reset_network_header(skb);
1931
1932 /* Try to align data part correctly */
1933 if (hhlen) {
1934 skb->data -= hhlen;
1935 skb->tail -= hhlen;
1936 if (len < hhlen)
1937 skb_reset_network_header(skb);
1938 }
6ce8e9ce 1939 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1940 if (err)
1941 goto out_free;
1942 goto retry;
1da177e4
LT
1943 }
1944
9ed988cd
WB
1945 if (!dev_validate_header(dev, skb->data, len)) {
1946 err = -EINVAL;
1947 goto out_unlock;
1948 }
3c70c132
DB
1949 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1950 !packet_extra_vlan_len_allowed(dev, skb)) {
1951 err = -EMSGSIZE;
1952 goto out_unlock;
57f89bfa 1953 }
1a35ca80 1954
657a0667 1955 sockcm_init(&sockc, sk);
c14ac945
SHY
1956 if (msg->msg_controllen) {
1957 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1958 if (unlikely(err))
c14ac945 1959 goto out_unlock;
c14ac945
SHY
1960 }
1961
1da177e4
LT
1962 skb->protocol = proto;
1963 skb->dev = dev;
1964 skb->priority = sk->sk_priority;
2d37a186 1965 skb->mark = sk->sk_mark;
3d0ba8c0 1966 skb->tstamp = sockc.transmit_time;
bf84a010 1967
c14ac945 1968 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 1969
3bdc0eba
BG
1970 if (unlikely(extra_len == 4))
1971 skb->no_fcs = 1;
1972
40893fd0 1973 skb_probe_transport_header(skb, 0);
c1aad275 1974
1da177e4 1975 dev_queue_xmit(skb);
654d1f8a 1976 rcu_read_unlock();
40d4e3df 1977 return len;
1da177e4 1978
1da177e4 1979out_unlock:
654d1f8a 1980 rcu_read_unlock();
1a35ca80
ED
1981out_free:
1982 kfree_skb(skb);
1da177e4
LT
1983 return err;
1984}
1da177e4 1985
ff936a04
AS
1986static unsigned int run_filter(struct sk_buff *skb,
1987 const struct sock *sk,
1988 unsigned int res)
1da177e4
LT
1989{
1990 struct sk_filter *filter;
fda9ef5d 1991
80f8f102
ED
1992 rcu_read_lock();
1993 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1994 if (filter != NULL)
ff936a04 1995 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1996 rcu_read_unlock();
1da177e4 1997
dbcb5855 1998 return res;
1da177e4
LT
1999}
2000
16cc1400
WB
2001static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2002 size_t *len)
2003{
2004 struct virtio_net_hdr vnet_hdr;
2005
2006 if (*len < sizeof(vnet_hdr))
2007 return -EINVAL;
2008 *len -= sizeof(vnet_hdr);
2009
fd3a8862 2010 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2011 return -EINVAL;
2012
2013 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2014}
2015
1da177e4 2016/*
62ab0812
ED
2017 * This function makes lazy skb cloning in hope that most of packets
2018 * are discarded by BPF.
2019 *
2020 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2021 * and skb->cb are mangled. It works because (and until) packets
2022 * falling here are owned by current CPU. Output packets are cloned
2023 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2024 * sequencially, so that if we return skb to original state on exit,
2025 * we will not harm anyone.
1da177e4
LT
2026 */
2027
40d4e3df
ED
2028static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2029 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2030{
2031 struct sock *sk;
2032 struct sockaddr_ll *sll;
2033 struct packet_sock *po;
40d4e3df 2034 u8 *skb_head = skb->data;
1da177e4 2035 int skb_len = skb->len;
dbcb5855 2036 unsigned int snaplen, res;
da37845f 2037 bool is_drop_n_account = false;
1da177e4
LT
2038
2039 if (skb->pkt_type == PACKET_LOOPBACK)
2040 goto drop;
2041
2042 sk = pt->af_packet_priv;
2043 po = pkt_sk(sk);
2044
09ad9bc7 2045 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2046 goto drop;
2047
1da177e4
LT
2048 skb->dev = dev;
2049
3b04ddde 2050 if (dev->header_ops) {
1da177e4 2051 /* The device has an explicit notion of ll header,
62ab0812
ED
2052 * exported to higher levels.
2053 *
2054 * Otherwise, the device hides details of its frame
2055 * structure, so that corresponding packet head is
2056 * never delivered to user.
1da177e4
LT
2057 */
2058 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2059 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2060 else if (skb->pkt_type == PACKET_OUTGOING) {
2061 /* Special case: outgoing packets have ll header at head */
bbe735e4 2062 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2063 }
2064 }
2065
2066 snaplen = skb->len;
2067
dbcb5855
DM
2068 res = run_filter(skb, sk, snaplen);
2069 if (!res)
fda9ef5d 2070 goto drop_n_restore;
dbcb5855
DM
2071 if (snaplen > res)
2072 snaplen = res;
1da177e4 2073
0fd7bac6 2074 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2075 goto drop_n_acct;
2076
2077 if (skb_shared(skb)) {
2078 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2079 if (nskb == NULL)
2080 goto drop_n_acct;
2081
2082 if (skb_head != skb->data) {
2083 skb->data = skb_head;
2084 skb->len = skb_len;
2085 }
abc4e4fa 2086 consume_skb(skb);
1da177e4
LT
2087 skb = nskb;
2088 }
2089
b4772ef8 2090 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2091
2092 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2093 sll->sll_hatype = dev->type;
1da177e4 2094 sll->sll_pkttype = skb->pkt_type;
8032b464 2095 if (unlikely(po->origdev))
80feaacb
PWJ
2096 sll->sll_ifindex = orig_dev->ifindex;
2097 else
2098 sll->sll_ifindex = dev->ifindex;
1da177e4 2099
b95cce35 2100 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2101
2472d761
EB
2102 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2103 * Use their space for storing the original skb length.
2104 */
2105 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2106
1da177e4
LT
2107 if (pskb_trim(skb, snaplen))
2108 goto drop_n_acct;
2109
2110 skb_set_owner_r(skb, sk);
2111 skb->dev = NULL;
adf30907 2112 skb_dst_drop(skb);
1da177e4 2113
84531c24
PO
2114 /* drop conntrack reference */
2115 nf_reset(skb);
2116
1da177e4 2117 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2118 po->stats.stats1.tp_packets++;
3bc3b96f 2119 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2120 __skb_queue_tail(&sk->sk_receive_queue, skb);
2121 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2122 sk->sk_data_ready(sk);
1da177e4
LT
2123 return 0;
2124
2125drop_n_acct:
da37845f 2126 is_drop_n_account = true;
7091fbd8 2127 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2128 po->stats.stats1.tp_drops++;
7091fbd8
WB
2129 atomic_inc(&sk->sk_drops);
2130 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2131
2132drop_n_restore:
2133 if (skb_head != skb->data && skb_shared(skb)) {
2134 skb->data = skb_head;
2135 skb->len = skb_len;
2136 }
2137drop:
da37845f
WJ
2138 if (!is_drop_n_account)
2139 consume_skb(skb);
2140 else
2141 kfree_skb(skb);
1da177e4
LT
2142 return 0;
2143}
2144
40d4e3df
ED
2145static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2146 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2147{
2148 struct sock *sk;
2149 struct packet_sock *po;
2150 struct sockaddr_ll *sll;
184f489e 2151 union tpacket_uhdr h;
40d4e3df 2152 u8 *skb_head = skb->data;
1da177e4 2153 int skb_len = skb->len;
dbcb5855 2154 unsigned int snaplen, res;
f6fb8f10 2155 unsigned long status = TP_STATUS_USER;
bbd6ef87 2156 unsigned short macoff, netoff, hdrlen;
1da177e4 2157 struct sk_buff *copy_skb = NULL;
bbd6ef87 2158 struct timespec ts;
b9c32fb2 2159 __u32 ts_status;
da37845f 2160 bool is_drop_n_account = false;
edbd58be 2161 bool do_vnet = false;
1da177e4 2162
51846355
AW
2163 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2164 * We may add members to them until current aligned size without forcing
2165 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2166 */
2167 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2168 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2169
1da177e4
LT
2170 if (skb->pkt_type == PACKET_LOOPBACK)
2171 goto drop;
2172
2173 sk = pt->af_packet_priv;
2174 po = pkt_sk(sk);
2175
09ad9bc7 2176 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2177 goto drop;
2178
3b04ddde 2179 if (dev->header_ops) {
1da177e4 2180 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2181 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2182 else if (skb->pkt_type == PACKET_OUTGOING) {
2183 /* Special case: outgoing packets have ll header at head */
bbe735e4 2184 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2185 }
2186 }
2187
2188 snaplen = skb->len;
2189
dbcb5855
DM
2190 res = run_filter(skb, sk, snaplen);
2191 if (!res)
fda9ef5d 2192 goto drop_n_restore;
68c2e5de
AD
2193
2194 if (skb->ip_summed == CHECKSUM_PARTIAL)
2195 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2196 else if (skb->pkt_type != PACKET_OUTGOING &&
2197 (skb->ip_summed == CHECKSUM_COMPLETE ||
2198 skb_csum_unnecessary(skb)))
2199 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2200
dbcb5855
DM
2201 if (snaplen > res)
2202 snaplen = res;
1da177e4
LT
2203
2204 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2205 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2206 po->tp_reserve;
1da177e4 2207 } else {
95c96174 2208 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2209 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2210 (maclen < 16 ? 16 : maclen)) +
58d19b19 2211 po->tp_reserve;
edbd58be 2212 if (po->has_vnet_hdr) {
58d19b19 2213 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2214 do_vnet = true;
2215 }
1da177e4
LT
2216 macoff = netoff - maclen;
2217 }
f6fb8f10 2218 if (po->tp_version <= TPACKET_V2) {
2219 if (macoff + snaplen > po->rx_ring.frame_size) {
2220 if (po->copy_thresh &&
0fd7bac6 2221 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2222 if (skb_shared(skb)) {
2223 copy_skb = skb_clone(skb, GFP_ATOMIC);
2224 } else {
2225 copy_skb = skb_get(skb);
2226 skb_head = skb->data;
2227 }
2228 if (copy_skb)
2229 skb_set_owner_r(copy_skb, sk);
1da177e4 2230 }
f6fb8f10 2231 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2232 if ((int)snaplen < 0) {
f6fb8f10 2233 snaplen = 0;
edbd58be
BP
2234 do_vnet = false;
2235 }
1da177e4 2236 }
dc808110
ED
2237 } else if (unlikely(macoff + snaplen >
2238 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2239 u32 nval;
2240
2241 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2242 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2243 snaplen, nval, macoff);
2244 snaplen = nval;
2245 if (unlikely((int)snaplen < 0)) {
2246 snaplen = 0;
2247 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2248 do_vnet = false;
dc808110 2249 }
1da177e4 2250 }
1da177e4 2251 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2252 h.raw = packet_current_rx_frame(po, skb,
2253 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2254 if (!h.raw)
58d19b19 2255 goto drop_n_account;
f6fb8f10 2256 if (po->tp_version <= TPACKET_V2) {
2257 packet_increment_rx_head(po, &po->rx_ring);
2258 /*
2259 * LOSING will be reported till you read the stats,
2260 * because it's COR - Clear On Read.
2261 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2262 * at packet level.
2263 */
ee80fbf3 2264 if (po->stats.stats1.tp_drops)
f6fb8f10 2265 status |= TP_STATUS_LOSING;
2266 }
945d015e
ED
2267
2268 if (do_vnet &&
2269 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2270 sizeof(struct virtio_net_hdr),
2271 vio_le(), true, 0))
2272 goto drop_n_account;
2273
ee80fbf3 2274 po->stats.stats1.tp_packets++;
1da177e4
LT
2275 if (copy_skb) {
2276 status |= TP_STATUS_COPY;
2277 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2278 }
1da177e4
LT
2279 spin_unlock(&sk->sk_receive_queue.lock);
2280
bbd6ef87 2281 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2282
2283 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2284 getnstimeofday(&ts);
1da177e4 2285
b9c32fb2
DB
2286 status |= ts_status;
2287
bbd6ef87
PM
2288 switch (po->tp_version) {
2289 case TPACKET_V1:
2290 h.h1->tp_len = skb->len;
2291 h.h1->tp_snaplen = snaplen;
2292 h.h1->tp_mac = macoff;
2293 h.h1->tp_net = netoff;
4b457bdf
DB
2294 h.h1->tp_sec = ts.tv_sec;
2295 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2296 hdrlen = sizeof(*h.h1);
2297 break;
2298 case TPACKET_V2:
2299 h.h2->tp_len = skb->len;
2300 h.h2->tp_snaplen = snaplen;
2301 h.h2->tp_mac = macoff;
2302 h.h2->tp_net = netoff;
bbd6ef87
PM
2303 h.h2->tp_sec = ts.tv_sec;
2304 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2305 if (skb_vlan_tag_present(skb)) {
2306 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2307 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2308 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2309 } else {
2310 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2311 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2312 }
e4d26f4b 2313 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2314 hdrlen = sizeof(*h.h2);
2315 break;
f6fb8f10 2316 case TPACKET_V3:
2317 /* tp_nxt_offset,vlan are already populated above.
2318 * So DONT clear those fields here
2319 */
2320 h.h3->tp_status |= status;
2321 h.h3->tp_len = skb->len;
2322 h.h3->tp_snaplen = snaplen;
2323 h.h3->tp_mac = macoff;
2324 h.h3->tp_net = netoff;
f6fb8f10 2325 h.h3->tp_sec = ts.tv_sec;
2326 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2327 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2328 hdrlen = sizeof(*h.h3);
2329 break;
bbd6ef87
PM
2330 default:
2331 BUG();
2332 }
1da177e4 2333
bbd6ef87 2334 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2335 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2336 sll->sll_family = AF_PACKET;
2337 sll->sll_hatype = dev->type;
2338 sll->sll_protocol = skb->protocol;
2339 sll->sll_pkttype = skb->pkt_type;
8032b464 2340 if (unlikely(po->origdev))
80feaacb
PWJ
2341 sll->sll_ifindex = orig_dev->ifindex;
2342 else
2343 sll->sll_ifindex = dev->ifindex;
1da177e4 2344
e16aa207 2345 smp_mb();
f0d4eb29 2346
f6dafa95 2347#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2348 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2349 u8 *start, *end;
2350
f0d4eb29
DB
2351 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2352 macoff + snaplen);
2353
2354 for (start = h.raw; start < end; start += PAGE_SIZE)
2355 flush_dcache_page(pgv_to_page(start));
1da177e4 2356 }
f0d4eb29 2357 smp_wmb();
f6dafa95 2358#endif
f0d4eb29 2359
da413eec 2360 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2361 __packet_set_status(po, h.raw, status);
da413eec
DC
2362 sk->sk_data_ready(sk);
2363 } else {
f6fb8f10 2364 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2365 }
1da177e4
LT
2366
2367drop_n_restore:
2368 if (skb_head != skb->data && skb_shared(skb)) {
2369 skb->data = skb_head;
2370 skb->len = skb_len;
2371 }
2372drop:
da37845f
WJ
2373 if (!is_drop_n_account)
2374 consume_skb(skb);
2375 else
2376 kfree_skb(skb);
1da177e4
LT
2377 return 0;
2378
58d19b19 2379drop_n_account:
da37845f 2380 is_drop_n_account = true;
ee80fbf3 2381 po->stats.stats1.tp_drops++;
1da177e4
LT
2382 spin_unlock(&sk->sk_receive_queue.lock);
2383
676d2369 2384 sk->sk_data_ready(sk);
acb5d75b 2385 kfree_skb(copy_skb);
1da177e4
LT
2386 goto drop_n_restore;
2387}
2388
69e3c75f
JB
2389static void tpacket_destruct_skb(struct sk_buff *skb)
2390{
2391 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2392
69e3c75f 2393 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2394 void *ph;
b9c32fb2
DB
2395 __u32 ts;
2396
5cd8d46e 2397 ph = skb_zcopy_get_nouarg(skb);
b0138408 2398 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2399
2400 ts = __packet_set_timestamp(po, ph, skb);
2401 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2402 }
2403
2404 sock_wfree(skb);
2405}
2406
c72219b7
DB
2407static void tpacket_set_protocol(const struct net_device *dev,
2408 struct sk_buff *skb)
2409{
2410 if (dev->type == ARPHRD_ETHER) {
2411 skb_reset_mac_header(skb);
2412 skb->protocol = eth_hdr(skb)->h_proto;
2413 }
2414}
2415
16cc1400
WB
2416static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2417{
16cc1400
WB
2418 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2419 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2420 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2421 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2422 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2423 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2424 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2425
2426 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2427 return -EINVAL;
2428
16cc1400
WB
2429 return 0;
2430}
2431
2432static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2433 struct virtio_net_hdr *vnet_hdr)
2434{
16cc1400
WB
2435 if (*len < sizeof(*vnet_hdr))
2436 return -EINVAL;
2437 *len -= sizeof(*vnet_hdr);
2438
cbbd26b8 2439 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2440 return -EFAULT;
2441
2442 return __packet_snd_vnet_parse(vnet_hdr, *len);
2443}
2444
40d4e3df 2445static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2446 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2447 __be16 proto, unsigned char *addr, int hlen, int copylen,
2448 const struct sockcm_cookie *sockc)
69e3c75f 2449{
184f489e 2450 union tpacket_uhdr ph;
8d39b4a6 2451 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2452 struct socket *sock = po->sk.sk_socket;
2453 struct page *page;
69e3c75f
JB
2454 int err;
2455
2456 ph.raw = frame;
2457
2458 skb->protocol = proto;
2459 skb->dev = dev;
2460 skb->priority = po->sk.sk_priority;
2d37a186 2461 skb->mark = po->sk.sk_mark;
3d0ba8c0 2462 skb->tstamp = sockc->transmit_time;
c14ac945 2463 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
5cd8d46e 2464 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2465
ae641949 2466 skb_reserve(skb, hlen);
69e3c75f 2467 skb_reset_network_header(skb);
c1aad275 2468
69e3c75f
JB
2469 to_write = tp_len;
2470
2471 if (sock->type == SOCK_DGRAM) {
2472 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2473 NULL, tp_len);
2474 if (unlikely(err < 0))
2475 return -EINVAL;
1d036d25 2476 } else if (copylen) {
9ed988cd
WB
2477 int hdrlen = min_t(int, copylen, tp_len);
2478
69e3c75f 2479 skb_push(skb, dev->hard_header_len);
1d036d25 2480 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2481 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2482 if (unlikely(err))
2483 return err;
9ed988cd
WB
2484 if (!dev_validate_header(dev, skb->data, hdrlen))
2485 return -EINVAL;
c72219b7
DB
2486 if (!skb->protocol)
2487 tpacket_set_protocol(dev, skb);
69e3c75f 2488
9ed988cd
WB
2489 data += hdrlen;
2490 to_write -= hdrlen;
69e3c75f
JB
2491 }
2492
69e3c75f
JB
2493 offset = offset_in_page(data);
2494 len_max = PAGE_SIZE - offset;
2495 len = ((to_write > len_max) ? len_max : to_write);
2496
2497 skb->data_len = to_write;
2498 skb->len += to_write;
2499 skb->truesize += to_write;
14afee4b 2500 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2501
2502 while (likely(to_write)) {
2503 nr_frags = skb_shinfo(skb)->nr_frags;
2504
2505 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2506 pr_err("Packet exceed the number of skb frags(%lu)\n",
2507 MAX_SKB_FRAGS);
69e3c75f
JB
2508 return -EFAULT;
2509 }
2510
0af55bb5
CG
2511 page = pgv_to_page(data);
2512 data += len;
69e3c75f
JB
2513 flush_dcache_page(page);
2514 get_page(page);
0af55bb5 2515 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2516 to_write -= len;
2517 offset = 0;
2518 len_max = PAGE_SIZE;
2519 len = ((to_write > len_max) ? len_max : to_write);
2520 }
2521
8fd6c80d 2522 skb_probe_transport_header(skb, 0);
efdfa2f7 2523
69e3c75f
JB
2524 return tp_len;
2525}
2526
8d39b4a6
WB
2527static int tpacket_parse_header(struct packet_sock *po, void *frame,
2528 int size_max, void **data)
2529{
2530 union tpacket_uhdr ph;
2531 int tp_len, off;
2532
2533 ph.raw = frame;
2534
2535 switch (po->tp_version) {
7f953ab2
SV
2536 case TPACKET_V3:
2537 if (ph.h3->tp_next_offset != 0) {
2538 pr_warn_once("variable sized slot not supported");
2539 return -EINVAL;
2540 }
2541 tp_len = ph.h3->tp_len;
2542 break;
8d39b4a6
WB
2543 case TPACKET_V2:
2544 tp_len = ph.h2->tp_len;
2545 break;
2546 default:
2547 tp_len = ph.h1->tp_len;
2548 break;
2549 }
2550 if (unlikely(tp_len > size_max)) {
2551 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2552 return -EMSGSIZE;
2553 }
2554
2555 if (unlikely(po->tp_tx_has_off)) {
2556 int off_min, off_max;
2557
2558 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2559 off_max = po->tx_ring.frame_size - tp_len;
2560 if (po->sk.sk_type == SOCK_DGRAM) {
2561 switch (po->tp_version) {
7f953ab2
SV
2562 case TPACKET_V3:
2563 off = ph.h3->tp_net;
2564 break;
8d39b4a6
WB
2565 case TPACKET_V2:
2566 off = ph.h2->tp_net;
2567 break;
2568 default:
2569 off = ph.h1->tp_net;
2570 break;
2571 }
2572 } else {
2573 switch (po->tp_version) {
7f953ab2
SV
2574 case TPACKET_V3:
2575 off = ph.h3->tp_mac;
2576 break;
8d39b4a6
WB
2577 case TPACKET_V2:
2578 off = ph.h2->tp_mac;
2579 break;
2580 default:
2581 off = ph.h1->tp_mac;
2582 break;
2583 }
2584 }
2585 if (unlikely((off < off_min) || (off_max < off)))
2586 return -EINVAL;
2587 } else {
2588 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2589 }
2590
2591 *data = frame + off;
2592 return tp_len;
2593}
2594
69e3c75f
JB
2595static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2596{
69e3c75f
JB
2597 struct sk_buff *skb;
2598 struct net_device *dev;
1d036d25 2599 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2600 struct sockcm_cookie sockc;
69e3c75f 2601 __be16 proto;
09effa67 2602 int err, reserve = 0;
40d4e3df 2603 void *ph;
342dfc30 2604 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2605 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2606 int tp_len, size_max;
2607 unsigned char *addr;
8d39b4a6 2608 void *data;
69e3c75f 2609 int len_sum = 0;
9e67030a 2610 int status = TP_STATUS_AVAILABLE;
1d036d25 2611 int hlen, tlen, copylen = 0;
69e3c75f 2612
69e3c75f
JB
2613 mutex_lock(&po->pg_vec_lock);
2614
66e56cd4 2615 if (likely(saddr == NULL)) {
e40526cb 2616 dev = packet_cached_dev_get(po);
69e3c75f
JB
2617 proto = po->num;
2618 addr = NULL;
2619 } else {
2620 err = -EINVAL;
2621 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2622 goto out;
2623 if (msg->msg_namelen < (saddr->sll_halen
2624 + offsetof(struct sockaddr_ll,
2625 sll_addr)))
2626 goto out;
69e3c75f
JB
2627 proto = saddr->sll_protocol;
2628 addr = saddr->sll_addr;
827d9780 2629 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2630 }
2631
69e3c75f
JB
2632 err = -ENXIO;
2633 if (unlikely(dev == NULL))
2634 goto out;
69e3c75f
JB
2635 err = -ENETDOWN;
2636 if (unlikely(!(dev->flags & IFF_UP)))
2637 goto out_put;
2638
657a0667 2639 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2640 if (msg->msg_controllen) {
2641 err = sock_cmsg_send(&po->sk, msg, &sockc);
2642 if (unlikely(err))
2643 goto out_put;
2644 }
2645
5cfb4c8d
DB
2646 if (po->sk.sk_socket->type == SOCK_RAW)
2647 reserve = dev->hard_header_len;
69e3c75f 2648 size_max = po->tx_ring.frame_size
b5dd884e 2649 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2650
1d036d25 2651 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2652 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2653
69e3c75f
JB
2654 do {
2655 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2656 TP_STATUS_SEND_REQUEST);
69e3c75f 2657 if (unlikely(ph == NULL)) {
87a2fd28
DB
2658 if (need_wait && need_resched())
2659 schedule();
69e3c75f
JB
2660 continue;
2661 }
2662
8d39b4a6
WB
2663 skb = NULL;
2664 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2665 if (tp_len < 0)
2666 goto tpacket_error;
2667
69e3c75f 2668 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2669 hlen = LL_RESERVED_SPACE(dev);
2670 tlen = dev->needed_tailroom;
1d036d25
WB
2671 if (po->has_vnet_hdr) {
2672 vnet_hdr = data;
2673 data += sizeof(*vnet_hdr);
2674 tp_len -= sizeof(*vnet_hdr);
2675 if (tp_len < 0 ||
2676 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2677 tp_len = -EINVAL;
2678 goto tpacket_error;
2679 }
2680 copylen = __virtio16_to_cpu(vio_le(),
2681 vnet_hdr->hdr_len);
2682 }
9ed988cd 2683 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2684 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2685 hlen + tlen + sizeof(struct sockaddr_ll) +
2686 (copylen - dev->hard_header_len),
fbf33a28 2687 !need_wait, &err);
69e3c75f 2688
fbf33a28
KM
2689 if (unlikely(skb == NULL)) {
2690 /* we assume the socket was initially writeable ... */
2691 if (likely(len_sum > 0))
2692 err = len_sum;
69e3c75f 2693 goto out_status;
fbf33a28 2694 }
8d39b4a6 2695 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2696 addr, hlen, copylen, &sockc);
dbd46ab4 2697 if (likely(tp_len >= 0) &&
5cfb4c8d 2698 tp_len > dev->mtu + reserve &&
1d036d25 2699 !po->has_vnet_hdr &&
3c70c132
DB
2700 !packet_extra_vlan_len_allowed(dev, skb))
2701 tp_len = -EMSGSIZE;
69e3c75f
JB
2702
2703 if (unlikely(tp_len < 0)) {
8d39b4a6 2704tpacket_error:
69e3c75f
JB
2705 if (po->tp_loss) {
2706 __packet_set_status(po, ph,
2707 TP_STATUS_AVAILABLE);
2708 packet_increment_head(&po->tx_ring);
2709 kfree_skb(skb);
2710 continue;
2711 } else {
2712 status = TP_STATUS_WRONG_FORMAT;
2713 err = tp_len;
2714 goto out_status;
2715 }
2716 }
2717
9d2f67e4
JT
2718 if (po->has_vnet_hdr) {
2719 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2720 tp_len = -EINVAL;
2721 goto tpacket_error;
2722 }
2723 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2724 }
2725
69e3c75f
JB
2726 skb->destructor = tpacket_destruct_skb;
2727 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2728 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2729
2730 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2731 err = po->xmit(skb);
eb70df13
JP
2732 if (unlikely(err > 0)) {
2733 err = net_xmit_errno(err);
2734 if (err && __packet_get_status(po, ph) ==
2735 TP_STATUS_AVAILABLE) {
2736 /* skb was destructed already */
2737 skb = NULL;
2738 goto out_status;
2739 }
2740 /*
2741 * skb was dropped but not destructed yet;
2742 * let's treat it like congestion or err < 0
2743 */
2744 err = 0;
2745 }
69e3c75f
JB
2746 packet_increment_head(&po->tx_ring);
2747 len_sum += tp_len;
b0138408
DB
2748 } while (likely((ph != NULL) ||
2749 /* Note: packet_read_pending() might be slow if we have
2750 * to call it as it's per_cpu variable, but in fast-path
2751 * we already short-circuit the loop with the first
2752 * condition, and luckily don't have to go that path
2753 * anyway.
2754 */
2755 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2756
2757 err = len_sum;
2758 goto out_put;
2759
69e3c75f
JB
2760out_status:
2761 __packet_set_status(po, ph, status);
2762 kfree_skb(skb);
2763out_put:
e40526cb 2764 dev_put(dev);
69e3c75f
JB
2765out:
2766 mutex_unlock(&po->pg_vec_lock);
2767 return err;
2768}
69e3c75f 2769
eea49cc9
OJ
2770static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2771 size_t reserve, size_t len,
2772 size_t linear, int noblock,
2773 int *err)
bfd5f4a3
SS
2774{
2775 struct sk_buff *skb;
2776
2777 /* Under a page? Don't bother with paged skb. */
2778 if (prepad + len < PAGE_SIZE || !linear)
2779 linear = len;
2780
2781 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2782 err, 0);
bfd5f4a3
SS
2783 if (!skb)
2784 return NULL;
2785
2786 skb_reserve(skb, reserve);
2787 skb_put(skb, linear);
2788 skb->data_len = len - linear;
2789 skb->len += len - linear;
2790
2791 return skb;
2792}
2793
d346a3fa 2794static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2795{
2796 struct sock *sk = sock->sk;
342dfc30 2797 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2798 struct sk_buff *skb;
2799 struct net_device *dev;
0e11c91e 2800 __be16 proto;
1da177e4 2801 unsigned char *addr;
827d9780 2802 int err, reserve = 0;
c7d39e32 2803 struct sockcm_cookie sockc;
bfd5f4a3
SS
2804 struct virtio_net_hdr vnet_hdr = { 0 };
2805 int offset = 0;
bfd5f4a3 2806 struct packet_sock *po = pkt_sk(sk);
da7c9561 2807 bool has_vnet_hdr = false;
57031eb7 2808 int hlen, tlen, linear;
3bdc0eba 2809 int extra_len = 0;
1da177e4
LT
2810
2811 /*
1ce4f28b 2812 * Get and verify the address.
1da177e4 2813 */
1ce4f28b 2814
66e56cd4 2815 if (likely(saddr == NULL)) {
e40526cb 2816 dev = packet_cached_dev_get(po);
1da177e4
LT
2817 proto = po->num;
2818 addr = NULL;
2819 } else {
2820 err = -EINVAL;
2821 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2822 goto out;
0fb375fb
EB
2823 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2824 goto out;
1da177e4
LT
2825 proto = saddr->sll_protocol;
2826 addr = saddr->sll_addr;
827d9780 2827 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2828 }
2829
1da177e4 2830 err = -ENXIO;
e40526cb 2831 if (unlikely(dev == NULL))
1da177e4 2832 goto out_unlock;
d5e76b0a 2833 err = -ENETDOWN;
e40526cb 2834 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2835 goto out_unlock;
2836
657a0667 2837 sockcm_init(&sockc, sk);
c7d39e32
EJ
2838 sockc.mark = sk->sk_mark;
2839 if (msg->msg_controllen) {
2840 err = sock_cmsg_send(sk, msg, &sockc);
2841 if (unlikely(err))
2842 goto out_unlock;
2843 }
2844
e40526cb
DB
2845 if (sock->type == SOCK_RAW)
2846 reserve = dev->hard_header_len;
bfd5f4a3 2847 if (po->has_vnet_hdr) {
16cc1400
WB
2848 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2849 if (err)
bfd5f4a3 2850 goto out_unlock;
da7c9561 2851 has_vnet_hdr = true;
bfd5f4a3
SS
2852 }
2853
3bdc0eba
BG
2854 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2855 if (!netif_supports_nofcs(dev)) {
2856 err = -EPROTONOSUPPORT;
2857 goto out_unlock;
2858 }
2859 extra_len = 4; /* We're doing our own CRC */
2860 }
2861
1da177e4 2862 err = -EMSGSIZE;
16cc1400
WB
2863 if (!vnet_hdr.gso_type &&
2864 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2865 goto out_unlock;
2866
bfd5f4a3 2867 err = -ENOBUFS;
ae641949
HX
2868 hlen = LL_RESERVED_SPACE(dev);
2869 tlen = dev->needed_tailroom;
57031eb7
WB
2870 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2871 linear = max(linear, min_t(int, len, dev->hard_header_len));
2872 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2873 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2874 if (skb == NULL)
1da177e4
LT
2875 goto out_unlock;
2876
b84bbaf7 2877 skb_reset_network_header(skb);
1da177e4 2878
0c4e8581 2879 err = -EINVAL;
9c707762
WB
2880 if (sock->type == SOCK_DGRAM) {
2881 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2882 if (unlikely(offset < 0))
9c707762 2883 goto out_free;
b84bbaf7 2884 } else if (reserve) {
9aad13b0 2885 skb_reserve(skb, -reserve);
993675a3
WB
2886 if (len < reserve)
2887 skb_reset_network_header(skb);
9c707762 2888 }
1da177e4
LT
2889
2890 /* Returns -EFAULT on error */
c0371da6 2891 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2892 if (err)
2893 goto out_free;
bf84a010 2894
9ed988cd
WB
2895 if (sock->type == SOCK_RAW &&
2896 !dev_validate_header(dev, skb->data, len)) {
2897 err = -EINVAL;
2898 goto out_free;
2899 }
2900
c14ac945 2901 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2902
16cc1400 2903 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2904 !packet_extra_vlan_len_allowed(dev, skb)) {
2905 err = -EMSGSIZE;
2906 goto out_free;
57f89bfa
BG
2907 }
2908
09effa67
DM
2909 skb->protocol = proto;
2910 skb->dev = dev;
1da177e4 2911 skb->priority = sk->sk_priority;
c7d39e32 2912 skb->mark = sockc.mark;
3d0ba8c0 2913 skb->tstamp = sockc.transmit_time;
0fd5d57b 2914
da7c9561 2915 if (has_vnet_hdr) {
db60eb5f 2916 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2917 if (err)
2918 goto out_free;
2919 len += sizeof(vnet_hdr);
9d2f67e4 2920 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
2921 }
2922
8fd6c80d
DB
2923 skb_probe_transport_header(skb, reserve);
2924
3bdc0eba
BG
2925 if (unlikely(extra_len == 4))
2926 skb->no_fcs = 1;
2927
d346a3fa 2928 err = po->xmit(skb);
1da177e4
LT
2929 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2930 goto out_unlock;
2931
e40526cb 2932 dev_put(dev);
1da177e4 2933
40d4e3df 2934 return len;
1da177e4
LT
2935
2936out_free:
2937 kfree_skb(skb);
2938out_unlock:
e40526cb 2939 if (dev)
1da177e4
LT
2940 dev_put(dev);
2941out:
2942 return err;
2943}
2944
1b784140 2945static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2946{
69e3c75f
JB
2947 struct sock *sk = sock->sk;
2948 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2949
69e3c75f
JB
2950 if (po->tx_ring.pg_vec)
2951 return tpacket_snd(po, msg);
2952 else
69e3c75f
JB
2953 return packet_snd(sock, msg, len);
2954}
2955
1da177e4
LT
2956/*
2957 * Close a PACKET socket. This is fairly simple. We immediately go
2958 * to 'closed' state and remove our protocol entry in the device list.
2959 */
2960
2961static int packet_release(struct socket *sock)
2962{
2963 struct sock *sk = sock->sk;
2964 struct packet_sock *po;
2bd624b4 2965 struct packet_fanout *f;
d12d01d6 2966 struct net *net;
f6fb8f10 2967 union tpacket_req_u req_u;
1da177e4
LT
2968
2969 if (!sk)
2970 return 0;
2971
3b1e0a65 2972 net = sock_net(sk);
1da177e4
LT
2973 po = pkt_sk(sk);
2974
0fa7fa98 2975 mutex_lock(&net->packet.sklist_lock);
808f5114 2976 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2977 mutex_unlock(&net->packet.sklist_lock);
2978
2979 preempt_disable();
920de804 2980 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2981 preempt_enable();
1da177e4 2982
808f5114 2983 spin_lock(&po->bind_lock);
ce06b03e 2984 unregister_prot_hook(sk, false);
66e56cd4
DB
2985 packet_cached_dev_reset(po);
2986
160ff18a
BG
2987 if (po->prot_hook.dev) {
2988 dev_put(po->prot_hook.dev);
2989 po->prot_hook.dev = NULL;
2990 }
808f5114 2991 spin_unlock(&po->bind_lock);
1da177e4 2992
1da177e4 2993 packet_flush_mclist(sk);
1da177e4 2994
5171b37d 2995 lock_sock(sk);
9665d5d6
PS
2996 if (po->rx_ring.pg_vec) {
2997 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2998 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2999 }
69e3c75f 3000
9665d5d6
PS
3001 if (po->tx_ring.pg_vec) {
3002 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3003 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3004 }
5171b37d 3005 release_sock(sk);
1da177e4 3006
2bd624b4 3007 f = fanout_release(sk);
dc99f600 3008
808f5114 3009 synchronize_net();
2bd624b4
AS
3010
3011 if (f) {
57f015f5 3012 kfree(po->rollover);
2bd624b4
AS
3013 fanout_release_data(f);
3014 kfree(f);
3015 }
1da177e4
LT
3016 /*
3017 * Now the socket is dead. No more input will appear.
3018 */
1da177e4
LT
3019 sock_orphan(sk);
3020 sock->sk = NULL;
3021
3022 /* Purge queues */
3023
3024 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3025 packet_free_pending(po);
17ab56a2 3026 sk_refcnt_debug_release(sk);
1da177e4
LT
3027
3028 sock_put(sk);
3029 return 0;
3030}
3031
3032/*
3033 * Attach a packet hook.
3034 */
3035
30f7ea1c
FR
3036static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3037 __be16 proto)
1da177e4
LT
3038{
3039 struct packet_sock *po = pkt_sk(sk);
158cd4af 3040 struct net_device *dev_curr;
902fefb8
DB
3041 __be16 proto_curr;
3042 bool need_rehook;
30f7ea1c
FR
3043 struct net_device *dev = NULL;
3044 int ret = 0;
3045 bool unlisted = false;
dc99f600 3046
1da177e4 3047 lock_sock(sk);
1da177e4 3048 spin_lock(&po->bind_lock);
30f7ea1c
FR
3049 rcu_read_lock();
3050
4971613c
WB
3051 if (po->fanout) {
3052 ret = -EINVAL;
3053 goto out_unlock;
3054 }
3055
30f7ea1c
FR
3056 if (name) {
3057 dev = dev_get_by_name_rcu(sock_net(sk), name);
3058 if (!dev) {
3059 ret = -ENODEV;
3060 goto out_unlock;
3061 }
3062 } else if (ifindex) {
3063 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3064 if (!dev) {
3065 ret = -ENODEV;
3066 goto out_unlock;
3067 }
3068 }
3069
3070 if (dev)
3071 dev_hold(dev);
66e56cd4 3072
902fefb8
DB
3073 proto_curr = po->prot_hook.type;
3074 dev_curr = po->prot_hook.dev;
3075
3076 need_rehook = proto_curr != proto || dev_curr != dev;
3077
3078 if (need_rehook) {
30f7ea1c
FR
3079 if (po->running) {
3080 rcu_read_unlock();
15fe076e
ED
3081 /* prevents packet_notifier() from calling
3082 * register_prot_hook()
3083 */
3084 po->num = 0;
30f7ea1c
FR
3085 __unregister_prot_hook(sk, true);
3086 rcu_read_lock();
3087 dev_curr = po->prot_hook.dev;
3088 if (dev)
3089 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3090 dev->ifindex);
3091 }
1da177e4 3092
15fe076e 3093 BUG_ON(po->running);
902fefb8
DB
3094 po->num = proto;
3095 po->prot_hook.type = proto;
902fefb8 3096
30f7ea1c
FR
3097 if (unlikely(unlisted)) {
3098 dev_put(dev);
3099 po->prot_hook.dev = NULL;
3100 po->ifindex = -1;
3101 packet_cached_dev_reset(po);
3102 } else {
3103 po->prot_hook.dev = dev;
3104 po->ifindex = dev ? dev->ifindex : 0;
3105 packet_cached_dev_assign(po, dev);
3106 }
902fefb8 3107 }
158cd4af
LW
3108 if (dev_curr)
3109 dev_put(dev_curr);
66e56cd4 3110
902fefb8 3111 if (proto == 0 || !need_rehook)
1da177e4
LT
3112 goto out_unlock;
3113
30f7ea1c 3114 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3115 register_prot_hook(sk);
be85d4ad
UT
3116 } else {
3117 sk->sk_err = ENETDOWN;
3118 if (!sock_flag(sk, SOCK_DEAD))
3119 sk->sk_error_report(sk);
1da177e4
LT
3120 }
3121
3122out_unlock:
30f7ea1c 3123 rcu_read_unlock();
1da177e4
LT
3124 spin_unlock(&po->bind_lock);
3125 release_sock(sk);
30f7ea1c 3126 return ret;
1da177e4
LT
3127}
3128
3129/*
3130 * Bind a packet socket to a device
3131 */
3132
40d4e3df
ED
3133static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3134 int addr_len)
1da177e4 3135{
40d4e3df 3136 struct sock *sk = sock->sk;
540e2894 3137 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3138
1da177e4
LT
3139 /*
3140 * Check legality
3141 */
1ce4f28b 3142
8ae55f04 3143 if (addr_len != sizeof(struct sockaddr))
1da177e4 3144 return -EINVAL;
540e2894
AP
3145 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3146 * zero-terminated.
3147 */
3148 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3149 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3150
30f7ea1c 3151 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3152}
1da177e4
LT
3153
3154static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3155{
40d4e3df
ED
3156 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3157 struct sock *sk = sock->sk;
1da177e4
LT
3158
3159 /*
3160 * Check legality
3161 */
1ce4f28b 3162
1da177e4
LT
3163 if (addr_len < sizeof(struct sockaddr_ll))
3164 return -EINVAL;
3165 if (sll->sll_family != AF_PACKET)
3166 return -EINVAL;
3167
30f7ea1c
FR
3168 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3169 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3170}
3171
3172static struct proto packet_proto = {
3173 .name = "PACKET",
3174 .owner = THIS_MODULE,
3175 .obj_size = sizeof(struct packet_sock),
3176};
3177
3178/*
1ce4f28b 3179 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3180 */
3181
3f378b68
EP
3182static int packet_create(struct net *net, struct socket *sock, int protocol,
3183 int kern)
1da177e4
LT
3184{
3185 struct sock *sk;
3186 struct packet_sock *po;
0e11c91e 3187 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3188 int err;
3189
df008c91 3190 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3191 return -EPERM;
be02097c
DM
3192 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3193 sock->type != SOCK_PACKET)
1da177e4
LT
3194 return -ESOCKTNOSUPPORT;
3195
3196 sock->state = SS_UNCONNECTED;
3197
3198 err = -ENOBUFS;
11aa9c28 3199 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3200 if (sk == NULL)
3201 goto out;
3202
3203 sock->ops = &packet_ops;
1da177e4
LT
3204 if (sock->type == SOCK_PACKET)
3205 sock->ops = &packet_ops_spkt;
be02097c 3206
1da177e4
LT
3207 sock_init_data(sock, sk);
3208
3209 po = pkt_sk(sk);
3210 sk->sk_family = PF_PACKET;
0e11c91e 3211 po->num = proto;
d346a3fa 3212 po->xmit = dev_queue_xmit;
66e56cd4 3213
b0138408
DB
3214 err = packet_alloc_pending(po);
3215 if (err)
3216 goto out2;
3217
66e56cd4 3218 packet_cached_dev_reset(po);
1da177e4
LT
3219
3220 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3221 sk_refcnt_debug_inc(sk);
1da177e4
LT
3222
3223 /*
3224 * Attach a protocol block
3225 */
3226
3227 spin_lock_init(&po->bind_lock);
905db440 3228 mutex_init(&po->pg_vec_lock);
0648ab70 3229 po->rollover = NULL;
1da177e4 3230 po->prot_hook.func = packet_rcv;
be02097c 3231
1da177e4
LT
3232 if (sock->type == SOCK_PACKET)
3233 po->prot_hook.func = packet_rcv_spkt;
be02097c 3234
1da177e4
LT
3235 po->prot_hook.af_packet_priv = sk;
3236
0e11c91e
AV
3237 if (proto) {
3238 po->prot_hook.type = proto;
a6361f0c 3239 __register_prot_hook(sk);
1da177e4
LT
3240 }
3241
0fa7fa98 3242 mutex_lock(&net->packet.sklist_lock);
808f5114 3243 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3244 mutex_unlock(&net->packet.sklist_lock);
3245
3246 preempt_disable();
3680453c 3247 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3248 preempt_enable();
808f5114 3249
40d4e3df 3250 return 0;
b0138408
DB
3251out2:
3252 sk_free(sk);
1da177e4
LT
3253out:
3254 return err;
3255}
3256
3257/*
3258 * Pull a packet from our receive queue and hand it to the user.
3259 * If necessary we block.
3260 */
3261
1b784140
YX
3262static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3263 int flags)
1da177e4
LT
3264{
3265 struct sock *sk = sock->sk;
3266 struct sk_buff *skb;
3267 int copied, err;
bfd5f4a3 3268 int vnet_hdr_len = 0;
2472d761 3269 unsigned int origlen = 0;
1da177e4
LT
3270
3271 err = -EINVAL;
ed85b565 3272 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3273 goto out;
3274
3275#if 0
3276 /* What error should we return now? EUNATTACH? */
3277 if (pkt_sk(sk)->ifindex < 0)
3278 return -ENODEV;
3279#endif
3280
ed85b565 3281 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3282 err = sock_recv_errqueue(sk, msg, len,
3283 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3284 goto out;
3285 }
3286
1da177e4
LT
3287 /*
3288 * Call the generic datagram receiver. This handles all sorts
3289 * of horrible races and re-entrancy so we can forget about it
3290 * in the protocol layers.
3291 *
3292 * Now it will return ENETDOWN, if device have just gone down,
3293 * but then it will block.
3294 */
3295
40d4e3df 3296 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3297
3298 /*
1ce4f28b 3299 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3300 * handles the blocking we don't see and worry about blocking
3301 * retries.
3302 */
3303
8ae55f04 3304 if (skb == NULL)
1da177e4
LT
3305 goto out;
3306
2ccdbaa6
WB
3307 if (pkt_sk(sk)->pressure)
3308 packet_rcv_has_room(pkt_sk(sk), NULL);
3309
bfd5f4a3 3310 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3311 err = packet_rcv_vnet(msg, skb, &len);
3312 if (err)
bfd5f4a3 3313 goto out_free;
16cc1400 3314 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3315 }
3316
f3d33426
HFS
3317 /* You lose any data beyond the buffer you gave. If it worries
3318 * a user program they can ask the device for its MTU
3319 * anyway.
1da177e4 3320 */
1da177e4 3321 copied = skb->len;
40d4e3df
ED
3322 if (copied > len) {
3323 copied = len;
3324 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3325 }
3326
51f3d02b 3327 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3328 if (err)
3329 goto out_free;
3330
2472d761
EB
3331 if (sock->type != SOCK_PACKET) {
3332 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3333
3334 /* Original length was stored in sockaddr_ll fields */
3335 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3336 sll->sll_family = AF_PACKET;
3337 sll->sll_protocol = skb->protocol;
3338 }
3339
3b885787 3340 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3341
f3d33426
HFS
3342 if (msg->msg_name) {
3343 /* If the address length field is there to be filled
3344 * in, we fill it in now.
3345 */
3346 if (sock->type == SOCK_PACKET) {
342dfc30 3347 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3348 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3349 } else {
3350 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3351
f3d33426
HFS
3352 msg->msg_namelen = sll->sll_halen +
3353 offsetof(struct sockaddr_ll, sll_addr);
3354 }
ffbc6111
HX
3355 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3356 msg->msg_namelen);
f3d33426 3357 }
1da177e4 3358
8dc41944 3359 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3360 struct tpacket_auxdata aux;
3361
3362 aux.tp_status = TP_STATUS_USER;
3363 if (skb->ip_summed == CHECKSUM_PARTIAL)
3364 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3365 else if (skb->pkt_type != PACKET_OUTGOING &&
3366 (skb->ip_summed == CHECKSUM_COMPLETE ||
3367 skb_csum_unnecessary(skb)))
3368 aux.tp_status |= TP_STATUS_CSUM_VALID;
3369
2472d761 3370 aux.tp_len = origlen;
ffbc6111
HX
3371 aux.tp_snaplen = skb->len;
3372 aux.tp_mac = 0;
bbe735e4 3373 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3374 if (skb_vlan_tag_present(skb)) {
3375 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3376 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3377 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3378 } else {
3379 aux.tp_vlan_tci = 0;
a0cdfcf3 3380 aux.tp_vlan_tpid = 0;
a3bcc23e 3381 }
ffbc6111 3382 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3383 }
3384
1da177e4
LT
3385 /*
3386 * Free or return the buffer as appropriate. Again this
3387 * hides all the races and re-entrancy issues from us.
3388 */
bfd5f4a3 3389 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3390
3391out_free:
3392 skb_free_datagram(sk, skb);
3393out:
3394 return err;
3395}
3396
1da177e4 3397static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3398 int peer)
1da177e4
LT
3399{
3400 struct net_device *dev;
3401 struct sock *sk = sock->sk;
3402
3403 if (peer)
3404 return -EOPNOTSUPP;
3405
3406 uaddr->sa_family = AF_PACKET;
2dc85bf3 3407 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3408 rcu_read_lock();
3409 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3410 if (dev)
2dc85bf3 3411 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3412 rcu_read_unlock();
1da177e4 3413
9b2c45d4 3414 return sizeof(*uaddr);
1da177e4 3415}
1da177e4
LT
3416
3417static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3418 int peer)
1da177e4
LT
3419{
3420 struct net_device *dev;
3421 struct sock *sk = sock->sk;
3422 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3423 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3424
3425 if (peer)
3426 return -EOPNOTSUPP;
3427
3428 sll->sll_family = AF_PACKET;
3429 sll->sll_ifindex = po->ifindex;
3430 sll->sll_protocol = po->num;
67286640 3431 sll->sll_pkttype = 0;
654d1f8a
ED
3432 rcu_read_lock();
3433 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3434 if (dev) {
3435 sll->sll_hatype = dev->type;
3436 sll->sll_halen = dev->addr_len;
3437 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3438 } else {
3439 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3440 sll->sll_halen = 0;
3441 }
654d1f8a 3442 rcu_read_unlock();
1da177e4 3443
9b2c45d4 3444 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3445}
3446
2aeb0b88
WC
3447static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3448 int what)
1da177e4
LT
3449{
3450 switch (i->type) {
3451 case PACKET_MR_MULTICAST:
1162563f
JP
3452 if (i->alen != dev->addr_len)
3453 return -EINVAL;
1da177e4 3454 if (what > 0)
22bedad3 3455 return dev_mc_add(dev, i->addr);
1da177e4 3456 else
22bedad3 3457 return dev_mc_del(dev, i->addr);
1da177e4
LT
3458 break;
3459 case PACKET_MR_PROMISC:
2aeb0b88 3460 return dev_set_promiscuity(dev, what);
1da177e4 3461 case PACKET_MR_ALLMULTI:
2aeb0b88 3462 return dev_set_allmulti(dev, what);
d95ed927 3463 case PACKET_MR_UNICAST:
1162563f
JP
3464 if (i->alen != dev->addr_len)
3465 return -EINVAL;
d95ed927 3466 if (what > 0)
a748ee24 3467 return dev_uc_add(dev, i->addr);
d95ed927 3468 else
a748ee24 3469 return dev_uc_del(dev, i->addr);
d95ed927 3470 break;
40d4e3df
ED
3471 default:
3472 break;
1da177e4 3473 }
2aeb0b88 3474 return 0;
1da177e4
LT
3475}
3476
82f17091
FR
3477static void packet_dev_mclist_delete(struct net_device *dev,
3478 struct packet_mclist **mlp)
1da177e4 3479{
82f17091
FR
3480 struct packet_mclist *ml;
3481
3482 while ((ml = *mlp) != NULL) {
3483 if (ml->ifindex == dev->ifindex) {
3484 packet_dev_mc(dev, ml, -1);
3485 *mlp = ml->next;
3486 kfree(ml);
3487 } else
3488 mlp = &ml->next;
1da177e4
LT
3489 }
3490}
3491
0fb375fb 3492static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3493{
3494 struct packet_sock *po = pkt_sk(sk);
3495 struct packet_mclist *ml, *i;
3496 struct net_device *dev;
3497 int err;
3498
3499 rtnl_lock();
3500
3501 err = -ENODEV;
3b1e0a65 3502 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3503 if (!dev)
3504 goto done;
3505
3506 err = -EINVAL;
1162563f 3507 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3508 goto done;
3509
3510 err = -ENOBUFS;
8b3a7005 3511 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3512 if (i == NULL)
3513 goto done;
3514
3515 err = 0;
3516 for (ml = po->mclist; ml; ml = ml->next) {
3517 if (ml->ifindex == mreq->mr_ifindex &&
3518 ml->type == mreq->mr_type &&
3519 ml->alen == mreq->mr_alen &&
3520 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3521 ml->count++;
3522 /* Free the new element ... */
3523 kfree(i);
3524 goto done;
3525 }
3526 }
3527
3528 i->type = mreq->mr_type;
3529 i->ifindex = mreq->mr_ifindex;
3530 i->alen = mreq->mr_alen;
3531 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3532 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3533 i->count = 1;
3534 i->next = po->mclist;
3535 po->mclist = i;
2aeb0b88
WC
3536 err = packet_dev_mc(dev, i, 1);
3537 if (err) {
3538 po->mclist = i->next;
3539 kfree(i);
3540 }
1da177e4
LT
3541
3542done:
3543 rtnl_unlock();
3544 return err;
3545}
3546
0fb375fb 3547static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3548{
3549 struct packet_mclist *ml, **mlp;
3550
3551 rtnl_lock();
3552
3553 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3554 if (ml->ifindex == mreq->mr_ifindex &&
3555 ml->type == mreq->mr_type &&
3556 ml->alen == mreq->mr_alen &&
3557 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3558 if (--ml->count == 0) {
3559 struct net_device *dev;
3560 *mlp = ml->next;
ad959e76
ED
3561 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3562 if (dev)
1da177e4 3563 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3564 kfree(ml);
3565 }
82f17091 3566 break;
1da177e4
LT
3567 }
3568 }
3569 rtnl_unlock();
82f17091 3570 return 0;
1da177e4
LT
3571}
3572
3573static void packet_flush_mclist(struct sock *sk)
3574{
3575 struct packet_sock *po = pkt_sk(sk);
3576 struct packet_mclist *ml;
3577
3578 if (!po->mclist)
3579 return;
3580
3581 rtnl_lock();
3582 while ((ml = po->mclist) != NULL) {
3583 struct net_device *dev;
3584
3585 po->mclist = ml->next;
ad959e76
ED
3586 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3587 if (dev != NULL)
1da177e4 3588 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3589 kfree(ml);
3590 }
3591 rtnl_unlock();
3592}
1da177e4
LT
3593
3594static int
b7058842 3595packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3596{
3597 struct sock *sk = sock->sk;
8dc41944 3598 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3599 int ret;
3600
3601 if (level != SOL_PACKET)
3602 return -ENOPROTOOPT;
3603
69e3c75f 3604 switch (optname) {
1ce4f28b 3605 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3606 case PACKET_DROP_MEMBERSHIP:
3607 {
0fb375fb
EB
3608 struct packet_mreq_max mreq;
3609 int len = optlen;
3610 memset(&mreq, 0, sizeof(mreq));
3611 if (len < sizeof(struct packet_mreq))
1da177e4 3612 return -EINVAL;
0fb375fb
EB
3613 if (len > sizeof(mreq))
3614 len = sizeof(mreq);
40d4e3df 3615 if (copy_from_user(&mreq, optval, len))
1da177e4 3616 return -EFAULT;
0fb375fb
EB
3617 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3618 return -EINVAL;
1da177e4
LT
3619 if (optname == PACKET_ADD_MEMBERSHIP)
3620 ret = packet_mc_add(sk, &mreq);
3621 else
3622 ret = packet_mc_drop(sk, &mreq);
3623 return ret;
3624 }
a2efcfa0 3625
1da177e4 3626 case PACKET_RX_RING:
69e3c75f 3627 case PACKET_TX_RING:
1da177e4 3628 {
f6fb8f10 3629 union tpacket_req_u req_u;
3630 int len;
1da177e4 3631
5171b37d 3632 lock_sock(sk);
f6fb8f10 3633 switch (po->tp_version) {
3634 case TPACKET_V1:
3635 case TPACKET_V2:
3636 len = sizeof(req_u.req);
3637 break;
3638 case TPACKET_V3:
3639 default:
3640 len = sizeof(req_u.req3);
3641 break;
3642 }
5171b37d
ED
3643 if (optlen < len) {
3644 ret = -EINVAL;
3645 } else {
3646 if (copy_from_user(&req_u.req, optval, len))
3647 ret = -EFAULT;
3648 else
3649 ret = packet_set_ring(sk, &req_u, 0,
3650 optname == PACKET_TX_RING);
3651 }
3652 release_sock(sk);
3653 return ret;
1da177e4
LT
3654 }
3655 case PACKET_COPY_THRESH:
3656 {
3657 int val;
3658
40d4e3df 3659 if (optlen != sizeof(val))
1da177e4 3660 return -EINVAL;
40d4e3df 3661 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3662 return -EFAULT;
3663
3664 pkt_sk(sk)->copy_thresh = val;
3665 return 0;
3666 }
bbd6ef87
PM
3667 case PACKET_VERSION:
3668 {
3669 int val;
3670
3671 if (optlen != sizeof(val))
3672 return -EINVAL;
bbd6ef87
PM
3673 if (copy_from_user(&val, optval, sizeof(val)))
3674 return -EFAULT;
3675 switch (val) {
3676 case TPACKET_V1:
3677 case TPACKET_V2:
f6fb8f10 3678 case TPACKET_V3:
84ac7260 3679 break;
bbd6ef87
PM
3680 default:
3681 return -EINVAL;
3682 }
84ac7260
PP
3683 lock_sock(sk);
3684 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3685 ret = -EBUSY;
3686 } else {
3687 po->tp_version = val;
3688 ret = 0;
3689 }
3690 release_sock(sk);
3691 return ret;
bbd6ef87 3692 }
8913336a
PM
3693 case PACKET_RESERVE:
3694 {
3695 unsigned int val;
3696
3697 if (optlen != sizeof(val))
3698 return -EINVAL;
8913336a
PM
3699 if (copy_from_user(&val, optval, sizeof(val)))
3700 return -EFAULT;
bcc5364b
AK
3701 if (val > INT_MAX)
3702 return -EINVAL;
c27927e3
WB
3703 lock_sock(sk);
3704 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3705 ret = -EBUSY;
3706 } else {
3707 po->tp_reserve = val;
3708 ret = 0;
3709 }
3710 release_sock(sk);
3711 return ret;
8913336a 3712 }
69e3c75f
JB
3713 case PACKET_LOSS:
3714 {
3715 unsigned int val;
3716
3717 if (optlen != sizeof(val))
3718 return -EINVAL;
69e3c75f
JB
3719 if (copy_from_user(&val, optval, sizeof(val)))
3720 return -EFAULT;
a6361f0c
WB
3721
3722 lock_sock(sk);
3723 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3724 ret = -EBUSY;
3725 } else {
3726 po->tp_loss = !!val;
3727 ret = 0;
3728 }
3729 release_sock(sk);
3730 return ret;
69e3c75f 3731 }
8dc41944
HX
3732 case PACKET_AUXDATA:
3733 {
3734 int val;
3735
3736 if (optlen < sizeof(val))
3737 return -EINVAL;
3738 if (copy_from_user(&val, optval, sizeof(val)))
3739 return -EFAULT;
3740
a6361f0c 3741 lock_sock(sk);
8dc41944 3742 po->auxdata = !!val;
a6361f0c 3743 release_sock(sk);
8dc41944
HX
3744 return 0;
3745 }
80feaacb
PWJ
3746 case PACKET_ORIGDEV:
3747 {
3748 int val;
3749
3750 if (optlen < sizeof(val))
3751 return -EINVAL;
3752 if (copy_from_user(&val, optval, sizeof(val)))
3753 return -EFAULT;
3754
a6361f0c 3755 lock_sock(sk);
80feaacb 3756 po->origdev = !!val;
a6361f0c 3757 release_sock(sk);
80feaacb
PWJ
3758 return 0;
3759 }
bfd5f4a3
SS
3760 case PACKET_VNET_HDR:
3761 {
3762 int val;
3763
3764 if (sock->type != SOCK_RAW)
3765 return -EINVAL;
bfd5f4a3
SS
3766 if (optlen < sizeof(val))
3767 return -EINVAL;
3768 if (copy_from_user(&val, optval, sizeof(val)))
3769 return -EFAULT;
3770
a6361f0c
WB
3771 lock_sock(sk);
3772 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3773 ret = -EBUSY;
3774 } else {
3775 po->has_vnet_hdr = !!val;
3776 ret = 0;
3777 }
3778 release_sock(sk);
3779 return ret;
bfd5f4a3 3780 }
614f60fa
SM
3781 case PACKET_TIMESTAMP:
3782 {
3783 int val;
3784
3785 if (optlen != sizeof(val))
3786 return -EINVAL;
3787 if (copy_from_user(&val, optval, sizeof(val)))
3788 return -EFAULT;
3789
3790 po->tp_tstamp = val;
3791 return 0;
3792 }
dc99f600
DM
3793 case PACKET_FANOUT:
3794 {
3795 int val;
3796
3797 if (optlen != sizeof(val))
3798 return -EINVAL;
3799 if (copy_from_user(&val, optval, sizeof(val)))
3800 return -EFAULT;
3801
3802 return fanout_add(sk, val & 0xffff, val >> 16);
3803 }
47dceb8e
WB
3804 case PACKET_FANOUT_DATA:
3805 {
3806 if (!po->fanout)
3807 return -EINVAL;
3808
3809 return fanout_set_data(po, optval, optlen);
3810 }
fa788d98
VW
3811 case PACKET_IGNORE_OUTGOING:
3812 {
3813 int val;
3814
3815 if (optlen != sizeof(val))
3816 return -EINVAL;
3817 if (copy_from_user(&val, optval, sizeof(val)))
3818 return -EFAULT;
3819 if (val < 0 || val > 1)
3820 return -EINVAL;
3821
3822 po->prot_hook.ignore_outgoing = !!val;
3823 return 0;
3824 }
5920cd3a
PC
3825 case PACKET_TX_HAS_OFF:
3826 {
3827 unsigned int val;
3828
3829 if (optlen != sizeof(val))
3830 return -EINVAL;
5920cd3a
PC
3831 if (copy_from_user(&val, optval, sizeof(val)))
3832 return -EFAULT;
a6361f0c
WB
3833
3834 lock_sock(sk);
3835 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3836 ret = -EBUSY;
3837 } else {
3838 po->tp_tx_has_off = !!val;
3839 ret = 0;
3840 }
3841 release_sock(sk);
5920cd3a
PC
3842 return 0;
3843 }
d346a3fa
DB
3844 case PACKET_QDISC_BYPASS:
3845 {
3846 int val;
3847
3848 if (optlen != sizeof(val))
3849 return -EINVAL;
3850 if (copy_from_user(&val, optval, sizeof(val)))
3851 return -EFAULT;
3852
3853 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3854 return 0;
3855 }
1da177e4
LT
3856 default:
3857 return -ENOPROTOOPT;
3858 }
3859}
3860
3861static int packet_getsockopt(struct socket *sock, int level, int optname,
3862 char __user *optval, int __user *optlen)
3863{
3864 int len;
c06fff6e 3865 int val, lv = sizeof(val);
1da177e4
LT
3866 struct sock *sk = sock->sk;
3867 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3868 void *data = &val;
ee80fbf3 3869 union tpacket_stats_u st;
a9b63918 3870 struct tpacket_rollover_stats rstats;
1da177e4
LT
3871
3872 if (level != SOL_PACKET)
3873 return -ENOPROTOOPT;
3874
8ae55f04
KK
3875 if (get_user(len, optlen))
3876 return -EFAULT;
1da177e4
LT
3877
3878 if (len < 0)
3879 return -EINVAL;
1ce4f28b 3880
69e3c75f 3881 switch (optname) {
1da177e4 3882 case PACKET_STATISTICS:
1da177e4 3883 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3884 memcpy(&st, &po->stats, sizeof(st));
3885 memset(&po->stats, 0, sizeof(po->stats));
3886 spin_unlock_bh(&sk->sk_receive_queue.lock);
3887
f6fb8f10 3888 if (po->tp_version == TPACKET_V3) {
c06fff6e 3889 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3890 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3891 data = &st.stats3;
f6fb8f10 3892 } else {
c06fff6e 3893 lv = sizeof(struct tpacket_stats);
8bcdeaff 3894 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3895 data = &st.stats1;
f6fb8f10 3896 }
ee80fbf3 3897
8dc41944
HX
3898 break;
3899 case PACKET_AUXDATA:
8dc41944 3900 val = po->auxdata;
80feaacb
PWJ
3901 break;
3902 case PACKET_ORIGDEV:
80feaacb 3903 val = po->origdev;
bfd5f4a3
SS
3904 break;
3905 case PACKET_VNET_HDR:
bfd5f4a3 3906 val = po->has_vnet_hdr;
1da177e4 3907 break;
bbd6ef87 3908 case PACKET_VERSION:
bbd6ef87 3909 val = po->tp_version;
bbd6ef87
PM
3910 break;
3911 case PACKET_HDRLEN:
3912 if (len > sizeof(int))
3913 len = sizeof(int);
fd2c83b3
AP
3914 if (len < sizeof(int))
3915 return -EINVAL;
bbd6ef87
PM
3916 if (copy_from_user(&val, optval, len))
3917 return -EFAULT;
3918 switch (val) {
3919 case TPACKET_V1:
3920 val = sizeof(struct tpacket_hdr);
3921 break;
3922 case TPACKET_V2:
3923 val = sizeof(struct tpacket2_hdr);
3924 break;
f6fb8f10 3925 case TPACKET_V3:
3926 val = sizeof(struct tpacket3_hdr);
3927 break;
bbd6ef87
PM
3928 default:
3929 return -EINVAL;
3930 }
bbd6ef87 3931 break;
8913336a 3932 case PACKET_RESERVE:
8913336a 3933 val = po->tp_reserve;
8913336a 3934 break;
69e3c75f 3935 case PACKET_LOSS:
69e3c75f 3936 val = po->tp_loss;
69e3c75f 3937 break;
614f60fa 3938 case PACKET_TIMESTAMP:
614f60fa 3939 val = po->tp_tstamp;
614f60fa 3940 break;
dc99f600 3941 case PACKET_FANOUT:
dc99f600
DM
3942 val = (po->fanout ?
3943 ((u32)po->fanout->id |
77f65ebd
WB
3944 ((u32)po->fanout->type << 16) |
3945 ((u32)po->fanout->flags << 24)) :
dc99f600 3946 0);
dc99f600 3947 break;
fa788d98
VW
3948 case PACKET_IGNORE_OUTGOING:
3949 val = po->prot_hook.ignore_outgoing;
3950 break;
a9b63918 3951 case PACKET_ROLLOVER_STATS:
57f015f5 3952 if (!po->rollover)
a9b63918 3953 return -EINVAL;
57f015f5
MM
3954 rstats.tp_all = atomic_long_read(&po->rollover->num);
3955 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3956 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3957 data = &rstats;
3958 lv = sizeof(rstats);
a9b63918 3959 break;
5920cd3a
PC
3960 case PACKET_TX_HAS_OFF:
3961 val = po->tp_tx_has_off;
3962 break;
d346a3fa
DB
3963 case PACKET_QDISC_BYPASS:
3964 val = packet_use_direct_xmit(po);
3965 break;
1da177e4
LT
3966 default:
3967 return -ENOPROTOOPT;
3968 }
3969
c06fff6e
ED
3970 if (len > lv)
3971 len = lv;
8ae55f04
KK
3972 if (put_user(len, optlen))
3973 return -EFAULT;
8dc41944
HX
3974 if (copy_to_user(optval, data, len))
3975 return -EFAULT;
8ae55f04 3976 return 0;
1da177e4
LT
3977}
3978
3979
719c44d3
WB
3980#ifdef CONFIG_COMPAT
3981static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3982 char __user *optval, unsigned int optlen)
3983{
3984 struct packet_sock *po = pkt_sk(sock->sk);
3985
3986 if (level != SOL_PACKET)
3987 return -ENOPROTOOPT;
3988
3989 if (optname == PACKET_FANOUT_DATA &&
3990 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3991 optval = (char __user *)get_compat_bpf_fprog(optval);
3992 if (!optval)
3993 return -EFAULT;
3994 optlen = sizeof(struct sock_fprog);
3995 }
3996
3997 return packet_setsockopt(sock, level, optname, optval, optlen);
3998}
3999#endif
4000
351638e7
JP
4001static int packet_notifier(struct notifier_block *this,
4002 unsigned long msg, void *ptr)
1da177e4
LT
4003{
4004 struct sock *sk;
351638e7 4005 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4006 struct net *net = dev_net(dev);
1da177e4 4007
808f5114 4008 rcu_read_lock();
b67bfe0d 4009 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
4010 struct packet_sock *po = pkt_sk(sk);
4011
4012 switch (msg) {
4013 case NETDEV_UNREGISTER:
1da177e4 4014 if (po->mclist)
82f17091 4015 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
4016 /* fallthrough */
4017
1da177e4
LT
4018 case NETDEV_DOWN:
4019 if (dev->ifindex == po->ifindex) {
4020 spin_lock(&po->bind_lock);
4021 if (po->running) {
ce06b03e 4022 __unregister_prot_hook(sk, false);
1da177e4
LT
4023 sk->sk_err = ENETDOWN;
4024 if (!sock_flag(sk, SOCK_DEAD))
4025 sk->sk_error_report(sk);
4026 }
4027 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4028 packet_cached_dev_reset(po);
1da177e4 4029 po->ifindex = -1;
160ff18a
BG
4030 if (po->prot_hook.dev)
4031 dev_put(po->prot_hook.dev);
1da177e4
LT
4032 po->prot_hook.dev = NULL;
4033 }
4034 spin_unlock(&po->bind_lock);
4035 }
4036 break;
4037 case NETDEV_UP:
808f5114 4038 if (dev->ifindex == po->ifindex) {
4039 spin_lock(&po->bind_lock);
ce06b03e
DM
4040 if (po->num)
4041 register_prot_hook(sk);
808f5114 4042 spin_unlock(&po->bind_lock);
1da177e4 4043 }
1da177e4
LT
4044 break;
4045 }
4046 }
808f5114 4047 rcu_read_unlock();
1da177e4
LT
4048 return NOTIFY_DONE;
4049}
4050
4051
4052static int packet_ioctl(struct socket *sock, unsigned int cmd,
4053 unsigned long arg)
4054{
4055 struct sock *sk = sock->sk;
4056
69e3c75f 4057 switch (cmd) {
40d4e3df
ED
4058 case SIOCOUTQ:
4059 {
4060 int amount = sk_wmem_alloc_get(sk);
31e6d363 4061
40d4e3df
ED
4062 return put_user(amount, (int __user *)arg);
4063 }
4064 case SIOCINQ:
4065 {
4066 struct sk_buff *skb;
4067 int amount = 0;
4068
4069 spin_lock_bh(&sk->sk_receive_queue.lock);
4070 skb = skb_peek(&sk->sk_receive_queue);
4071 if (skb)
4072 amount = skb->len;
4073 spin_unlock_bh(&sk->sk_receive_queue.lock);
4074 return put_user(amount, (int __user *)arg);
4075 }
4076 case SIOCGSTAMP:
4077 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4078 case SIOCGSTAMPNS:
4079 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 4080
1da177e4 4081#ifdef CONFIG_INET
40d4e3df
ED
4082 case SIOCADDRT:
4083 case SIOCDELRT:
4084 case SIOCDARP:
4085 case SIOCGARP:
4086 case SIOCSARP:
4087 case SIOCGIFADDR:
4088 case SIOCSIFADDR:
4089 case SIOCGIFBRDADDR:
4090 case SIOCSIFBRDADDR:
4091 case SIOCGIFNETMASK:
4092 case SIOCSIFNETMASK:
4093 case SIOCGIFDSTADDR:
4094 case SIOCSIFDSTADDR:
4095 case SIOCSIFFLAGS:
40d4e3df 4096 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4097#endif
4098
40d4e3df
ED
4099 default:
4100 return -ENOIOCTLCMD;
1da177e4
LT
4101 }
4102 return 0;
4103}
4104
a11e1d43
LT
4105static __poll_t packet_poll(struct file *file, struct socket *sock,
4106 poll_table *wait)
1da177e4
LT
4107{
4108 struct sock *sk = sock->sk;
4109 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4110 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4111
4112 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4113 if (po->rx_ring.pg_vec) {
f6fb8f10 4114 if (!packet_previous_rx_frame(po, &po->rx_ring,
4115 TP_STATUS_KERNEL))
a9a08845 4116 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4117 }
2ccdbaa6 4118 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4119 po->pressure = 0;
1da177e4 4120 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4121 spin_lock_bh(&sk->sk_write_queue.lock);
4122 if (po->tx_ring.pg_vec) {
4123 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4124 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4125 }
4126 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4127 return mask;
4128}
4129
4130
4131/* Dirty? Well, I still did not learn better way to account
4132 * for user mmaps.
4133 */
4134
4135static void packet_mm_open(struct vm_area_struct *vma)
4136{
4137 struct file *file = vma->vm_file;
40d4e3df 4138 struct socket *sock = file->private_data;
1da177e4 4139 struct sock *sk = sock->sk;
1ce4f28b 4140
1da177e4
LT
4141 if (sk)
4142 atomic_inc(&pkt_sk(sk)->mapped);
4143}
4144
4145static void packet_mm_close(struct vm_area_struct *vma)
4146{
4147 struct file *file = vma->vm_file;
40d4e3df 4148 struct socket *sock = file->private_data;
1da177e4 4149 struct sock *sk = sock->sk;
1ce4f28b 4150
1da177e4
LT
4151 if (sk)
4152 atomic_dec(&pkt_sk(sk)->mapped);
4153}
4154
f0f37e2f 4155static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4156 .open = packet_mm_open,
4157 .close = packet_mm_close,
1da177e4
LT
4158};
4159
3a7ad063
ED
4160static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4161 unsigned int len)
1da177e4
LT
4162{
4163 int i;
4164
4ebf0ae2 4165 for (i = 0; i < len; i++) {
0e3125c7 4166 if (likely(pg_vec[i].buffer)) {
3a7ad063
ED
4167 if (is_vmalloc_addr(pg_vec[i].buffer))
4168 vfree(pg_vec[i].buffer);
4169 else
4170 free_pages((unsigned long)pg_vec[i].buffer,
4171 order);
0e3125c7
NH
4172 pg_vec[i].buffer = NULL;
4173 }
1da177e4
LT
4174 }
4175 kfree(pg_vec);
4176}
4177
3a7ad063 4178static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4179{
f0d4eb29 4180 char *buffer;
3a7ad063
ED
4181 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4182 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4183
3a7ad063 4184 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4185 if (buffer)
4186 return buffer;
4187
3a7ad063
ED
4188 /* __get_free_pages failed, fall back to vmalloc */
4189 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4190 if (buffer)
4191 return buffer;
0e3125c7 4192
3a7ad063
ED
4193 /* vmalloc failed, lets dig into swap here */
4194 gfp_flags &= ~__GFP_NORETRY;
4195 buffer = (char *) __get_free_pages(gfp_flags, order);
4196 if (buffer)
4197 return buffer;
4198
4199 /* complete and utter failure */
4200 return NULL;
4ebf0ae2
DM
4201}
4202
3a7ad063 4203static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4204{
4205 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4206 struct pgv *pg_vec;
4ebf0ae2
DM
4207 int i;
4208
0e3125c7 4209 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4210 if (unlikely(!pg_vec))
4211 goto out;
4212
4213 for (i = 0; i < block_nr; i++) {
3a7ad063 4214 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4215 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4216 goto out_free_pgvec;
4217 }
4218
4219out:
4220 return pg_vec;
4221
4222out_free_pgvec:
3a7ad063 4223 free_pg_vec(pg_vec, order, block_nr);
4ebf0ae2
DM
4224 pg_vec = NULL;
4225 goto out;
4226}
1da177e4 4227
f6fb8f10 4228static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4229 int closing, int tx_ring)
1da177e4 4230{
0e3125c7 4231 struct pgv *pg_vec = NULL;
1da177e4 4232 struct packet_sock *po = pkt_sk(sk);
3a7ad063 4233 int was_running, order = 0;
69e3c75f
JB
4234 struct packet_ring_buffer *rb;
4235 struct sk_buff_head *rb_queue;
0e11c91e 4236 __be16 num;
f6fb8f10 4237 int err = -EINVAL;
4238 /* Added to avoid minimal code churn */
4239 struct tpacket_req *req = &req_u->req;
4240
69e3c75f
JB
4241 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4242 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4243
69e3c75f
JB
4244 err = -EBUSY;
4245 if (!closing) {
4246 if (atomic_read(&po->mapped))
4247 goto out;
b0138408 4248 if (packet_read_pending(rb))
69e3c75f
JB
4249 goto out;
4250 }
1da177e4 4251
69e3c75f 4252 if (req->tp_block_nr) {
4576cd46
WB
4253 unsigned int min_frame_size;
4254
69e3c75f
JB
4255 /* Sanity tests and some calculations */
4256 err = -EBUSY;
4257 if (unlikely(rb->pg_vec))
4258 goto out;
1da177e4 4259
bbd6ef87
PM
4260 switch (po->tp_version) {
4261 case TPACKET_V1:
4262 po->tp_hdrlen = TPACKET_HDRLEN;
4263 break;
4264 case TPACKET_V2:
4265 po->tp_hdrlen = TPACKET2_HDRLEN;
4266 break;
f6fb8f10 4267 case TPACKET_V3:
4268 po->tp_hdrlen = TPACKET3_HDRLEN;
4269 break;
bbd6ef87
PM
4270 }
4271
69e3c75f 4272 err = -EINVAL;
4ebf0ae2 4273 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4274 goto out;
90836b67 4275 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4276 goto out;
4576cd46 4277 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4278 if (po->tp_version >= TPACKET_V3 &&
4576cd46
WB
4279 req->tp_block_size <
4280 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4281 goto out;
4576cd46 4282 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4283 goto out;
4ebf0ae2 4284 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4285 goto out;
1da177e4 4286
4194b491
TK
4287 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4288 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4289 goto out;
8f8d28e4
AK
4290 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4291 goto out;
69e3c75f
JB
4292 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4293 req->tp_frame_nr))
4294 goto out;
1da177e4
LT
4295
4296 err = -ENOMEM;
3a7ad063
ED
4297 order = get_order(req->tp_block_size);
4298 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4299 if (unlikely(!pg_vec))
1da177e4 4300 goto out;
f6fb8f10 4301 switch (po->tp_version) {
4302 case TPACKET_V3:
7f953ab2
SV
4303 /* Block transmit is not supported yet */
4304 if (!tx_ring) {
e8e85cc5 4305 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4306 } else {
4307 struct tpacket_req3 *req3 = &req_u->req3;
4308
4309 if (req3->tp_retire_blk_tov ||
4310 req3->tp_sizeof_priv ||
4311 req3->tp_feature_req_word) {
4312 err = -EINVAL;
4313 goto out;
4314 }
4315 }
d7cf0c34 4316 break;
f6fb8f10 4317 default:
4318 break;
4319 }
69e3c75f
JB
4320 }
4321 /* Done */
4322 else {
4323 err = -EINVAL;
4ebf0ae2 4324 if (unlikely(req->tp_frame_nr))
69e3c75f 4325 goto out;
1da177e4
LT
4326 }
4327
1da177e4
LT
4328
4329 /* Detach socket from network */
4330 spin_lock(&po->bind_lock);
4331 was_running = po->running;
4332 num = po->num;
4333 if (was_running) {
1da177e4 4334 po->num = 0;
ce06b03e 4335 __unregister_prot_hook(sk, false);
1da177e4
LT
4336 }
4337 spin_unlock(&po->bind_lock);
1ce4f28b 4338
1da177e4
LT
4339 synchronize_net();
4340
4341 err = -EBUSY;
905db440 4342 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4343 if (closing || atomic_read(&po->mapped) == 0) {
4344 err = 0;
69e3c75f 4345 spin_lock_bh(&rb_queue->lock);
c053fd96 4346 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4347 rb->frame_max = (req->tp_frame_nr - 1);
4348 rb->head = 0;
4349 rb->frame_size = req->tp_frame_size;
4350 spin_unlock_bh(&rb_queue->lock);
4351
3a7ad063 4352 swap(rb->pg_vec_order, order);
c053fd96 4353 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4354
4355 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4356 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4357 tpacket_rcv : packet_rcv;
4358 skb_queue_purge(rb_queue);
1da177e4 4359 if (atomic_read(&po->mapped))
40d4e3df
ED
4360 pr_err("packet_mmap: vma is busy: %d\n",
4361 atomic_read(&po->mapped));
1da177e4 4362 }
905db440 4363 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4364
4365 spin_lock(&po->bind_lock);
ce06b03e 4366 if (was_running) {
1da177e4 4367 po->num = num;
ce06b03e 4368 register_prot_hook(sk);
1da177e4
LT
4369 }
4370 spin_unlock(&po->bind_lock);
c800aaf8 4371 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4372 /* Because we don't support block-based V3 on tx-ring */
4373 if (!tx_ring)
73d0fcf2 4374 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4375 }
1da177e4 4376
1da177e4 4377 if (pg_vec)
3a7ad063 4378 free_pg_vec(pg_vec, order, req->tp_block_nr);
1da177e4
LT
4379out:
4380 return err;
4381}
4382
69e3c75f
JB
4383static int packet_mmap(struct file *file, struct socket *sock,
4384 struct vm_area_struct *vma)
1da177e4
LT
4385{
4386 struct sock *sk = sock->sk;
4387 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4388 unsigned long size, expected_size;
4389 struct packet_ring_buffer *rb;
1da177e4
LT
4390 unsigned long start;
4391 int err = -EINVAL;
4392 int i;
4393
4394 if (vma->vm_pgoff)
4395 return -EINVAL;
4396
905db440 4397 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4398
4399 expected_size = 0;
4400 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4401 if (rb->pg_vec) {
4402 expected_size += rb->pg_vec_len
4403 * rb->pg_vec_pages
4404 * PAGE_SIZE;
4405 }
4406 }
4407
4408 if (expected_size == 0)
1da177e4 4409 goto out;
69e3c75f
JB
4410
4411 size = vma->vm_end - vma->vm_start;
4412 if (size != expected_size)
1da177e4
LT
4413 goto out;
4414
1da177e4 4415 start = vma->vm_start;
69e3c75f
JB
4416 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4417 if (rb->pg_vec == NULL)
4418 continue;
4419
4420 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4421 struct page *page;
4422 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4423 int pg_num;
4424
c56b4d90
CG
4425 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4426 page = pgv_to_page(kaddr);
69e3c75f
JB
4427 err = vm_insert_page(vma, start, page);
4428 if (unlikely(err))
4429 goto out;
4430 start += PAGE_SIZE;
0e3125c7 4431 kaddr += PAGE_SIZE;
69e3c75f 4432 }
4ebf0ae2 4433 }
1da177e4 4434 }
69e3c75f 4435
4ebf0ae2 4436 atomic_inc(&po->mapped);
1da177e4
LT
4437 vma->vm_ops = &packet_mmap_ops;
4438 err = 0;
4439
4440out:
905db440 4441 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4442 return err;
4443}
1da177e4 4444
90ddc4f0 4445static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4446 .family = PF_PACKET,
4447 .owner = THIS_MODULE,
4448 .release = packet_release,
4449 .bind = packet_bind_spkt,
4450 .connect = sock_no_connect,
4451 .socketpair = sock_no_socketpair,
4452 .accept = sock_no_accept,
4453 .getname = packet_getname_spkt,
a11e1d43 4454 .poll = datagram_poll,
1da177e4
LT
4455 .ioctl = packet_ioctl,
4456 .listen = sock_no_listen,
4457 .shutdown = sock_no_shutdown,
4458 .setsockopt = sock_no_setsockopt,
4459 .getsockopt = sock_no_getsockopt,
4460 .sendmsg = packet_sendmsg_spkt,
4461 .recvmsg = packet_recvmsg,
4462 .mmap = sock_no_mmap,
4463 .sendpage = sock_no_sendpage,
4464};
1da177e4 4465
90ddc4f0 4466static const struct proto_ops packet_ops = {
1da177e4
LT
4467 .family = PF_PACKET,
4468 .owner = THIS_MODULE,
4469 .release = packet_release,
4470 .bind = packet_bind,
4471 .connect = sock_no_connect,
4472 .socketpair = sock_no_socketpair,
4473 .accept = sock_no_accept,
1ce4f28b 4474 .getname = packet_getname,
a11e1d43 4475 .poll = packet_poll,
1da177e4
LT
4476 .ioctl = packet_ioctl,
4477 .listen = sock_no_listen,
4478 .shutdown = sock_no_shutdown,
4479 .setsockopt = packet_setsockopt,
4480 .getsockopt = packet_getsockopt,
719c44d3
WB
4481#ifdef CONFIG_COMPAT
4482 .compat_setsockopt = compat_packet_setsockopt,
4483#endif
1da177e4
LT
4484 .sendmsg = packet_sendmsg,
4485 .recvmsg = packet_recvmsg,
4486 .mmap = packet_mmap,
4487 .sendpage = sock_no_sendpage,
4488};
4489
ec1b4cf7 4490static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4491 .family = PF_PACKET,
4492 .create = packet_create,
4493 .owner = THIS_MODULE,
4494};
4495
4496static struct notifier_block packet_netdev_notifier = {
40d4e3df 4497 .notifier_call = packet_notifier,
1da177e4
LT
4498};
4499
4500#ifdef CONFIG_PROC_FS
1da177e4
LT
4501
4502static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4503 __acquires(RCU)
1da177e4 4504{
e372c414 4505 struct net *net = seq_file_net(seq);
808f5114 4506
4507 rcu_read_lock();
4508 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4509}
4510
4511static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4512{
1bf40954 4513 struct net *net = seq_file_net(seq);
808f5114 4514 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4515}
4516
4517static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4518 __releases(RCU)
1da177e4 4519{
808f5114 4520 rcu_read_unlock();
1da177e4
LT
4521}
4522
1ce4f28b 4523static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4524{
4525 if (v == SEQ_START_TOKEN)
4526 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4527 else {
b7ceabd9 4528 struct sock *s = sk_entry(v);
1da177e4
LT
4529 const struct packet_sock *po = pkt_sk(s);
4530
4531 seq_printf(seq,
71338aa7 4532 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4533 s,
41c6d650 4534 refcount_read(&s->sk_refcnt),
1da177e4
LT
4535 s->sk_type,
4536 ntohs(po->num),
4537 po->ifindex,
4538 po->running,
4539 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4540 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4541 sock_i_ino(s));
1da177e4
LT
4542 }
4543
4544 return 0;
4545}
4546
56b3d975 4547static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4548 .start = packet_seq_start,
4549 .next = packet_seq_next,
4550 .stop = packet_seq_stop,
4551 .show = packet_seq_show,
4552};
1da177e4
LT
4553#endif
4554
2c8c1e72 4555static int __net_init packet_net_init(struct net *net)
d12d01d6 4556{
0fa7fa98 4557 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4558 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4559
c3506372
CH
4560 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4561 sizeof(struct seq_net_private)))
d12d01d6
DL
4562 return -ENOMEM;
4563
4564 return 0;
4565}
4566
2c8c1e72 4567static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4568{
ece31ffd 4569 remove_proc_entry("packet", net->proc_net);
669f8f1a 4570 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4571}
4572
4573static struct pernet_operations packet_net_ops = {
4574 .init = packet_net_init,
4575 .exit = packet_net_exit,
4576};
4577
4578
1da177e4
LT
4579static void __exit packet_exit(void)
4580{
1da177e4 4581 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4582 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4583 sock_unregister(PF_PACKET);
4584 proto_unregister(&packet_proto);
4585}
4586
4587static int __init packet_init(void)
4588{
4589 int rc = proto_register(&packet_proto, 0);
4590
4591 if (rc != 0)
4592 goto out;
4593
4594 sock_register(&packet_family_ops);
d12d01d6 4595 register_pernet_subsys(&packet_net_ops);
1da177e4 4596 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4597out:
4598 return rc;
4599}
4600
4601module_init(packet_init);
4602module_exit(packet_exit);
4603MODULE_LICENSE("GPL");
4604MODULE_ALIAS_NETPROTO(PF_PACKET);
This page took 2.069944 seconds and 4 git commands to generate.