// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"

#define TX_BATCH_SIZE 16

static struct xdp_sock *xdp_sk(struct sock *sk)
{
        return (struct xdp_sock *)sk;
}

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
        return !!xs->rx;
}

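/* RX path: copy the received frame into a umem frame taken from the fill
 * queue and post a descriptor for it on the RX ring. Shared by the driver
 * path (xsk_rcv/xsk_flush) and the generic path (xsk_generic_rcv).
 */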
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 *id, len = xdp->data_end - xdp->data;
        void *buffer;
        int err;

        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
                return -EINVAL;

        id = xskq_peek_id(xs->umem->fq);
        if (!id)
                return -ENOSPC;

        buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
        memcpy(buffer, xdp->data, len);
        err = xskq_produce_batch_desc(xs->rx, *id, len,
                                      xs->umem->frame_headroom);
        if (!err)
                xskq_discard_id(xs->umem->fq);

        return err;
}

int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        int err;

        err = __xsk_rcv(xs, xdp);
        if (likely(!err))
                xdp_return_buff(xdp);
        else
                xs->rx_dropped++;

        return err;
}

void xsk_flush(struct xdp_sock *xs)
{
        xskq_produce_flush_desc(xs->rx);
        xs->sk.sk_data_ready(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        int err;

        err = __xsk_rcv(xs, xdp);
        if (!err)
                xsk_flush(xs);
        else
                xs->rx_dropped++;

        return err;
}

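/* TX completion: the skb destructor hands the frame back to userspace by
 * publishing its id on the completion ring when the skb is freed.
 */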
static void xsk_destruct_skb(struct sk_buff *skb)
{
        u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
        struct xdp_sock *xs = xdp_sk(skb->sk);

        WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));

        sock_wfree(skb);
}

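/* Generic (copy-mode) TX: for each descriptor on the TX ring, allocate an
 * skb, copy the frame out of the umem and transmit it on the bound queue
 * with dev_direct_xmit(). Completion is signalled from xsk_destruct_skb().
 */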
static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
                            size_t total_len)
{
        bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
        u32 max_batch = TX_BATCH_SIZE;
        struct xdp_sock *xs = xdp_sk(sk);
        bool sent_frame = false;
        struct xdp_desc desc;
        struct sk_buff *skb;
        int err = 0;

        if (unlikely(!xs->tx))
                return -ENOBUFS;
        if (need_wait)
                return -EOPNOTSUPP;

        mutex_lock(&xs->mutex);

        while (xskq_peek_desc(xs->tx, &desc)) {
                char *buffer;
                u32 id, len;

                if (max_batch-- == 0) {
                        err = -EAGAIN;
                        goto out;
                }

                if (xskq_reserve_id(xs->umem->cq)) {
                        err = -EAGAIN;
                        goto out;
                }

                len = desc.len;
                if (unlikely(len > xs->dev->mtu)) {
                        err = -EMSGSIZE;
                        goto out;
                }

                if (xs->queue_id >= xs->dev->real_num_tx_queues) {
                        err = -ENXIO;
                        goto out;
                }

                skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
                if (unlikely(!skb)) {
                        err = -EAGAIN;
                        goto out;
                }

                skb_put(skb, len);
                id = desc.idx;
                buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
                err = skb_store_bits(skb, 0, buffer, len);
                if (unlikely(err)) {
                        kfree_skb(skb);
                        goto out;
                }

                skb->dev = xs->dev;
                skb->priority = sk->sk_priority;
                skb->mark = sk->sk_mark;
                skb_shinfo(skb)->destructor_arg = (void *)(long)id;
                skb->destructor = xsk_destruct_skb;

                err = dev_direct_xmit(skb, xs->queue_id);
                /* Ignore NET_XMIT_CN as packet might have been sent */
                if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
                        err = -EAGAIN;
                        /* SKB consumed by dev_direct_xmit() */
                        goto out;
                }

                sent_frame = true;
                xskq_discard_desc(xs->tx);
        }

out:
        if (sent_frame)
                sk->sk_write_space(sk);

        mutex_unlock(&xs->mutex);
        return err;
}

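/* sendmsg() entry point; only usable once the socket has been bound to a
 * device that is up.
 */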
static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);

        if (unlikely(!xs->dev))
                return -ENXIO;
        if (unlikely(!(xs->dev->flags & IFF_UP)))
                return -ENETDOWN;

        return xsk_generic_xmit(sk, m, total_len);
}

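/* Report POLLIN while the RX ring has descriptors to consume and POLLOUT
 * while the TX ring still has room for new descriptors.
 */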
static unsigned int xsk_poll(struct file *file, struct socket *sock,
                             struct poll_table_struct *wait)
{
        unsigned int mask = datagram_poll(file, sock, wait);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);

        if (xs->rx && !xskq_empty_desc(xs->rx))
                mask |= POLLIN | POLLRDNORM;
        if (xs->tx && !xskq_full_desc(xs->tx))
                mask |= POLLOUT | POLLWRNORM;

        return mask;
}

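/* Allocate a descriptor ring and publish it. The number of entries must be
 * a non-zero power of two and the queue must not already exist.
 */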
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
                          bool umem_queue)
{
        struct xsk_queue *q;

        if (entries == 0 || *queue || !is_power_of_2(entries))
                return -EINVAL;

        q = xskq_create(entries, umem_queue);
        if (!q)
                return -ENOMEM;

        /* Make sure queue is ready before it can be seen by others */
        smp_wmb();
        *queue = q;
        return 0;
}

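/* Socket close: unpublish the socket from the protocol counters, detach
 * from the device once the datapath has stopped using it and drop our
 * references.
 */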
static int xsk_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net *net;

        if (!sk)
                return 0;

        net = sock_net(sk);

        local_bh_disable();
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        local_bh_enable();

        if (xs->dev) {
                /* Wait for driver to stop using the xdp socket. */
                synchronize_net();
                dev_put(xs->dev);
                xs->dev = NULL;
        }

        sock_orphan(sk);
        sock->sk = NULL;

        sk_refcnt_debug_release(sk);
        sock_put(sk);

        return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
        struct socket *sock;
        int err;

        sock = sockfd_lookup(fd, &err);
        if (!sock)
                return ERR_PTR(-ENOTSOCK);

        if (sock->sk->sk_family != PF_XDP) {
                sockfd_put(sock);
                return ERR_PTR(-ENOPROTOOPT);
        }

        return sock;
}

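/* Bind the socket to a device and queue id. With XDP_SHARED_UMEM the umem
 * and its fill/completion rings are inherited from another XSK socket that
 * is already bound to the same device and queue.
 */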
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
        struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev;
        int err = 0;

        if (addr_len < sizeof(struct sockaddr_xdp))
                return -EINVAL;
        if (sxdp->sxdp_family != AF_XDP)
                return -EINVAL;

        mutex_lock(&xs->mutex);
        if (xs->dev) {
                err = -EBUSY;
                goto out_release;
        }

        dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
        if (!dev) {
                err = -ENODEV;
                goto out_release;
        }

        if (!xs->rx && !xs->tx) {
                err = -EINVAL;
                goto out_unlock;
        }

        if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) ||
            (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) {
                err = -EINVAL;
                goto out_unlock;
        }

        if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
                struct xdp_sock *umem_xs;
                struct socket *sock;

                if (xs->umem) {
                        /* We have already our own. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
                if (IS_ERR(sock)) {
                        err = PTR_ERR(sock);
                        goto out_unlock;
                }

                umem_xs = xdp_sk(sock->sk);
                if (!umem_xs->umem) {
                        /* No umem to inherit. */
                        err = -EBADF;
                        sockfd_put(sock);
                        goto out_unlock;
                } else if (umem_xs->dev != dev ||
                           umem_xs->queue_id != sxdp->sxdp_queue_id) {
                        err = -EINVAL;
                        sockfd_put(sock);
                        goto out_unlock;
                }

                xdp_get_umem(umem_xs->umem);
                xs->umem = umem_xs->umem;
                sockfd_put(sock);
        } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
                err = -EINVAL;
                goto out_unlock;
        } else {
                /* This xsk has its own umem. */
                xskq_set_umem(xs->umem->fq, &xs->umem->props);
                xskq_set_umem(xs->umem->cq, &xs->umem->props);
        }

        xs->dev = dev;
        xs->queue_id = sxdp->sxdp_queue_id;

        xskq_set_umem(xs->rx, &xs->umem->props);
        xskq_set_umem(xs->tx, &xs->umem->props);

out_unlock:
        if (err)
                dev_put(dev);
out_release:
        mutex_unlock(&xs->mutex);
        return err;
}

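/* Socket options: register the umem and size the RX/TX and umem
 * fill/completion rings. All of this has to happen before bind time.
 */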
static int xsk_setsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int err;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        switch (optname) {
        case XDP_RX_RING:
        case XDP_TX_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (optlen < sizeof(entries))
                        return -EINVAL;
                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
                err = xsk_init_queue(entries, q, false);
                mutex_unlock(&xs->mutex);
                return err;
        }
        case XDP_UMEM_REG:
        {
                struct xdp_umem_reg mr;
                struct xdp_umem *umem;

                if (copy_from_user(&mr, optval, sizeof(mr)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EBUSY;
                }

                umem = xdp_umem_create(&mr);
                if (IS_ERR(umem)) {
                        mutex_unlock(&xs->mutex);
                        return PTR_ERR(umem);
                }

                /* Make sure umem is ready before it can be seen by others */
                smp_wmb();
                xs->umem = umem;
                mutex_unlock(&xs->mutex);
                return 0;
        }
        case XDP_UMEM_FILL_RING:
        case XDP_UMEM_COMPLETION_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (!xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EINVAL;
                }

                q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
                        &xs->umem->cq;
                err = xsk_init_queue(entries, q, true);
                mutex_unlock(&xs->mutex);
                return err;
        }
        default:
                break;
        }

        return -ENOPROTOOPT;
}

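/* Socket options: expose statistics and the offsets userspace needs in
 * order to mmap() the rings.
 */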
static int xsk_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int len;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case XDP_STATISTICS:
        {
                struct xdp_statistics stats;

                if (len < sizeof(stats))
                        return -EINVAL;

                mutex_lock(&xs->mutex);
                stats.rx_dropped = xs->rx_dropped;
                stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
                stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
                mutex_unlock(&xs->mutex);

                if (copy_to_user(optval, &stats, sizeof(stats)))
                        return -EFAULT;
                if (put_user(sizeof(stats), optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_MMAP_OFFSETS:
        {
                struct xdp_mmap_offsets off;

                if (len < sizeof(off))
                        return -EINVAL;

                off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
                off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
                off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
                off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
                off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
                off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);

                off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
                off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
                off.fr.desc = offsetof(struct xdp_umem_ring, desc);
                off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
                off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
                off.cr.desc = offsetof(struct xdp_umem_ring, desc);

                len = sizeof(off);
                if (copy_to_user(optval, &off, len))
                        return -EFAULT;
                if (put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        default:
                break;
        }

        return -EOPNOTSUPP;
}

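/* Userspace maps each ring through mmap() on the socket, using the page
 * offsets handled below and the producer/consumer/desc offsets reported by
 * XDP_MMAP_OFFSETS. A minimal sketch of the expected setup sequence follows;
 * it is illustrative only (field names follow the uapi header, the frame
 * size, mapping length and error handling are assumptions, not taken from
 * this file):
 *
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *
 *	struct xdp_umem_reg mr = { .addr = (__u64)buf, .len = buf_len,
 *				   .frame_size = 2048, .frame_headroom = 0 };
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
 *
 *	struct xdp_mmap_offsets off;
 *	socklen_t optlen = sizeof(off);
 *	getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
 *	void *rx = mmap(NULL, off.rx.desc + entries * sizeof(struct xdp_desc),
 *			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			fd, XDP_PGOFF_RX_RING);
 *
 *	struct sockaddr_xdp sxdp = { .sxdp_family = AF_XDP,
 *				     .sxdp_ifindex = ifindex,
 *				     .sxdp_queue_id = 0 };
 *	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 */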
static int xsk_mmap(struct file *file, struct socket *sock,
                    struct vm_area_struct *vma)
{
        unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
        unsigned long size = vma->vm_end - vma->vm_start;
        struct xdp_sock *xs = xdp_sk(sock->sk);
        struct xsk_queue *q = NULL;
        struct xdp_umem *umem;
        unsigned long pfn;
        struct page *qpg;

        if (offset == XDP_PGOFF_RX_RING) {
                q = READ_ONCE(xs->rx);
        } else if (offset == XDP_PGOFF_TX_RING) {
                q = READ_ONCE(xs->tx);
        } else {
                umem = READ_ONCE(xs->umem);
                if (!umem)
                        return -EINVAL;

                if (offset == XDP_UMEM_PGOFF_FILL_RING)
                        q = READ_ONCE(umem->fq);
                else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
                        q = READ_ONCE(umem->cq);
        }

        if (!q)
                return -EINVAL;

        qpg = virt_to_head_page(q->ring);
        if (size > (PAGE_SIZE << compound_order(qpg)))
                return -EINVAL;

        pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
        return remap_pfn_range(vma, vma->vm_start, pfn,
                               size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
        .name = "XDP",
        .owner = THIS_MODULE,
        .obj_size = sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
        .family = PF_XDP,
        .owner = THIS_MODULE,
        .release = xsk_release,
        .bind = xsk_bind,
        .connect = sock_no_connect,
        .socketpair = sock_no_socketpair,
        .accept = sock_no_accept,
        .getname = sock_no_getname,
        .poll = xsk_poll,
        .ioctl = sock_no_ioctl,
        .listen = sock_no_listen,
        .shutdown = sock_no_shutdown,
        .setsockopt = xsk_setsockopt,
        .getsockopt = xsk_getsockopt,
        .sendmsg = xsk_sendmsg,
        .recvmsg = sock_no_recvmsg,
        .mmap = xsk_mmap,
        .sendpage = sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);

        if (!sock_flag(sk, SOCK_DEAD))
                return;

        xskq_destroy(xs->rx);
        xskq_destroy(xs->tx);
        xdp_put_umem(xs->umem);

        sk_refcnt_debug_dec(sk);
}

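/* Create a new AF_XDP socket: requires CAP_NET_RAW, SOCK_RAW and protocol 0. */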
static int xsk_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;
        struct xdp_sock *xs;

        if (!ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_RAW)
                return -ESOCKTNOSUPPORT;

        if (protocol)
                return -EPROTONOSUPPORT;

        sock->state = SS_UNCONNECTED;

        sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
        if (!sk)
                return -ENOBUFS;

        sock->ops = &xsk_proto_ops;

        sock_init_data(sock, sk);

        sk->sk_family = PF_XDP;

        sk->sk_destruct = xsk_destruct;
        sk_refcnt_debug_inc(sk);

        xs = xdp_sk(sk);
        mutex_init(&xs->mutex);

        local_bh_disable();
        sock_prot_inuse_add(net, &xsk_proto, 1);
        local_bh_enable();

        return 0;
}

static const struct net_proto_family xsk_family_ops = {
        .family = PF_XDP,
        .create = xsk_create,
        .owner = THIS_MODULE,
};

static int __init xsk_init(void)
{
        int err;

        err = proto_register(&xsk_proto, 0 /* no slab */);
        if (err)
                goto out;

        err = sock_register(&xsk_family_ops);
        if (err)
                goto out_proto;

        return 0;

out_proto:
        proto_unregister(&xsk_proto);
out:
        return err;
}

fs_initcall(xsk_init);