1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * NET4: Implementation of BSD Unix domain sockets.
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko EiBfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
28 * Alexey Kuznetosv : Repaired (I hope) bugs introduces
29 * by above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
32 * is been reached. This won't break
33 * old apps and it will avoid huge amount
34 * of socks hashed (this for unix_gc()
35 * performances reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skb queueable in the
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
48 * Known differences from reference BSD that was tested:
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and give the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * started by 0, so that this name space does not intersect
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
122 static atomic_long_t unix_nr_socks;
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
126 /* SMP locking strategy:
127 * hash table is protected with spinlock.
128 * each socket state is protected by separate spinlock.
131 static unsigned int unix_unbound_hash(struct sock *sk)
133 unsigned long hash = (unsigned long)sk;
139 return hash & UNIX_HASH_MOD;
142 static unsigned int unix_bsd_hash(struct inode *i)
144 return i->i_ino & UNIX_HASH_MOD;
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 int addr_len, int type)
150 __wsum csum = csum_partial(sunaddr, addr_len, 0);
153 hash = (__force unsigned int)csum_fold(csum);
157 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
160 static void unix_table_double_lock(struct net *net,
161 unsigned int hash1, unsigned int hash2)
163 if (hash1 == hash2) {
164 spin_lock(&net->unx.table.locks[hash1]);
171 spin_lock(&net->unx.table.locks[hash1]);
172 spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
175 static void unix_table_double_unlock(struct net *net,
176 unsigned int hash1, unsigned int hash2)
178 if (hash1 == hash2) {
179 spin_unlock(&net->unx.table.locks[hash1]);
183 spin_unlock(&net->unx.table.locks[hash1]);
184 spin_unlock(&net->unx.table.locks[hash2]);
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
190 UNIXCB(skb).secid = scm->secid;
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
195 scm->secid = UNIXCB(skb).secid;
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
200 return (scm->secid == UNIXCB(skb).secid);
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
213 #endif /* CONFIG_SECURITY_NETWORK */
215 #define unix_peer(sk) (unix_sk(sk)->peer)
217 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
219 return unix_peer(osk) == sk;
222 static inline int unix_may_send(struct sock *sk, struct sock *osk)
224 return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
227 static inline int unix_recvq_full(const struct sock *sk)
229 return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
232 static inline int unix_recvq_full_lockless(const struct sock *sk)
234 return skb_queue_len_lockless(&sk->sk_receive_queue) >
235 READ_ONCE(sk->sk_max_ack_backlog);
238 struct sock *unix_peer_get(struct sock *s)
246 unix_state_unlock(s);
249 EXPORT_SYMBOL_GPL(unix_peer_get);
251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
254 struct unix_address *addr;
256 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
260 refcount_set(&addr->refcnt, 1);
261 addr->len = addr_len;
262 memcpy(addr->name, sunaddr, addr_len);
267 static inline void unix_release_addr(struct unix_address *addr)
269 if (refcount_dec_and_test(&addr->refcnt))
274 * Check unix socket name:
275 * - should be not zero length.
276 * - if started by not zero, should be NULL terminated (FS object)
277 * - if started by zero, it is abstract name.
280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
282 if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
283 addr_len > sizeof(*sunaddr))
286 if (sunaddr->sun_family != AF_UNIX)
292 static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
294 /* This may look like an off by one error but it is a bit more
295 * subtle. 108 is the longest valid AF_UNIX path for a binding.
296 * sun_path[108] doesn't as such exist. However in kernel space
297 * we are guaranteed that it is a valid memory location in our
298 * kernel address buffer because syscall functions always pass
299 * a pointer of struct sockaddr_storage which has a bigger buffer
302 ((char *)sunaddr)[addr_len] = 0;
305 static void __unix_remove_socket(struct sock *sk)
307 sk_del_node_init(sk);
310 static void __unix_insert_socket(struct net *net, struct sock *sk)
312 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
313 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
316 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
317 struct unix_address *addr, unsigned int hash)
319 __unix_remove_socket(sk);
320 smp_store_release(&unix_sk(sk)->addr, addr);
323 __unix_insert_socket(net, sk);
326 static void unix_remove_socket(struct net *net, struct sock *sk)
328 spin_lock(&net->unx.table.locks[sk->sk_hash]);
329 __unix_remove_socket(sk);
330 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
333 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
335 spin_lock(&net->unx.table.locks[sk->sk_hash]);
336 __unix_insert_socket(net, sk);
337 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
340 static void unix_insert_bsd_socket(struct sock *sk)
342 spin_lock(&bsd_socket_locks[sk->sk_hash]);
343 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
344 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
347 static void unix_remove_bsd_socket(struct sock *sk)
349 if (!hlist_unhashed(&sk->sk_bind_node)) {
350 spin_lock(&bsd_socket_locks[sk->sk_hash]);
351 __sk_del_bind_node(sk);
352 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
354 sk_node_init(&sk->sk_bind_node);
358 static struct sock *__unix_find_socket_byname(struct net *net,
359 struct sockaddr_un *sunname,
360 int len, unsigned int hash)
364 sk_for_each(s, &net->unx.table.buckets[hash]) {
365 struct unix_sock *u = unix_sk(s);
367 if (u->addr->len == len &&
368 !memcmp(u->addr->name, sunname, len))
374 static inline struct sock *unix_find_socket_byname(struct net *net,
375 struct sockaddr_un *sunname,
376 int len, unsigned int hash)
380 spin_lock(&net->unx.table.locks[hash]);
381 s = __unix_find_socket_byname(net, sunname, len, hash);
384 spin_unlock(&net->unx.table.locks[hash]);
388 static struct sock *unix_find_socket_byinode(struct inode *i)
390 unsigned int hash = unix_bsd_hash(i);
393 spin_lock(&bsd_socket_locks[hash]);
394 sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
395 struct dentry *dentry = unix_sk(s)->path.dentry;
397 if (dentry && d_backing_inode(dentry) == i) {
399 spin_unlock(&bsd_socket_locks[hash]);
403 spin_unlock(&bsd_socket_locks[hash]);
407 /* Support code for asymmetrically connected dgram sockets
409 * If a datagram socket is connected to a socket not itself connected
410 * to the first socket (eg, /dev/log), clients may only enqueue more
411 * messages if the present receive queue of the server socket is not
412 * "too large". This means there's a second writeability condition
413 * poll and sendmsg need to test. The dgram recv code will do a wake
414 * up on the peer_wait wait queue of a socket upon reception of a
415 * datagram which needs to be propagated to sleeping would-be writers
416 * since these might not have sent anything so far. This can't be
417 * accomplished via poll_wait because the lifetime of the server
418 * socket might be less than that of its clients if these break their
419 * association with it or if the server socket is closed while clients
420 * are still connected to it and there's no way to inform "a polling
421 * implementation" that it should let go of a certain wait queue
423 * In order to propagate a wake up, a wait_queue_entry_t of the client
424 * socket is enqueued on the peer_wait queue of the server socket
425 * whose wake function does a wake_up on the ordinary client socket
426 * wait queue. This connection is established whenever a write (or
427 * poll for write) hit the flow control condition and broken when the
428 * association to the server socket is dissolved or after a wake up
432 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
436 wait_queue_head_t *u_sleep;
438 u = container_of(q, struct unix_sock, peer_wake);
440 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
442 u->peer_wake.private = NULL;
444 /* relaying can only happen while the wq still exists */
445 u_sleep = sk_sleep(&u->sk);
447 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
452 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
454 struct unix_sock *u, *u_other;
458 u_other = unix_sk(other);
460 spin_lock(&u_other->peer_wait.lock);
462 if (!u->peer_wake.private) {
463 u->peer_wake.private = other;
464 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
469 spin_unlock(&u_other->peer_wait.lock);
473 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
476 struct unix_sock *u, *u_other;
479 u_other = unix_sk(other);
480 spin_lock(&u_other->peer_wait.lock);
482 if (u->peer_wake.private == other) {
483 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
484 u->peer_wake.private = NULL;
487 spin_unlock(&u_other->peer_wait.lock);
490 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
493 unix_dgram_peer_wake_disconnect(sk, other);
494 wake_up_interruptible_poll(sk_sleep(sk),
501 * - unix_peer(sk) == other
502 * - association is stable
504 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
508 connected = unix_dgram_peer_wake_connect(sk, other);
510 /* If other is SOCK_DEAD, we want to make sure we signal
511 * POLLOUT, such that a subsequent write() can get a
512 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
513 * to other and its full, we will hang waiting for POLLOUT.
515 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
519 unix_dgram_peer_wake_disconnect(sk, other);
524 static int unix_writable(const struct sock *sk)
526 return sk->sk_state != TCP_LISTEN &&
527 (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
530 static void unix_write_space(struct sock *sk)
532 struct socket_wq *wq;
535 if (unix_writable(sk)) {
536 wq = rcu_dereference(sk->sk_wq);
537 if (skwq_has_sleeper(wq))
538 wake_up_interruptible_sync_poll(&wq->wait,
539 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
540 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
545 /* When dgram socket disconnects (or changes its peer), we clear its receive
546 * queue of packets arrived from previous peer. First, it allows to do
547 * flow control based only on wmem_alloc; second, sk connected to peer
548 * may receive messages only from that peer. */
549 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
551 if (!skb_queue_empty(&sk->sk_receive_queue)) {
552 skb_queue_purge(&sk->sk_receive_queue);
553 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
555 /* If one link of bidirectional dgram pipe is disconnected,
556 * we signal error. Messages are lost. Do not make this,
557 * when peer was not connected to us.
559 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
560 WRITE_ONCE(other->sk_err, ECONNRESET);
561 sk_error_report(other);
564 other->sk_state = TCP_CLOSE;
567 static void unix_sock_destructor(struct sock *sk)
569 struct unix_sock *u = unix_sk(sk);
571 skb_queue_purge(&sk->sk_receive_queue);
573 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
574 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
575 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
576 if (!sock_flag(sk, SOCK_DEAD)) {
577 pr_info("Attempt to release alive unix socket: %p\n", sk);
582 unix_release_addr(u->addr);
584 atomic_long_dec(&unix_nr_socks);
585 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
586 #ifdef UNIX_REFCNT_DEBUG
587 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
588 atomic_long_read(&unix_nr_socks));
592 static void unix_release_sock(struct sock *sk, int embrion)
594 struct unix_sock *u = unix_sk(sk);
600 unix_remove_socket(sock_net(sk), sk);
601 unix_remove_bsd_socket(sk);
606 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
608 u->path.dentry = NULL;
610 state = sk->sk_state;
611 sk->sk_state = TCP_CLOSE;
613 skpair = unix_peer(sk);
614 unix_peer(sk) = NULL;
616 unix_state_unlock(sk);
618 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
620 kfree_skb(u->oob_skb);
625 wake_up_interruptible_all(&u->peer_wait);
627 if (skpair != NULL) {
628 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
629 unix_state_lock(skpair);
631 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
632 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
633 WRITE_ONCE(skpair->sk_err, ECONNRESET);
634 unix_state_unlock(skpair);
635 skpair->sk_state_change(skpair);
636 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
639 unix_dgram_peer_wake_disconnect(sk, skpair);
640 sock_put(skpair); /* It may now die */
643 /* Try to flush out this socket. Throw out buffers at least */
645 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
646 if (state == TCP_LISTEN)
647 unix_release_sock(skb->sk, 1);
648 /* passed fds are erased in the kfree_skb hook */
649 UNIXCB(skb).consumed = skb->len;
658 /* ---- Socket is dead now and most probably destroyed ---- */
661 * Fixme: BSD difference: In BSD all sockets connected to us get
662 * ECONNRESET and we die on the spot. In Linux we behave
663 * like files and pipes do and wait for the last
666 * Can't we simply set sock->err?
668 * What the above comment does talk about? --ANK(980817)
671 if (unix_tot_inflight)
672 unix_gc(); /* Garbage collect fds */
675 static void init_peercred(struct sock *sk)
677 const struct cred *old_cred;
680 spin_lock(&sk->sk_peer_lock);
681 old_pid = sk->sk_peer_pid;
682 old_cred = sk->sk_peer_cred;
683 sk->sk_peer_pid = get_pid(task_tgid(current));
684 sk->sk_peer_cred = get_current_cred();
685 spin_unlock(&sk->sk_peer_lock);
691 static void copy_peercred(struct sock *sk, struct sock *peersk)
693 const struct cred *old_cred;
697 spin_lock(&sk->sk_peer_lock);
698 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
700 spin_lock(&peersk->sk_peer_lock);
701 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
703 old_pid = sk->sk_peer_pid;
704 old_cred = sk->sk_peer_cred;
705 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
706 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
708 spin_unlock(&sk->sk_peer_lock);
709 spin_unlock(&peersk->sk_peer_lock);
715 static int unix_listen(struct socket *sock, int backlog)
718 struct sock *sk = sock->sk;
719 struct unix_sock *u = unix_sk(sk);
722 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
723 goto out; /* Only stream/seqpacket sockets accept */
726 goto out; /* No listens on an unbound socket */
728 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
730 if (backlog > sk->sk_max_ack_backlog)
731 wake_up_interruptible_all(&u->peer_wait);
732 sk->sk_max_ack_backlog = backlog;
733 sk->sk_state = TCP_LISTEN;
734 /* set credentials so connect can copy them */
739 unix_state_unlock(sk);
744 static int unix_release(struct socket *);
745 static int unix_bind(struct socket *, struct sockaddr *, int);
746 static int unix_stream_connect(struct socket *, struct sockaddr *,
747 int addr_len, int flags);
748 static int unix_socketpair(struct socket *, struct socket *);
749 static int unix_accept(struct socket *, struct socket *, int, bool);
750 static int unix_getname(struct socket *, struct sockaddr *, int);
751 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
752 static __poll_t unix_dgram_poll(struct file *, struct socket *,
754 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
756 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
758 static int unix_shutdown(struct socket *, int);
759 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
760 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
761 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
762 size_t size, int flags);
763 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
764 struct pipe_inode_info *, size_t size,
766 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
767 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
768 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
769 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
770 static int unix_dgram_connect(struct socket *, struct sockaddr *,
772 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
773 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
776 static int unix_set_peek_off(struct sock *sk, int val)
778 struct unix_sock *u = unix_sk(sk);
780 if (mutex_lock_interruptible(&u->iolock))
783 sk->sk_peek_off = val;
784 mutex_unlock(&u->iolock);
789 #ifdef CONFIG_PROC_FS
790 static int unix_count_nr_fds(struct sock *sk)
796 spin_lock(&sk->sk_receive_queue.lock);
797 skb = skb_peek(&sk->sk_receive_queue);
799 u = unix_sk(skb->sk);
800 nr_fds += atomic_read(&u->scm_stat.nr_fds);
801 skb = skb_peek_next(skb, &sk->sk_receive_queue);
803 spin_unlock(&sk->sk_receive_queue.lock);
808 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
810 struct sock *sk = sock->sk;
811 unsigned char s_state;
816 s_state = READ_ONCE(sk->sk_state);
819 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
820 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
821 * SOCK_DGRAM is ordinary. So, no lock is needed.
823 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
824 nr_fds = atomic_read(&u->scm_stat.nr_fds);
825 else if (s_state == TCP_LISTEN)
826 nr_fds = unix_count_nr_fds(sk);
828 seq_printf(m, "scm_fds: %u\n", nr_fds);
832 #define unix_show_fdinfo NULL
835 static const struct proto_ops unix_stream_ops = {
837 .owner = THIS_MODULE,
838 .release = unix_release,
840 .connect = unix_stream_connect,
841 .socketpair = unix_socketpair,
842 .accept = unix_accept,
843 .getname = unix_getname,
847 .compat_ioctl = unix_compat_ioctl,
849 .listen = unix_listen,
850 .shutdown = unix_shutdown,
851 .sendmsg = unix_stream_sendmsg,
852 .recvmsg = unix_stream_recvmsg,
853 .read_skb = unix_stream_read_skb,
854 .mmap = sock_no_mmap,
855 .sendpage = unix_stream_sendpage,
856 .splice_read = unix_stream_splice_read,
857 .set_peek_off = unix_set_peek_off,
858 .show_fdinfo = unix_show_fdinfo,
861 static const struct proto_ops unix_dgram_ops = {
863 .owner = THIS_MODULE,
864 .release = unix_release,
866 .connect = unix_dgram_connect,
867 .socketpair = unix_socketpair,
868 .accept = sock_no_accept,
869 .getname = unix_getname,
870 .poll = unix_dgram_poll,
873 .compat_ioctl = unix_compat_ioctl,
875 .listen = sock_no_listen,
876 .shutdown = unix_shutdown,
877 .sendmsg = unix_dgram_sendmsg,
878 .read_skb = unix_read_skb,
879 .recvmsg = unix_dgram_recvmsg,
880 .mmap = sock_no_mmap,
881 .sendpage = sock_no_sendpage,
882 .set_peek_off = unix_set_peek_off,
883 .show_fdinfo = unix_show_fdinfo,
886 static const struct proto_ops unix_seqpacket_ops = {
888 .owner = THIS_MODULE,
889 .release = unix_release,
891 .connect = unix_stream_connect,
892 .socketpair = unix_socketpair,
893 .accept = unix_accept,
894 .getname = unix_getname,
895 .poll = unix_dgram_poll,
898 .compat_ioctl = unix_compat_ioctl,
900 .listen = unix_listen,
901 .shutdown = unix_shutdown,
902 .sendmsg = unix_seqpacket_sendmsg,
903 .recvmsg = unix_seqpacket_recvmsg,
904 .mmap = sock_no_mmap,
905 .sendpage = sock_no_sendpage,
906 .set_peek_off = unix_set_peek_off,
907 .show_fdinfo = unix_show_fdinfo,
910 static void unix_close(struct sock *sk, long timeout)
912 /* Nothing to do here, unix socket does not need a ->close().
913 * This is merely for sockmap.
917 static void unix_unhash(struct sock *sk)
919 /* Nothing to do here, unix socket does not need a ->unhash().
920 * This is merely for sockmap.
924 struct proto unix_dgram_proto = {
926 .owner = THIS_MODULE,
927 .obj_size = sizeof(struct unix_sock),
929 #ifdef CONFIG_BPF_SYSCALL
930 .psock_update_sk_prot = unix_dgram_bpf_update_proto,
934 struct proto unix_stream_proto = {
935 .name = "UNIX-STREAM",
936 .owner = THIS_MODULE,
937 .obj_size = sizeof(struct unix_sock),
939 .unhash = unix_unhash,
940 #ifdef CONFIG_BPF_SYSCALL
941 .psock_update_sk_prot = unix_stream_bpf_update_proto,
945 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
951 atomic_long_inc(&unix_nr_socks);
952 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
957 if (type == SOCK_STREAM)
958 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
959 else /*dgram and seqpacket */
960 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
967 sock_init_data(sock, sk);
969 sk->sk_hash = unix_unbound_hash(sk);
970 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
971 sk->sk_write_space = unix_write_space;
972 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
973 sk->sk_destruct = unix_sock_destructor;
975 u->path.dentry = NULL;
977 spin_lock_init(&u->lock);
978 atomic_long_set(&u->inflight, 0);
979 INIT_LIST_HEAD(&u->link);
980 mutex_init(&u->iolock); /* single task reading lock */
981 mutex_init(&u->bindlock); /* single task binding lock */
982 init_waitqueue_head(&u->peer_wait);
983 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
984 memset(&u->scm_stat, 0, sizeof(struct scm_stat));
985 unix_insert_unbound_socket(net, sk);
987 sock_prot_inuse_add(net, sk->sk_prot, 1);
992 atomic_long_dec(&unix_nr_socks);
996 static int unix_create(struct net *net, struct socket *sock, int protocol,
1001 if (protocol && protocol != PF_UNIX)
1002 return -EPROTONOSUPPORT;
1004 sock->state = SS_UNCONNECTED;
1006 switch (sock->type) {
1008 sock->ops = &unix_stream_ops;
1011 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1015 sock->type = SOCK_DGRAM;
1018 sock->ops = &unix_dgram_ops;
1020 case SOCK_SEQPACKET:
1021 sock->ops = &unix_seqpacket_ops;
1024 return -ESOCKTNOSUPPORT;
1027 sk = unix_create1(net, sock, kern, sock->type);
1034 static int unix_release(struct socket *sock)
1036 struct sock *sk = sock->sk;
1041 sk->sk_prot->close(sk, 0);
1042 unix_release_sock(sk, 0);
1048 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1051 struct inode *inode;
1056 unix_mkname_bsd(sunaddr, addr_len);
1057 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1061 err = path_permission(&path, MAY_WRITE);
1065 err = -ECONNREFUSED;
1066 inode = d_backing_inode(path.dentry);
1067 if (!S_ISSOCK(inode->i_mode))
1070 sk = unix_find_socket_byinode(inode);
1075 if (sk->sk_type == type)
1089 return ERR_PTR(err);
1092 static struct sock *unix_find_abstract(struct net *net,
1093 struct sockaddr_un *sunaddr,
1094 int addr_len, int type)
1096 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1097 struct dentry *dentry;
1100 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1102 return ERR_PTR(-ECONNREFUSED);
1104 dentry = unix_sk(sk)->path.dentry;
1106 touch_atime(&unix_sk(sk)->path);
1111 static struct sock *unix_find_other(struct net *net,
1112 struct sockaddr_un *sunaddr,
1113 int addr_len, int type)
1117 if (sunaddr->sun_path[0])
1118 sk = unix_find_bsd(sunaddr, addr_len, type);
1120 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1125 static int unix_autobind(struct sock *sk)
1127 unsigned int new_hash, old_hash = sk->sk_hash;
1128 struct unix_sock *u = unix_sk(sk);
1129 struct net *net = sock_net(sk);
1130 struct unix_address *addr;
1131 u32 lastnum, ordernum;
1134 err = mutex_lock_interruptible(&u->bindlock);
1142 addr = kzalloc(sizeof(*addr) +
1143 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1147 addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1148 addr->name->sun_family = AF_UNIX;
1149 refcount_set(&addr->refcnt, 1);
1151 ordernum = get_random_u32();
1152 lastnum = ordernum & 0xFFFFF;
1154 ordernum = (ordernum + 1) & 0xFFFFF;
1155 sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1157 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1158 unix_table_double_lock(net, old_hash, new_hash);
1160 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1161 unix_table_double_unlock(net, old_hash, new_hash);
1163 /* __unix_find_socket_byname() may take long time if many names
1164 * are already in use.
1168 if (ordernum == lastnum) {
1169 /* Give up if all names seems to be in use. */
1171 unix_release_addr(addr);
1178 __unix_set_addr_hash(net, sk, addr, new_hash);
1179 unix_table_double_unlock(net, old_hash, new_hash);
1182 out: mutex_unlock(&u->bindlock);
1186 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1189 umode_t mode = S_IFSOCK |
1190 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1191 unsigned int new_hash, old_hash = sk->sk_hash;
1192 struct unix_sock *u = unix_sk(sk);
1193 struct net *net = sock_net(sk);
1194 struct mnt_idmap *idmap;
1195 struct unix_address *addr;
1196 struct dentry *dentry;
1200 unix_mkname_bsd(sunaddr, addr_len);
1201 addr_len = strlen(sunaddr->sun_path) +
1202 offsetof(struct sockaddr_un, sun_path) + 1;
1204 addr = unix_create_addr(sunaddr, addr_len);
1209 * Get the parent directory, calculate the hash for last
1212 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1213 if (IS_ERR(dentry)) {
1214 err = PTR_ERR(dentry);
1219 * All right, let's create it.
1221 idmap = mnt_idmap(parent.mnt);
1222 err = security_path_mknod(&parent, dentry, mode, 0);
1224 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1227 err = mutex_lock_interruptible(&u->bindlock);
1233 new_hash = unix_bsd_hash(d_backing_inode(dentry));
1234 unix_table_double_lock(net, old_hash, new_hash);
1235 u->path.mnt = mntget(parent.mnt);
1236 u->path.dentry = dget(dentry);
1237 __unix_set_addr_hash(net, sk, addr, new_hash);
1238 unix_table_double_unlock(net, old_hash, new_hash);
1239 unix_insert_bsd_socket(sk);
1240 mutex_unlock(&u->bindlock);
1241 done_path_create(&parent, dentry);
1245 mutex_unlock(&u->bindlock);
1248 /* failed after successful mknod? unlink what we'd created... */
1249 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1251 done_path_create(&parent, dentry);
1253 unix_release_addr(addr);
1254 return err == -EEXIST ? -EADDRINUSE : err;
1257 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1260 unsigned int new_hash, old_hash = sk->sk_hash;
1261 struct unix_sock *u = unix_sk(sk);
1262 struct net *net = sock_net(sk);
1263 struct unix_address *addr;
1266 addr = unix_create_addr(sunaddr, addr_len);
1270 err = mutex_lock_interruptible(&u->bindlock);
1279 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1280 unix_table_double_lock(net, old_hash, new_hash);
1282 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1285 __unix_set_addr_hash(net, sk, addr, new_hash);
1286 unix_table_double_unlock(net, old_hash, new_hash);
1287 mutex_unlock(&u->bindlock);
1291 unix_table_double_unlock(net, old_hash, new_hash);
1294 mutex_unlock(&u->bindlock);
1296 unix_release_addr(addr);
1300 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1302 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1303 struct sock *sk = sock->sk;
1306 if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1307 sunaddr->sun_family == AF_UNIX)
1308 return unix_autobind(sk);
1310 err = unix_validate_addr(sunaddr, addr_len);
1314 if (sunaddr->sun_path[0])
1315 err = unix_bind_bsd(sk, sunaddr, addr_len);
1317 err = unix_bind_abstract(sk, sunaddr, addr_len);
1322 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1324 if (unlikely(sk1 == sk2) || !sk2) {
1325 unix_state_lock(sk1);
1329 unix_state_lock(sk1);
1330 unix_state_lock_nested(sk2);
1332 unix_state_lock(sk2);
1333 unix_state_lock_nested(sk1);
1337 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1339 if (unlikely(sk1 == sk2) || !sk2) {
1340 unix_state_unlock(sk1);
1343 unix_state_unlock(sk1);
1344 unix_state_unlock(sk2);
1347 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1348 int alen, int flags)
1350 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1351 struct sock *sk = sock->sk;
1356 if (alen < offsetofend(struct sockaddr, sa_family))
1359 if (addr->sa_family != AF_UNSPEC) {
1360 err = unix_validate_addr(sunaddr, alen);
1364 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1365 !unix_sk(sk)->addr) {
1366 err = unix_autobind(sk);
1372 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1373 if (IS_ERR(other)) {
1374 err = PTR_ERR(other);
1378 unix_state_double_lock(sk, other);
1380 /* Apparently VFS overslept socket death. Retry. */
1381 if (sock_flag(other, SOCK_DEAD)) {
1382 unix_state_double_unlock(sk, other);
1388 if (!unix_may_send(sk, other))
1391 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1395 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1398 * 1003.1g breaking connected state with AF_UNSPEC
1401 unix_state_double_lock(sk, other);
1405 * If it was connected, reconnect.
1407 if (unix_peer(sk)) {
1408 struct sock *old_peer = unix_peer(sk);
1410 unix_peer(sk) = other;
1412 sk->sk_state = TCP_CLOSE;
1413 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1415 unix_state_double_unlock(sk, other);
1417 if (other != old_peer)
1418 unix_dgram_disconnected(sk, old_peer);
1421 unix_peer(sk) = other;
1422 unix_state_double_unlock(sk, other);
1428 unix_state_double_unlock(sk, other);
1434 static long unix_wait_for_peer(struct sock *other, long timeo)
1435 __releases(&unix_sk(other)->lock)
1437 struct unix_sock *u = unix_sk(other);
1441 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1443 sched = !sock_flag(other, SOCK_DEAD) &&
1444 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1445 unix_recvq_full_lockless(other);
1447 unix_state_unlock(other);
1450 timeo = schedule_timeout(timeo);
1452 finish_wait(&u->peer_wait, &wait);
1456 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1457 int addr_len, int flags)
1459 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1460 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1461 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1462 struct net *net = sock_net(sk);
1463 struct sk_buff *skb = NULL;
1468 err = unix_validate_addr(sunaddr, addr_len);
1472 if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1473 err = unix_autobind(sk);
1478 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1480 /* First of all allocate resources.
1481 If we will make it after state is locked,
1482 we will have to recheck all again in any case.
1485 /* create new sock for complete connection */
1486 newsk = unix_create1(net, NULL, 0, sock->type);
1487 if (IS_ERR(newsk)) {
1488 err = PTR_ERR(newsk);
1495 /* Allocate skb for sending to listening sock */
1496 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1501 /* Find listening sock. */
1502 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1503 if (IS_ERR(other)) {
1504 err = PTR_ERR(other);
1509 /* Latch state of peer */
1510 unix_state_lock(other);
1512 /* Apparently VFS overslept socket death. Retry. */
1513 if (sock_flag(other, SOCK_DEAD)) {
1514 unix_state_unlock(other);
1519 err = -ECONNREFUSED;
1520 if (other->sk_state != TCP_LISTEN)
1522 if (other->sk_shutdown & RCV_SHUTDOWN)
1525 if (unix_recvq_full(other)) {
1530 timeo = unix_wait_for_peer(other, timeo);
1532 err = sock_intr_errno(timeo);
1533 if (signal_pending(current))
1541 It is tricky place. We need to grab our state lock and cannot
1542 drop lock on peer. It is dangerous because deadlock is
1543 possible. Connect to self case and simultaneous
1544 attempt to connect are eliminated by checking socket
1545 state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1546 check this before attempt to grab lock.
1548 Well, and we have to recheck the state after socket locked.
1554 /* This is ok... continue with connect */
1556 case TCP_ESTABLISHED:
1557 /* Socket is already connected */
1565 unix_state_lock_nested(sk);
1567 if (sk->sk_state != st) {
1568 unix_state_unlock(sk);
1569 unix_state_unlock(other);
1574 err = security_unix_stream_connect(sk, other, newsk);
1576 unix_state_unlock(sk);
1580 /* The way is open! Fastly set all the necessary fields... */
1583 unix_peer(newsk) = sk;
1584 newsk->sk_state = TCP_ESTABLISHED;
1585 newsk->sk_type = sk->sk_type;
1586 init_peercred(newsk);
1587 newu = unix_sk(newsk);
1588 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1589 otheru = unix_sk(other);
1591 /* copy address information from listening to new sock
1593 * The contents of *(otheru->addr) and otheru->path
1594 * are seen fully set up here, since we have found
1595 * otheru in hash under its lock. Insertion into the
1596 * hash chain we'd found it in had been done in an
1597 * earlier critical area protected by the chain's lock,
1598 * the same one where we'd set *(otheru->addr) contents,
1599 * as well as otheru->path and otheru->addr itself.
1601 * Using smp_store_release() here to set newu->addr
1602 * is enough to make those stores, as well as stores
1603 * to newu->path visible to anyone who gets newu->addr
1604 * by smp_load_acquire(). IOW, the same warranties
1605 * as for unix_sock instances bound in unix_bind() or
1606 * in unix_autobind().
1608 if (otheru->path.dentry) {
1609 path_get(&otheru->path);
1610 newu->path = otheru->path;
1612 refcount_inc(&otheru->addr->refcnt);
1613 smp_store_release(&newu->addr, otheru->addr);
1615 /* Set credentials */
1616 copy_peercred(sk, other);
1618 sock->state = SS_CONNECTED;
1619 sk->sk_state = TCP_ESTABLISHED;
1622 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1623 unix_peer(sk) = newsk;
1625 unix_state_unlock(sk);
1627 /* take ten and send info to listening sock */
1628 spin_lock(&other->sk_receive_queue.lock);
1629 __skb_queue_tail(&other->sk_receive_queue, skb);
1630 spin_unlock(&other->sk_receive_queue.lock);
1631 unix_state_unlock(other);
1632 other->sk_data_ready(other);
1638 unix_state_unlock(other);
1643 unix_release_sock(newsk, 0);
1649 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1651 struct sock *ska = socka->sk, *skb = sockb->sk;
1653 /* Join our sockets back to back */
1656 unix_peer(ska) = skb;
1657 unix_peer(skb) = ska;
1661 ska->sk_state = TCP_ESTABLISHED;
1662 skb->sk_state = TCP_ESTABLISHED;
1663 socka->state = SS_CONNECTED;
1664 sockb->state = SS_CONNECTED;
1668 static void unix_sock_inherit_flags(const struct socket *old,
1671 if (test_bit(SOCK_PASSCRED, &old->flags))
1672 set_bit(SOCK_PASSCRED, &new->flags);
1673 if (test_bit(SOCK_PASSSEC, &old->flags))
1674 set_bit(SOCK_PASSSEC, &new->flags);
1677 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1680 struct sock *sk = sock->sk;
1682 struct sk_buff *skb;
1686 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1690 if (sk->sk_state != TCP_LISTEN)
1693 /* If socket state is TCP_LISTEN it cannot change (for now...),
1694 * so that no locks are necessary.
1697 skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1700 /* This means receive shutdown. */
1707 skb_free_datagram(sk, skb);
1708 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1710 /* attach accepted sock to socket */
1711 unix_state_lock(tsk);
1712 newsock->state = SS_CONNECTED;
1713 unix_sock_inherit_flags(sock, newsock);
1714 sock_graft(tsk, newsock);
1715 unix_state_unlock(tsk);
1723 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1725 struct sock *sk = sock->sk;
1726 struct unix_address *addr;
1727 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1731 sk = unix_peer_get(sk);
1741 addr = smp_load_acquire(&unix_sk(sk)->addr);
1743 sunaddr->sun_family = AF_UNIX;
1744 sunaddr->sun_path[0] = 0;
1745 err = offsetof(struct sockaddr_un, sun_path);
1748 memcpy(sunaddr, addr->name, addr->len);
1755 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1757 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1760 * Garbage collection of unix sockets starts by selecting a set of
1761 * candidate sockets which have reference only from being in flight
1762 * (total_refs == inflight_refs). This condition is checked once during
1763 * the candidate collection phase, and candidates are marked as such, so
1764 * that non-candidates can later be ignored. While inflight_refs is
1765 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1766 * is an instantaneous decision.
1768 * Once a candidate, however, the socket must not be reinstalled into a
1769 * file descriptor while the garbage collection is in progress.
1771 * If the above conditions are met, then the directed graph of
1772 * candidates (*) does not change while unix_gc_lock is held.
1774 * Any operations that changes the file count through file descriptors
1775 * (dup, close, sendmsg) does not change the graph since candidates are
1776 * not installed in fds.
1778 * Dequeing a candidate via recvmsg would install it into an fd, but
1779 * that takes unix_gc_lock to decrement the inflight count, so it's
1780 * serialized with garbage collection.
1782 * MSG_PEEK is special in that it does not change the inflight count,
1783 * yet does install the socket into an fd. The following lock/unlock
1784 * pair is to ensure serialization with garbage collection. It must be
1785 * done between incrementing the file count and installing the file into
1788 * If garbage collection starts after the barrier provided by the
1789 * lock/unlock, then it will see the elevated refcount and not mark this
1790 * as a candidate. If a garbage collection is already in progress
1791 * before the file count was incremented, then the lock/unlock pair will
1792 * ensure that garbage collection is finished before progressing to
1793 * installing the fd.
1795 * (*) A -> B where B is on the queue of A or B is on the queue of C
1796 * which is on the queue of listening socket A.
1798 spin_lock(&unix_gc_lock);
1799 spin_unlock(&unix_gc_lock);
1802 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1806 UNIXCB(skb).pid = get_pid(scm->pid);
1807 UNIXCB(skb).uid = scm->creds.uid;
1808 UNIXCB(skb).gid = scm->creds.gid;
1809 UNIXCB(skb).fp = NULL;
1810 unix_get_secdata(scm, skb);
1811 if (scm->fp && send_fds)
1812 err = unix_attach_fds(scm, skb);
1814 skb->destructor = unix_destruct_scm;
1818 static bool unix_passcred_enabled(const struct socket *sock,
1819 const struct sock *other)
1821 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1822 !other->sk_socket ||
1823 test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1827 * Some apps rely on write() giving SCM_CREDENTIALS
1828 * We include credentials if source or destination socket
1829 * asserted SOCK_PASSCRED.
1831 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1832 const struct sock *other)
1834 if (UNIXCB(skb).pid)
1836 if (unix_passcred_enabled(sock, other)) {
1837 UNIXCB(skb).pid = get_pid(task_tgid(current));
1838 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1842 static int maybe_init_creds(struct scm_cookie *scm,
1843 struct socket *socket,
1844 const struct sock *other)
1847 struct msghdr msg = { .msg_controllen = 0 };
1849 err = scm_send(socket, &msg, scm, false);
1853 if (unix_passcred_enabled(socket, other)) {
1854 scm->pid = get_pid(task_tgid(current));
1855 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1860 static bool unix_skb_scm_eq(struct sk_buff *skb,
1861 struct scm_cookie *scm)
1863 return UNIXCB(skb).pid == scm->pid &&
1864 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1865 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1866 unix_secdata_eq(scm, skb);
1869 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1871 struct scm_fp_list *fp = UNIXCB(skb).fp;
1872 struct unix_sock *u = unix_sk(sk);
1874 if (unlikely(fp && fp->count))
1875 atomic_add(fp->count, &u->scm_stat.nr_fds);
1878 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1880 struct scm_fp_list *fp = UNIXCB(skb).fp;
1881 struct unix_sock *u = unix_sk(sk);
1883 if (unlikely(fp && fp->count))
1884 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1888 * Send AF_UNIX data.
1891 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1894 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1895 struct sock *sk = sock->sk, *other = NULL;
1896 struct unix_sock *u = unix_sk(sk);
1897 struct scm_cookie scm;
1898 struct sk_buff *skb;
1905 err = scm_send(sock, msg, &scm, false);
1910 if (msg->msg_flags&MSG_OOB)
1913 if (msg->msg_namelen) {
1914 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1920 other = unix_peer_get(sk);
1925 if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1926 err = unix_autobind(sk);
1932 if (len > sk->sk_sndbuf - 32)
1935 if (len > SKB_MAX_ALLOC) {
1936 data_len = min_t(size_t,
1937 len - SKB_MAX_ALLOC,
1938 MAX_SKB_FRAGS * PAGE_SIZE);
1939 data_len = PAGE_ALIGN(data_len);
1941 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1944 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1945 msg->msg_flags & MSG_DONTWAIT, &err,
1946 PAGE_ALLOC_COSTLY_ORDER);
1950 err = unix_scm_to_skb(&scm, skb, true);
1954 skb_put(skb, len - data_len);
1955 skb->data_len = data_len;
1957 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1961 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1966 if (sunaddr == NULL)
1969 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1971 if (IS_ERR(other)) {
1972 err = PTR_ERR(other);
1978 if (sk_filter(other, skb) < 0) {
1979 /* Toss the packet but do not return any error to the sender */
1985 unix_state_lock(other);
1988 if (!unix_may_send(sk, other))
1991 if (unlikely(sock_flag(other, SOCK_DEAD))) {
1993 * Check with 1003.1g - what should
1996 unix_state_unlock(other);
2000 unix_state_lock(sk);
2003 if (sk->sk_type == SOCK_SEQPACKET) {
2004 /* We are here only when racing with unix_release_sock()
2005 * is clearing @other. Never change state to TCP_CLOSE
2006 * unlike SOCK_DGRAM wants.
2008 unix_state_unlock(sk);
2010 } else if (unix_peer(sk) == other) {
2011 unix_peer(sk) = NULL;
2012 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2014 sk->sk_state = TCP_CLOSE;
2015 unix_state_unlock(sk);
2017 unix_dgram_disconnected(sk, other);
2019 err = -ECONNREFUSED;
2021 unix_state_unlock(sk);
2031 if (other->sk_shutdown & RCV_SHUTDOWN)
2034 if (sk->sk_type != SOCK_SEQPACKET) {
2035 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2040 /* other == sk && unix_peer(other) != sk if
2041 * - unix_peer(sk) == NULL, destination address bound to sk
2042 * - unix_peer(sk) == sk by time of get but disconnected before lock
2045 unlikely(unix_peer(other) != sk &&
2046 unix_recvq_full_lockless(other))) {
2048 timeo = unix_wait_for_peer(other, timeo);
2050 err = sock_intr_errno(timeo);
2051 if (signal_pending(current))
2058 unix_state_unlock(other);
2059 unix_state_double_lock(sk, other);
2062 if (unix_peer(sk) != other ||
2063 unix_dgram_peer_wake_me(sk, other)) {
2071 goto restart_locked;
2075 if (unlikely(sk_locked))
2076 unix_state_unlock(sk);
2078 if (sock_flag(other, SOCK_RCVTSTAMP))
2079 __net_timestamp(skb);
2080 maybe_add_creds(skb, sock, other);
2081 scm_stat_add(other, skb);
2082 skb_queue_tail(&other->sk_receive_queue, skb);
2083 unix_state_unlock(other);
2084 other->sk_data_ready(other);
2091 unix_state_unlock(sk);
2092 unix_state_unlock(other);
2102 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2103 * bytes, and a minimum of a full page.
2105 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2107 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2108 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2109 struct scm_cookie *scm, bool fds_sent)
2111 struct unix_sock *ousk = unix_sk(other);
2112 struct sk_buff *skb;
2115 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2120 err = unix_scm_to_skb(scm, skb, !fds_sent);
2126 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2133 unix_state_lock(other);
2135 if (sock_flag(other, SOCK_DEAD) ||
2136 (other->sk_shutdown & RCV_SHUTDOWN)) {
2137 unix_state_unlock(other);
2142 maybe_add_creds(skb, sock, other);
2146 consume_skb(ousk->oob_skb);
2148 WRITE_ONCE(ousk->oob_skb, skb);
2150 scm_stat_add(other, skb);
2151 skb_queue_tail(&other->sk_receive_queue, skb);
2152 sk_send_sigurg(other);
2153 unix_state_unlock(other);
2154 other->sk_data_ready(other);
2160 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2163 struct sock *sk = sock->sk;
2164 struct sock *other = NULL;
2166 struct sk_buff *skb;
2168 struct scm_cookie scm;
2169 bool fds_sent = false;
2173 err = scm_send(sock, msg, &scm, false);
2178 if (msg->msg_flags & MSG_OOB) {
2179 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2187 if (msg->msg_namelen) {
2188 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2192 other = unix_peer(sk);
2197 if (sk->sk_shutdown & SEND_SHUTDOWN)
2200 while (sent < len) {
2203 /* Keep two messages in the pipe so it schedules better */
2204 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2206 /* allow fallback to order-0 allocations */
2207 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2209 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2211 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2213 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2214 msg->msg_flags & MSG_DONTWAIT, &err,
2215 get_order(UNIX_SKB_FRAGS_SZ));
2219 /* Only send the fds in the first buffer */
2220 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2227 skb_put(skb, size - data_len);
2228 skb->data_len = data_len;
2230 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2236 unix_state_lock(other);
2238 if (sock_flag(other, SOCK_DEAD) ||
2239 (other->sk_shutdown & RCV_SHUTDOWN))
2242 maybe_add_creds(skb, sock, other);
2243 scm_stat_add(other, skb);
2244 skb_queue_tail(&other->sk_receive_queue, skb);
2245 unix_state_unlock(other);
2246 other->sk_data_ready(other);
2250 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2251 if (msg->msg_flags & MSG_OOB) {
2252 err = queue_oob(sock, msg, other, &scm, fds_sent);
2264 unix_state_unlock(other);
2267 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2268 send_sig(SIGPIPE, current, 0);
2272 return sent ? : err;
2275 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2276 int offset, size_t size, int flags)
2279 bool send_sigpipe = false;
2280 bool init_scm = true;
2281 struct scm_cookie scm;
2282 struct sock *other, *sk = socket->sk;
2283 struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2285 if (flags & MSG_OOB)
2288 other = unix_peer(sk);
2289 if (!other || sk->sk_state != TCP_ESTABLISHED)
2294 unix_state_unlock(other);
2295 mutex_unlock(&unix_sk(other)->iolock);
2296 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2302 /* we must acquire iolock as we modify already present
2303 * skbs in the sk_receive_queue and mess with skb->len
2305 err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2307 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2311 if (sk->sk_shutdown & SEND_SHUTDOWN) {
2313 send_sigpipe = true;
2317 unix_state_lock(other);
2319 if (sock_flag(other, SOCK_DEAD) ||
2320 other->sk_shutdown & RCV_SHUTDOWN) {
2322 send_sigpipe = true;
2323 goto err_state_unlock;
2327 err = maybe_init_creds(&scm, socket, other);
2329 goto err_state_unlock;
2333 skb = skb_peek_tail(&other->sk_receive_queue);
2334 if (tail && tail == skb) {
2336 } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2343 } else if (newskb) {
2344 /* this is fast path, we don't necessarily need to
2345 * call to kfree_skb even though with newskb == NULL
2346 * this - does no harm
2348 consume_skb(newskb);
2352 if (skb_append_pagefrags(skb, page, offset, size)) {
2358 skb->data_len += size;
2359 skb->truesize += size;
2360 refcount_add(size, &sk->sk_wmem_alloc);
2363 err = unix_scm_to_skb(&scm, skb, false);
2365 goto err_state_unlock;
2366 spin_lock(&other->sk_receive_queue.lock);
2367 __skb_queue_tail(&other->sk_receive_queue, newskb);
2368 spin_unlock(&other->sk_receive_queue.lock);
2371 unix_state_unlock(other);
2372 mutex_unlock(&unix_sk(other)->iolock);
2374 other->sk_data_ready(other);
2379 unix_state_unlock(other);
2381 mutex_unlock(&unix_sk(other)->iolock);
2384 if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2385 send_sig(SIGPIPE, current, 0);
2391 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2395 struct sock *sk = sock->sk;
2397 err = sock_error(sk);
2401 if (sk->sk_state != TCP_ESTABLISHED)
2404 if (msg->msg_namelen)
2405 msg->msg_namelen = 0;
2407 return unix_dgram_sendmsg(sock, msg, len);
2410 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2411 size_t size, int flags)
2413 struct sock *sk = sock->sk;
2415 if (sk->sk_state != TCP_ESTABLISHED)
2418 return unix_dgram_recvmsg(sock, msg, size, flags);
2421 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2423 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2426 msg->msg_namelen = addr->len;
2427 memcpy(msg->msg_name, addr->name, addr->len);
2431 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2434 struct scm_cookie scm;
2435 struct socket *sock = sk->sk_socket;
2436 struct unix_sock *u = unix_sk(sk);
2437 struct sk_buff *skb, *last;
2446 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2449 mutex_lock(&u->iolock);
2451 skip = sk_peek_offset(sk, flags);
2452 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2453 &skip, &err, &last);
2455 if (!(flags & MSG_PEEK))
2456 scm_stat_del(sk, skb);
2460 mutex_unlock(&u->iolock);
2465 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2466 &err, &timeo, last));
2468 if (!skb) { /* implies iolock unlocked */
2469 unix_state_lock(sk);
2470 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2471 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2472 (sk->sk_shutdown & RCV_SHUTDOWN))
2474 unix_state_unlock(sk);
2478 if (wq_has_sleeper(&u->peer_wait))
2479 wake_up_interruptible_sync_poll(&u->peer_wait,
2480 EPOLLOUT | EPOLLWRNORM |
2484 unix_copy_addr(msg, skb->sk);
2486 if (size > skb->len - skip)
2487 size = skb->len - skip;
2488 else if (size < skb->len - skip)
2489 msg->msg_flags |= MSG_TRUNC;
2491 err = skb_copy_datagram_msg(skb, skip, msg, size);
2495 if (sock_flag(sk, SOCK_RCVTSTAMP))
2496 __sock_recv_timestamp(msg, sk, skb);
2498 memset(&scm, 0, sizeof(scm));
2500 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2501 unix_set_secdata(&scm, skb);
2503 if (!(flags & MSG_PEEK)) {
2505 unix_detach_fds(&scm, skb);
2507 sk_peek_offset_bwd(sk, skb->len);
2509 /* It is questionable: on PEEK we could:
2510 - do not return fds - good, but too simple 8)
2511 - return fds, and do not return them on read (old strategy,
2513 - clone fds (I chose it for now, it is the most universal
2516 POSIX 1003.1g does not actually define this clearly
2517 at all. POSIX 1003.1g doesn't define a lot of things
2522 sk_peek_offset_fwd(sk, size);
2525 unix_peek_fds(&scm, skb);
2527 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2529 scm_recv(sock, msg, &scm, flags);
2532 skb_free_datagram(sk, skb);
2533 mutex_unlock(&u->iolock);
2538 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2541 struct sock *sk = sock->sk;
2543 #ifdef CONFIG_BPF_SYSCALL
2544 const struct proto *prot = READ_ONCE(sk->sk_prot);
2546 if (prot != &unix_dgram_proto)
2547 return prot->recvmsg(sk, msg, size, flags, NULL);
2549 return __unix_dgram_recvmsg(sk, msg, size, flags);
2552 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2554 struct unix_sock *u = unix_sk(sk);
2555 struct sk_buff *skb;
2558 mutex_lock(&u->iolock);
2559 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2560 mutex_unlock(&u->iolock);
2564 copied = recv_actor(sk, skb);
2571 * Sleep until more data has arrived. But check for races..
2573 static long unix_stream_data_wait(struct sock *sk, long timeo,
2574 struct sk_buff *last, unsigned int last_len,
2577 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2578 struct sk_buff *tail;
2581 unix_state_lock(sk);
2584 prepare_to_wait(sk_sleep(sk), &wait, state);
2586 tail = skb_peek_tail(&sk->sk_receive_queue);
2588 (tail && tail->len != last_len) ||
2590 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2591 signal_pending(current) ||
2595 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2596 unix_state_unlock(sk);
2597 timeo = schedule_timeout(timeo);
2598 unix_state_lock(sk);
2600 if (sock_flag(sk, SOCK_DEAD))
2603 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2606 finish_wait(sk_sleep(sk), &wait);
2607 unix_state_unlock(sk);
2611 static unsigned int unix_skb_len(const struct sk_buff *skb)
2613 return skb->len - UNIXCB(skb).consumed;
2616 struct unix_stream_read_state {
2617 int (*recv_actor)(struct sk_buff *, int, int,
2618 struct unix_stream_read_state *);
2619 struct socket *socket;
2621 struct pipe_inode_info *pipe;
2624 unsigned int splice_flags;
2627 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2628 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2630 struct socket *sock = state->socket;
2631 struct sock *sk = sock->sk;
2632 struct unix_sock *u = unix_sk(sk);
2634 struct sk_buff *oob_skb;
2636 mutex_lock(&u->iolock);
2637 unix_state_lock(sk);
2639 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2640 unix_state_unlock(sk);
2641 mutex_unlock(&u->iolock);
2645 oob_skb = u->oob_skb;
2647 if (!(state->flags & MSG_PEEK))
2648 WRITE_ONCE(u->oob_skb, NULL);
2650 unix_state_unlock(sk);
2652 chunk = state->recv_actor(oob_skb, 0, chunk, state);
2654 if (!(state->flags & MSG_PEEK)) {
2655 UNIXCB(oob_skb).consumed += 1;
2659 mutex_unlock(&u->iolock);
2664 state->msg->msg_flags |= MSG_OOB;
2668 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2669 int flags, int copied)
2671 struct unix_sock *u = unix_sk(sk);
2673 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2674 skb_unlink(skb, &sk->sk_receive_queue);
2678 if (skb == u->oob_skb) {
2681 } else if (sock_flag(sk, SOCK_URGINLINE)) {
2682 if (!(flags & MSG_PEEK)) {
2683 WRITE_ONCE(u->oob_skb, NULL);
2686 } else if (!(flags & MSG_PEEK)) {
2687 skb_unlink(skb, &sk->sk_receive_queue);
2689 skb = skb_peek(&sk->sk_receive_queue);
2697 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2699 if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2702 return unix_read_skb(sk, recv_actor);
2705 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2708 struct scm_cookie scm;
2709 struct socket *sock = state->socket;
2710 struct sock *sk = sock->sk;
2711 struct unix_sock *u = unix_sk(sk);
2713 int flags = state->flags;
2714 int noblock = flags & MSG_DONTWAIT;
2715 bool check_creds = false;
2720 size_t size = state->size;
2721 unsigned int last_len;
2723 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2728 if (unlikely(flags & MSG_OOB)) {
2730 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2731 err = unix_stream_recv_urg(state);
2736 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2737 timeo = sock_rcvtimeo(sk, noblock);
2739 memset(&scm, 0, sizeof(scm));
2741 /* Lock the socket to prevent queue disordering
2742 * while sleeps in memcpy_tomsg
2744 mutex_lock(&u->iolock);
2746 skip = max(sk_peek_offset(sk, flags), 0);
2751 struct sk_buff *skb, *last;
2754 unix_state_lock(sk);
2755 if (sock_flag(sk, SOCK_DEAD)) {
2759 last = skb = skb_peek(&sk->sk_receive_queue);
2760 last_len = last ? last->len : 0;
2762 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2764 skb = manage_oob(skb, sk, flags, copied);
2766 unix_state_unlock(sk);
2775 if (copied >= target)
2779 * POSIX 1003.1g mandates this order.
2782 err = sock_error(sk);
2785 if (sk->sk_shutdown & RCV_SHUTDOWN)
2788 unix_state_unlock(sk);
2794 mutex_unlock(&u->iolock);
2796 timeo = unix_stream_data_wait(sk, timeo, last,
2797 last_len, freezable);
2799 if (signal_pending(current)) {
2800 err = sock_intr_errno(timeo);
2805 mutex_lock(&u->iolock);
2808 unix_state_unlock(sk);
2812 while (skip >= unix_skb_len(skb)) {
2813 skip -= unix_skb_len(skb);
2815 last_len = skb->len;
2816 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2821 unix_state_unlock(sk);
2824 /* Never glue messages from different writers */
2825 if (!unix_skb_scm_eq(skb, &scm))
2827 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2828 /* Copy credentials */
2829 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2830 unix_set_secdata(&scm, skb);
2834 /* Copy address just once */
2835 if (state->msg && state->msg->msg_name) {
2836 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2837 state->msg->msg_name);
2838 unix_copy_addr(state->msg, skb->sk);
2842 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2844 chunk = state->recv_actor(skb, skip, chunk, state);
2845 drop_skb = !unix_skb_len(skb);
2846 /* skb is only safe to use if !drop_skb */
2857 /* the skb was touched by a concurrent reader;
2858 * we should not expect anything from this skb
2859 * anymore and assume it invalid - we can be
2860 * sure it was dropped from the socket queue
2862 * let's report a short read
2868 /* Mark read part of skb as used */
2869 if (!(flags & MSG_PEEK)) {
2870 UNIXCB(skb).consumed += chunk;
2872 sk_peek_offset_bwd(sk, chunk);
2874 if (UNIXCB(skb).fp) {
2875 scm_stat_del(sk, skb);
2876 unix_detach_fds(&scm, skb);
2879 if (unix_skb_len(skb))
2882 skb_unlink(skb, &sk->sk_receive_queue);
2888 /* It is questionable, see note in unix_dgram_recvmsg.
2891 unix_peek_fds(&scm, skb);
2893 sk_peek_offset_fwd(sk, chunk);
2900 last_len = skb->len;
2901 unix_state_lock(sk);
2902 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2905 unix_state_unlock(sk);
2910 mutex_unlock(&u->iolock);
2912 scm_recv(sock, state->msg, &scm, flags);
2916 return copied ? : err;
2919 static int unix_stream_read_actor(struct sk_buff *skb,
2920 int skip, int chunk,
2921 struct unix_stream_read_state *state)
2925 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2927 return ret ?: chunk;
2930 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2931 size_t size, int flags)
2933 struct unix_stream_read_state state = {
2934 .recv_actor = unix_stream_read_actor,
2935 .socket = sk->sk_socket,
2941 return unix_stream_read_generic(&state, true);
2944 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2945 size_t size, int flags)
2947 struct unix_stream_read_state state = {
2948 .recv_actor = unix_stream_read_actor,
2955 #ifdef CONFIG_BPF_SYSCALL
2956 struct sock *sk = sock->sk;
2957 const struct proto *prot = READ_ONCE(sk->sk_prot);
2959 if (prot != &unix_stream_proto)
2960 return prot->recvmsg(sk, msg, size, flags, NULL);
2962 return unix_stream_read_generic(&state, true);
2965 static int unix_stream_splice_actor(struct sk_buff *skb,
2966 int skip, int chunk,
2967 struct unix_stream_read_state *state)
2969 return skb_splice_bits(skb, state->socket->sk,
2970 UNIXCB(skb).consumed + skip,
2971 state->pipe, chunk, state->splice_flags);
2974 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2975 struct pipe_inode_info *pipe,
2976 size_t size, unsigned int flags)
2978 struct unix_stream_read_state state = {
2979 .recv_actor = unix_stream_splice_actor,
2983 .splice_flags = flags,
2986 if (unlikely(*ppos))
2989 if (sock->file->f_flags & O_NONBLOCK ||
2990 flags & SPLICE_F_NONBLOCK)
2991 state.flags = MSG_DONTWAIT;
2993 return unix_stream_read_generic(&state, false);
2996 static int unix_shutdown(struct socket *sock, int mode)
2998 struct sock *sk = sock->sk;
3001 if (mode < SHUT_RD || mode > SHUT_RDWR)
3004 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
3005 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
3006 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3010 unix_state_lock(sk);
3011 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3012 other = unix_peer(sk);
3015 unix_state_unlock(sk);
3016 sk->sk_state_change(sk);
3019 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3022 const struct proto *prot = READ_ONCE(other->sk_prot);
3025 prot->unhash(other);
3026 if (mode&RCV_SHUTDOWN)
3027 peer_mode |= SEND_SHUTDOWN;
3028 if (mode&SEND_SHUTDOWN)
3029 peer_mode |= RCV_SHUTDOWN;
3030 unix_state_lock(other);
3031 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3032 unix_state_unlock(other);
3033 other->sk_state_change(other);
3034 if (peer_mode == SHUTDOWN_MASK)
3035 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3036 else if (peer_mode & RCV_SHUTDOWN)
3037 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3045 long unix_inq_len(struct sock *sk)
3047 struct sk_buff *skb;
3050 if (sk->sk_state == TCP_LISTEN)
3053 spin_lock(&sk->sk_receive_queue.lock);
3054 if (sk->sk_type == SOCK_STREAM ||
3055 sk->sk_type == SOCK_SEQPACKET) {
3056 skb_queue_walk(&sk->sk_receive_queue, skb)
3057 amount += unix_skb_len(skb);
3059 skb = skb_peek(&sk->sk_receive_queue);
3063 spin_unlock(&sk->sk_receive_queue.lock);
3067 EXPORT_SYMBOL_GPL(unix_inq_len);
3069 long unix_outq_len(struct sock *sk)
3071 return sk_wmem_alloc_get(sk);
3073 EXPORT_SYMBOL_GPL(unix_outq_len);
3075 static int unix_open_file(struct sock *sk)
3081 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3084 if (!smp_load_acquire(&unix_sk(sk)->addr))
3087 path = unix_sk(sk)->path;
3093 fd = get_unused_fd_flags(O_CLOEXEC);
3097 f = dentry_open(&path, O_PATH, current_cred());
3111 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3113 struct sock *sk = sock->sk;
3119 amount = unix_outq_len(sk);
3120 err = put_user(amount, (int __user *)arg);
3123 amount = unix_inq_len(sk);
3127 err = put_user(amount, (int __user *)arg);
3130 err = unix_open_file(sk);
3132 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3135 struct sk_buff *skb;
3138 skb = skb_peek(&sk->sk_receive_queue);
3139 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3141 err = put_user(answ, (int __user *)arg);
3152 #ifdef CONFIG_COMPAT
3153 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3155 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3159 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3161 struct sock *sk = sock->sk;
3165 sock_poll_wait(file, sock, wait);
3167 shutdown = READ_ONCE(sk->sk_shutdown);
3169 /* exceptional events? */
3170 if (READ_ONCE(sk->sk_err))
3172 if (shutdown == SHUTDOWN_MASK)
3174 if (shutdown & RCV_SHUTDOWN)
3175 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3178 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3179 mask |= EPOLLIN | EPOLLRDNORM;
3180 if (sk_is_readable(sk))
3181 mask |= EPOLLIN | EPOLLRDNORM;
3182 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3183 if (READ_ONCE(unix_sk(sk)->oob_skb))
3187 /* Connection-based need to check for termination and startup */
3188 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3189 sk->sk_state == TCP_CLOSE)
3193 * we set writable also when the other side has shut down the
3194 * connection. This prevents stuck sockets.
3196 if (unix_writable(sk))
3197 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3202 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3205 struct sock *sk = sock->sk, *other;
3206 unsigned int writable;
3210 sock_poll_wait(file, sock, wait);
3212 shutdown = READ_ONCE(sk->sk_shutdown);
3214 /* exceptional events? */
3215 if (READ_ONCE(sk->sk_err) ||
3216 !skb_queue_empty_lockless(&sk->sk_error_queue))
3218 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3220 if (shutdown & RCV_SHUTDOWN)
3221 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3222 if (shutdown == SHUTDOWN_MASK)
3226 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3227 mask |= EPOLLIN | EPOLLRDNORM;
3228 if (sk_is_readable(sk))
3229 mask |= EPOLLIN | EPOLLRDNORM;
3231 /* Connection-based need to check for termination and startup */
3232 if (sk->sk_type == SOCK_SEQPACKET) {
3233 if (sk->sk_state == TCP_CLOSE)
3235 /* connection hasn't started yet? */
3236 if (sk->sk_state == TCP_SYN_SENT)
3240 /* No write status requested, avoid expensive OUT tests. */
3241 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3244 writable = unix_writable(sk);
3246 unix_state_lock(sk);
3248 other = unix_peer(sk);
3249 if (other && unix_peer(other) != sk &&
3250 unix_recvq_full_lockless(other) &&
3251 unix_dgram_peer_wake_me(sk, other))
3254 unix_state_unlock(sk);
3258 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3260 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3265 #ifdef CONFIG_PROC_FS
3267 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3269 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3270 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3271 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3273 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3275 unsigned long offset = get_offset(*pos);
3276 unsigned long bucket = get_bucket(*pos);
3277 unsigned long count = 0;
3280 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3281 sk; sk = sk_next(sk)) {
3282 if (++count == offset)
3289 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3291 unsigned long bucket = get_bucket(*pos);
3292 struct net *net = seq_file_net(seq);
3295 while (bucket < UNIX_HASH_SIZE) {
3296 spin_lock(&net->unx.table.locks[bucket]);
3298 sk = unix_from_bucket(seq, pos);
3302 spin_unlock(&net->unx.table.locks[bucket]);
3304 *pos = set_bucket_offset(++bucket, 1);
3310 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3313 unsigned long bucket = get_bucket(*pos);
3320 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3322 *pos = set_bucket_offset(++bucket, 1);
3324 return unix_get_first(seq, pos);
3327 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3330 return SEQ_START_TOKEN;
3332 return unix_get_first(seq, pos);
3335 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3339 if (v == SEQ_START_TOKEN)
3340 return unix_get_first(seq, pos);
3342 return unix_get_next(seq, v, pos);
3345 static void unix_seq_stop(struct seq_file *seq, void *v)
3347 struct sock *sk = v;
3350 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3353 static int unix_seq_show(struct seq_file *seq, void *v)
3356 if (v == SEQ_START_TOKEN)
3357 seq_puts(seq, "Num RefCount Protocol Flags Type St "
3361 struct unix_sock *u = unix_sk(s);
3364 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3366 refcount_read(&s->sk_refcnt),
3368 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3371 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3372 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3375 if (u->addr) { // under a hash table lock here
3380 len = u->addr->len -
3381 offsetof(struct sockaddr_un, sun_path);
3382 if (u->addr->name->sun_path[0]) {
3388 for ( ; i < len; i++)
3389 seq_putc(seq, u->addr->name->sun_path[i] ?:
3392 unix_state_unlock(s);
3393 seq_putc(seq, '\n');
3399 static const struct seq_operations unix_seq_ops = {
3400 .start = unix_seq_start,
3401 .next = unix_seq_next,
3402 .stop = unix_seq_stop,
3403 .show = unix_seq_show,
3406 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3407 struct bpf_unix_iter_state {
3408 struct seq_net_private p;
3409 unsigned int cur_sk;
3410 unsigned int end_sk;
3411 unsigned int max_sk;
3412 struct sock **batch;
3413 bool st_bucket_done;
3416 struct bpf_iter__unix {
3417 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3418 __bpf_md_ptr(struct unix_sock *, unix_sk);
3419 uid_t uid __aligned(8);
3422 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3423 struct unix_sock *unix_sk, uid_t uid)
3425 struct bpf_iter__unix ctx;
3427 meta->seq_num--; /* skip SEQ_START_TOKEN */
3429 ctx.unix_sk = unix_sk;
3431 return bpf_iter_run_prog(prog, &ctx);
3434 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3437 struct bpf_unix_iter_state *iter = seq->private;
3438 unsigned int expected = 1;
3441 sock_hold(start_sk);
3442 iter->batch[iter->end_sk++] = start_sk;
3444 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3445 if (iter->end_sk < iter->max_sk) {
3447 iter->batch[iter->end_sk++] = sk;
3453 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3458 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3460 while (iter->cur_sk < iter->end_sk)
3461 sock_put(iter->batch[iter->cur_sk++]);
3464 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3465 unsigned int new_batch_sz)
3467 struct sock **new_batch;
3469 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3470 GFP_USER | __GFP_NOWARN);
3474 bpf_iter_unix_put_batch(iter);
3475 kvfree(iter->batch);
3476 iter->batch = new_batch;
3477 iter->max_sk = new_batch_sz;
3482 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3485 struct bpf_unix_iter_state *iter = seq->private;
3486 unsigned int expected;
3487 bool resized = false;
3490 if (iter->st_bucket_done)
3491 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3494 /* Get a new batch */
3498 sk = unix_get_first(seq, pos);
3500 return NULL; /* Done */
3502 expected = bpf_iter_unix_hold_batch(seq, sk);
3504 if (iter->end_sk == expected) {
3505 iter->st_bucket_done = true;
3509 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3517 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3520 return SEQ_START_TOKEN;
3522 /* bpf iter does not support lseek, so it always
3523 * continue from where it was stop()-ped.
3525 return bpf_iter_unix_batch(seq, pos);
3528 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3530 struct bpf_unix_iter_state *iter = seq->private;
3533 /* Whenever seq_next() is called, the iter->cur_sk is
3534 * done with seq_show(), so advance to the next sk in
3537 if (iter->cur_sk < iter->end_sk)
3538 sock_put(iter->batch[iter->cur_sk++]);
3542 if (iter->cur_sk < iter->end_sk)
3543 sk = iter->batch[iter->cur_sk];
3545 sk = bpf_iter_unix_batch(seq, pos);
3550 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3552 struct bpf_iter_meta meta;
3553 struct bpf_prog *prog;
3554 struct sock *sk = v;
3559 if (v == SEQ_START_TOKEN)
3562 slow = lock_sock_fast(sk);
3564 if (unlikely(sk_unhashed(sk))) {
3569 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3571 prog = bpf_iter_get_info(&meta, false);
3572 ret = unix_prog_seq_show(prog, &meta, v, uid);
3574 unlock_sock_fast(sk, slow);
3578 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3580 struct bpf_unix_iter_state *iter = seq->private;
3581 struct bpf_iter_meta meta;
3582 struct bpf_prog *prog;
3586 prog = bpf_iter_get_info(&meta, true);
3588 (void)unix_prog_seq_show(prog, &meta, v, 0);
3591 if (iter->cur_sk < iter->end_sk)
3592 bpf_iter_unix_put_batch(iter);
3595 static const struct seq_operations bpf_iter_unix_seq_ops = {
3596 .start = bpf_iter_unix_seq_start,
3597 .next = bpf_iter_unix_seq_next,
3598 .stop = bpf_iter_unix_seq_stop,
3599 .show = bpf_iter_unix_seq_show,
3604 static const struct net_proto_family unix_family_ops = {
3606 .create = unix_create,
3607 .owner = THIS_MODULE,
3611 static int __net_init unix_net_init(struct net *net)
3615 net->unx.sysctl_max_dgram_qlen = 10;
3616 if (unix_sysctl_register(net))
3619 #ifdef CONFIG_PROC_FS
3620 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3621 sizeof(struct seq_net_private)))
3625 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3626 sizeof(spinlock_t), GFP_KERNEL);
3627 if (!net->unx.table.locks)
3630 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3631 sizeof(struct hlist_head),
3633 if (!net->unx.table.buckets)
3636 for (i = 0; i < UNIX_HASH_SIZE; i++) {
3637 spin_lock_init(&net->unx.table.locks[i]);
3638 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3644 kvfree(net->unx.table.locks);
3646 #ifdef CONFIG_PROC_FS
3647 remove_proc_entry("unix", net->proc_net);
3650 unix_sysctl_unregister(net);
3655 static void __net_exit unix_net_exit(struct net *net)
3657 kvfree(net->unx.table.buckets);
3658 kvfree(net->unx.table.locks);
3659 unix_sysctl_unregister(net);
3660 remove_proc_entry("unix", net->proc_net);
3663 static struct pernet_operations unix_net_ops = {
3664 .init = unix_net_init,
3665 .exit = unix_net_exit,
3668 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3669 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3670 struct unix_sock *unix_sk, uid_t uid)
3672 #define INIT_BATCH_SZ 16
3674 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3676 struct bpf_unix_iter_state *iter = priv_data;
3679 err = bpf_iter_init_seq_net(priv_data, aux);
3683 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3685 bpf_iter_fini_seq_net(priv_data);
3692 static void bpf_iter_fini_unix(void *priv_data)
3694 struct bpf_unix_iter_state *iter = priv_data;
3696 bpf_iter_fini_seq_net(priv_data);
3697 kvfree(iter->batch);
3700 static const struct bpf_iter_seq_info unix_seq_info = {
3701 .seq_ops = &bpf_iter_unix_seq_ops,
3702 .init_seq_private = bpf_iter_init_unix,
3703 .fini_seq_private = bpf_iter_fini_unix,
3704 .seq_priv_size = sizeof(struct bpf_unix_iter_state),
3707 static const struct bpf_func_proto *
3708 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3709 const struct bpf_prog *prog)
3712 case BPF_FUNC_setsockopt:
3713 return &bpf_sk_setsockopt_proto;
3714 case BPF_FUNC_getsockopt:
3715 return &bpf_sk_getsockopt_proto;
3721 static struct bpf_iter_reg unix_reg_info = {
3723 .ctx_arg_info_size = 1,
3725 { offsetof(struct bpf_iter__unix, unix_sk),
3726 PTR_TO_BTF_ID_OR_NULL },
3728 .get_func_proto = bpf_iter_unix_get_func_proto,
3729 .seq_info = &unix_seq_info,
3732 static void __init bpf_iter_register(void)
3734 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3735 if (bpf_iter_reg_target(&unix_reg_info))
3736 pr_warn("Warning: could not register bpf iterator unix\n");
3740 static int __init af_unix_init(void)
3744 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3746 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3747 spin_lock_init(&bsd_socket_locks[i]);
3748 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3751 rc = proto_register(&unix_dgram_proto, 1);
3753 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3757 rc = proto_register(&unix_stream_proto, 1);
3759 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3760 proto_unregister(&unix_dgram_proto);
3764 sock_register(&unix_family_ops);
3765 register_pernet_subsys(&unix_net_ops);
3766 unix_bpf_build_proto();
3768 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3769 bpf_iter_register();
3776 static void __exit af_unix_exit(void)
3778 sock_unregister(PF_UNIX);
3779 proto_unregister(&unix_dgram_proto);
3780 proto_unregister(&unix_stream_proto);
3781 unregister_pernet_subsys(&unix_net_ops);
3784 /* Earlier than device_initcall() so that other drivers invoking
3785 request_module() don't end up in a loop when modprobe tries
3786 to use a UNIX socket. But later than subsys_initcall() because
3787 we depend on stuff initialised there */
3788 fs_initcall(af_unix_init);
3789 module_exit(af_unix_exit);
3791 MODULE_LICENSE("GPL");
3792 MODULE_ALIAS_NETPROTO(PF_UNIX);