]> Git Repo - J-linux.git/blob - net/unix/af_unix.c
Merge tag 'net-6.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
[J-linux.git] / net / unix / af_unix.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:        Implementation of BSD Unix domain sockets.
4  *
5  * Authors:     Alan Cox, <[email protected]>
6  *
7  * Fixes:
8  *              Linus Torvalds  :       Assorted bug cures.
9  *              Niibe Yutaka    :       async I/O support.
10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
11  *              Alan Cox        :       Limit size of allocated blocks.
12  *              Alan Cox        :       Fixed the stupid socketpair bug.
13  *              Alan Cox        :       BSD compatibility fine tuning.
14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
15  *              Alan Cox        :       Sorted out a proper draft version of
16  *                                      file descriptor passing hacked up from
17  *                                      Mike Shaver's work.
18  *              Marty Leisner   :       Fixes to fd passing
19  *              Nick Nevin      :       recvmsg bugfix.
20  *              Alan Cox        :       Started proper garbage collector
21  *              Heiko EiBfeldt  :       Missing verify_area check
22  *              Alan Cox        :       Started POSIXisms
23  *              Andreas Schwab  :       Replace inode by dentry for proper
24  *                                      reference counting
25  *              Kirk Petersen   :       Made this a module
26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
27  *                                      Lots of bug fixes.
28  *           Alexey Kuznetosv   :       Repaired (I hope) bugs introduces
29  *                                      by above two patches.
30  *           Andrea Arcangeli   :       If possible we block in connect(2)
31  *                                      if the max backlog of the listen socket
32  *                                      is been reached. This won't break
33  *                                      old apps and it will avoid huge amount
34  *                                      of socks hashed (this for unix_gc()
35  *                                      performances reasons).
36  *                                      Security fix that limits the max
37  *                                      number of socks to 2*max_files and
38  *                                      the number of skb queueable in the
39  *                                      dgram receiver.
40  *              Artur Skawina   :       Hash function optimizations
41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
42  *            Malcolm Beattie   :       Set peercred for socketpair
43  *           Michal Ostrowski   :       Module initialization cleanup.
44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
45  *                                      the core infrastructure is doing that
46  *                                      for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *      [TO FIX]
51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
52  *              other the moment one end closes.
53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *      [NOT TO FIX]
56  *      accept() returns a path name even if the connecting socket has closed
57  *              in the meantime (BSD loses the path and gives up).
58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *      BSD af_unix apparently has connect forgetting to block properly.
62  *              (need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *      Bug fixes and improvements.
66  *              - client shutdown killed server socket.
67  *              - removed all useless cli/sti pairs.
68  *
69  *      Semantic changes/extensions.
70  *              - generic control message passing.
71  *              - SCM_CREDENTIALS control message.
72  *              - "Abstract" (not FS based) socket bindings.
73  *                Abstract names are sequences of bytes (not zero terminated)
74  *                started by 0, so that this name space does not intersect
75  *                with BSD names.
76  */
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132         unsigned long hash = (unsigned long)sk;
133
134         hash ^= hash >> 16;
135         hash ^= hash >> 8;
136         hash ^= sk->sk_type;
137
138         return hash & UNIX_HASH_MOD;
139 }
140
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143         return i->i_ino & UNIX_HASH_MOD;
144 }
145
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147                                        int addr_len, int type)
148 {
149         __wsum csum = csum_partial(sunaddr, addr_len, 0);
150         unsigned int hash;
151
152         hash = (__force unsigned int)csum_fold(csum);
153         hash ^= hash >> 8;
154         hash ^= type;
155
156         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
158
159 static void unix_table_double_lock(struct net *net,
160                                    unsigned int hash1, unsigned int hash2)
161 {
162         if (hash1 == hash2) {
163                 spin_lock(&net->unx.table.locks[hash1]);
164                 return;
165         }
166
167         if (hash1 > hash2)
168                 swap(hash1, hash2);
169
170         spin_lock(&net->unx.table.locks[hash1]);
171         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173
174 static void unix_table_double_unlock(struct net *net,
175                                      unsigned int hash1, unsigned int hash2)
176 {
177         if (hash1 == hash2) {
178                 spin_unlock(&net->unx.table.locks[hash1]);
179                 return;
180         }
181
182         spin_unlock(&net->unx.table.locks[hash1]);
183         spin_unlock(&net->unx.table.locks[hash2]);
184 }
185
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189         UNIXCB(skb).secid = scm->secid;
190 }
191
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194         scm->secid = UNIXCB(skb).secid;
195 }
196
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199         return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210         return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213
214 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
215 {
216         return unix_peer(osk) == sk;
217 }
218
219 static inline int unix_may_send(struct sock *sk, struct sock *osk)
220 {
221         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
222 }
223
224 static inline int unix_recvq_full(const struct sock *sk)
225 {
226         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
227 }
228
229 static inline int unix_recvq_full_lockless(const struct sock *sk)
230 {
231         return skb_queue_len_lockless(&sk->sk_receive_queue) >
232                 READ_ONCE(sk->sk_max_ack_backlog);
233 }
234
235 struct sock *unix_peer_get(struct sock *s)
236 {
237         struct sock *peer;
238
239         unix_state_lock(s);
240         peer = unix_peer(s);
241         if (peer)
242                 sock_hold(peer);
243         unix_state_unlock(s);
244         return peer;
245 }
246 EXPORT_SYMBOL_GPL(unix_peer_get);
247
248 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
249                                              int addr_len)
250 {
251         struct unix_address *addr;
252
253         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
254         if (!addr)
255                 return NULL;
256
257         refcount_set(&addr->refcnt, 1);
258         addr->len = addr_len;
259         memcpy(addr->name, sunaddr, addr_len);
260
261         return addr;
262 }
263
264 static inline void unix_release_addr(struct unix_address *addr)
265 {
266         if (refcount_dec_and_test(&addr->refcnt))
267                 kfree(addr);
268 }
269
270 /*
271  *      Check unix socket name:
272  *              - should be not zero length.
273  *              - if started by not zero, should be NULL terminated (FS object)
274  *              - if started by zero, it is abstract name.
275  */
276
277 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
278 {
279         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
280             addr_len > sizeof(*sunaddr))
281                 return -EINVAL;
282
283         if (sunaddr->sun_family != AF_UNIX)
284                 return -EINVAL;
285
286         return 0;
287 }
288
289 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
290 {
291         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
292         short offset = offsetof(struct sockaddr_storage, __data);
293
294         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
295
296         /* This may look like an off by one error but it is a bit more
297          * subtle.  108 is the longest valid AF_UNIX path for a binding.
298          * sun_path[108] doesn't as such exist.  However in kernel space
299          * we are guaranteed that it is a valid memory location in our
300          * kernel address buffer because syscall functions always pass
301          * a pointer of struct sockaddr_storage which has a bigger buffer
302          * than 108.  Also, we must terminate sun_path for strlen() in
303          * getname_kernel().
304          */
305         addr->__data[addr_len - offset] = 0;
306
307         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
308          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
309          * know the actual buffer.
310          */
311         return strlen(addr->__data) + offset + 1;
312 }
313
314 static void __unix_remove_socket(struct sock *sk)
315 {
316         sk_del_node_init(sk);
317 }
318
319 static void __unix_insert_socket(struct net *net, struct sock *sk)
320 {
321         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
322         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
323 }
324
325 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
326                                  struct unix_address *addr, unsigned int hash)
327 {
328         __unix_remove_socket(sk);
329         smp_store_release(&unix_sk(sk)->addr, addr);
330
331         sk->sk_hash = hash;
332         __unix_insert_socket(net, sk);
333 }
334
335 static void unix_remove_socket(struct net *net, struct sock *sk)
336 {
337         spin_lock(&net->unx.table.locks[sk->sk_hash]);
338         __unix_remove_socket(sk);
339         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
340 }
341
342 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
343 {
344         spin_lock(&net->unx.table.locks[sk->sk_hash]);
345         __unix_insert_socket(net, sk);
346         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
347 }
348
349 static void unix_insert_bsd_socket(struct sock *sk)
350 {
351         spin_lock(&bsd_socket_locks[sk->sk_hash]);
352         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
353         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
354 }
355
356 static void unix_remove_bsd_socket(struct sock *sk)
357 {
358         if (!hlist_unhashed(&sk->sk_bind_node)) {
359                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
360                 __sk_del_bind_node(sk);
361                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
362
363                 sk_node_init(&sk->sk_bind_node);
364         }
365 }
366
367 static struct sock *__unix_find_socket_byname(struct net *net,
368                                               struct sockaddr_un *sunname,
369                                               int len, unsigned int hash)
370 {
371         struct sock *s;
372
373         sk_for_each(s, &net->unx.table.buckets[hash]) {
374                 struct unix_sock *u = unix_sk(s);
375
376                 if (u->addr->len == len &&
377                     !memcmp(u->addr->name, sunname, len))
378                         return s;
379         }
380         return NULL;
381 }
382
383 static inline struct sock *unix_find_socket_byname(struct net *net,
384                                                    struct sockaddr_un *sunname,
385                                                    int len, unsigned int hash)
386 {
387         struct sock *s;
388
389         spin_lock(&net->unx.table.locks[hash]);
390         s = __unix_find_socket_byname(net, sunname, len, hash);
391         if (s)
392                 sock_hold(s);
393         spin_unlock(&net->unx.table.locks[hash]);
394         return s;
395 }
396
397 static struct sock *unix_find_socket_byinode(struct inode *i)
398 {
399         unsigned int hash = unix_bsd_hash(i);
400         struct sock *s;
401
402         spin_lock(&bsd_socket_locks[hash]);
403         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
404                 struct dentry *dentry = unix_sk(s)->path.dentry;
405
406                 if (dentry && d_backing_inode(dentry) == i) {
407                         sock_hold(s);
408                         spin_unlock(&bsd_socket_locks[hash]);
409                         return s;
410                 }
411         }
412         spin_unlock(&bsd_socket_locks[hash]);
413         return NULL;
414 }
415
416 /* Support code for asymmetrically connected dgram sockets
417  *
418  * If a datagram socket is connected to a socket not itself connected
419  * to the first socket (eg, /dev/log), clients may only enqueue more
420  * messages if the present receive queue of the server socket is not
421  * "too large". This means there's a second writeability condition
422  * poll and sendmsg need to test. The dgram recv code will do a wake
423  * up on the peer_wait wait queue of a socket upon reception of a
424  * datagram which needs to be propagated to sleeping would-be writers
425  * since these might not have sent anything so far. This can't be
426  * accomplished via poll_wait because the lifetime of the server
427  * socket might be less than that of its clients if these break their
428  * association with it or if the server socket is closed while clients
429  * are still connected to it and there's no way to inform "a polling
430  * implementation" that it should let go of a certain wait queue
431  *
432  * In order to propagate a wake up, a wait_queue_entry_t of the client
433  * socket is enqueued on the peer_wait queue of the server socket
434  * whose wake function does a wake_up on the ordinary client socket
435  * wait queue. This connection is established whenever a write (or
436  * poll for write) hit the flow control condition and broken when the
437  * association to the server socket is dissolved or after a wake up
438  * was relayed.
439  */
440
441 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
442                                       void *key)
443 {
444         struct unix_sock *u;
445         wait_queue_head_t *u_sleep;
446
447         u = container_of(q, struct unix_sock, peer_wake);
448
449         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
450                             q);
451         u->peer_wake.private = NULL;
452
453         /* relaying can only happen while the wq still exists */
454         u_sleep = sk_sleep(&u->sk);
455         if (u_sleep)
456                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
457
458         return 0;
459 }
460
461 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
462 {
463         struct unix_sock *u, *u_other;
464         int rc;
465
466         u = unix_sk(sk);
467         u_other = unix_sk(other);
468         rc = 0;
469         spin_lock(&u_other->peer_wait.lock);
470
471         if (!u->peer_wake.private) {
472                 u->peer_wake.private = other;
473                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
474
475                 rc = 1;
476         }
477
478         spin_unlock(&u_other->peer_wait.lock);
479         return rc;
480 }
481
482 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
483                                             struct sock *other)
484 {
485         struct unix_sock *u, *u_other;
486
487         u = unix_sk(sk);
488         u_other = unix_sk(other);
489         spin_lock(&u_other->peer_wait.lock);
490
491         if (u->peer_wake.private == other) {
492                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
493                 u->peer_wake.private = NULL;
494         }
495
496         spin_unlock(&u_other->peer_wait.lock);
497 }
498
499 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
500                                                    struct sock *other)
501 {
502         unix_dgram_peer_wake_disconnect(sk, other);
503         wake_up_interruptible_poll(sk_sleep(sk),
504                                    EPOLLOUT |
505                                    EPOLLWRNORM |
506                                    EPOLLWRBAND);
507 }
508
509 /* preconditions:
510  *      - unix_peer(sk) == other
511  *      - association is stable
512  */
513 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
514 {
515         int connected;
516
517         connected = unix_dgram_peer_wake_connect(sk, other);
518
519         /* If other is SOCK_DEAD, we want to make sure we signal
520          * POLLOUT, such that a subsequent write() can get a
521          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
522          * to other and its full, we will hang waiting for POLLOUT.
523          */
524         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
525                 return 1;
526
527         if (connected)
528                 unix_dgram_peer_wake_disconnect(sk, other);
529
530         return 0;
531 }
532
533 static int unix_writable(const struct sock *sk)
534 {
535         return sk->sk_state != TCP_LISTEN &&
536                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
537 }
538
539 static void unix_write_space(struct sock *sk)
540 {
541         struct socket_wq *wq;
542
543         rcu_read_lock();
544         if (unix_writable(sk)) {
545                 wq = rcu_dereference(sk->sk_wq);
546                 if (skwq_has_sleeper(wq))
547                         wake_up_interruptible_sync_poll(&wq->wait,
548                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
549                 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
550         }
551         rcu_read_unlock();
552 }
553
554 /* When dgram socket disconnects (or changes its peer), we clear its receive
555  * queue of packets arrived from previous peer. First, it allows to do
556  * flow control based only on wmem_alloc; second, sk connected to peer
557  * may receive messages only from that peer. */
558 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
559 {
560         if (!skb_queue_empty(&sk->sk_receive_queue)) {
561                 skb_queue_purge(&sk->sk_receive_queue);
562                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
563
564                 /* If one link of bidirectional dgram pipe is disconnected,
565                  * we signal error. Messages are lost. Do not make this,
566                  * when peer was not connected to us.
567                  */
568                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
569                         WRITE_ONCE(other->sk_err, ECONNRESET);
570                         sk_error_report(other);
571                 }
572         }
573         other->sk_state = TCP_CLOSE;
574 }
575
576 static void unix_sock_destructor(struct sock *sk)
577 {
578         struct unix_sock *u = unix_sk(sk);
579
580         skb_queue_purge(&sk->sk_receive_queue);
581
582         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
583         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
584         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
585         if (!sock_flag(sk, SOCK_DEAD)) {
586                 pr_info("Attempt to release alive unix socket: %p\n", sk);
587                 return;
588         }
589
590         if (u->addr)
591                 unix_release_addr(u->addr);
592
593         atomic_long_dec(&unix_nr_socks);
594         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
595 #ifdef UNIX_REFCNT_DEBUG
596         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
597                 atomic_long_read(&unix_nr_socks));
598 #endif
599 }
600
601 static void unix_release_sock(struct sock *sk, int embrion)
602 {
603         struct unix_sock *u = unix_sk(sk);
604         struct sock *skpair;
605         struct sk_buff *skb;
606         struct path path;
607         int state;
608
609         unix_remove_socket(sock_net(sk), sk);
610         unix_remove_bsd_socket(sk);
611
612         /* Clear state */
613         unix_state_lock(sk);
614         sock_orphan(sk);
615         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
616         path         = u->path;
617         u->path.dentry = NULL;
618         u->path.mnt = NULL;
619         state = sk->sk_state;
620         sk->sk_state = TCP_CLOSE;
621
622         skpair = unix_peer(sk);
623         unix_peer(sk) = NULL;
624
625         unix_state_unlock(sk);
626
627 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
628         if (u->oob_skb) {
629                 kfree_skb(u->oob_skb);
630                 u->oob_skb = NULL;
631         }
632 #endif
633
634         wake_up_interruptible_all(&u->peer_wait);
635
636         if (skpair != NULL) {
637                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
638                         unix_state_lock(skpair);
639                         /* No more writes */
640                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
641                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
642                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
643                         unix_state_unlock(skpair);
644                         skpair->sk_state_change(skpair);
645                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
646                 }
647
648                 unix_dgram_peer_wake_disconnect(sk, skpair);
649                 sock_put(skpair); /* It may now die */
650         }
651
652         /* Try to flush out this socket. Throw out buffers at least */
653
654         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
655                 if (state == TCP_LISTEN)
656                         unix_release_sock(skb->sk, 1);
657                 /* passed fds are erased in the kfree_skb hook        */
658                 UNIXCB(skb).consumed = skb->len;
659                 kfree_skb(skb);
660         }
661
662         if (path.dentry)
663                 path_put(&path);
664
665         sock_put(sk);
666
667         /* ---- Socket is dead now and most probably destroyed ---- */
668
669         /*
670          * Fixme: BSD difference: In BSD all sockets connected to us get
671          *        ECONNRESET and we die on the spot. In Linux we behave
672          *        like files and pipes do and wait for the last
673          *        dereference.
674          *
675          * Can't we simply set sock->err?
676          *
677          *        What the above comment does talk about? --ANK(980817)
678          */
679
680         if (READ_ONCE(unix_tot_inflight))
681                 unix_gc();              /* Garbage collect fds */
682 }
683
684 static void init_peercred(struct sock *sk)
685 {
686         const struct cred *old_cred;
687         struct pid *old_pid;
688
689         spin_lock(&sk->sk_peer_lock);
690         old_pid = sk->sk_peer_pid;
691         old_cred = sk->sk_peer_cred;
692         sk->sk_peer_pid  = get_pid(task_tgid(current));
693         sk->sk_peer_cred = get_current_cred();
694         spin_unlock(&sk->sk_peer_lock);
695
696         put_pid(old_pid);
697         put_cred(old_cred);
698 }
699
700 static void copy_peercred(struct sock *sk, struct sock *peersk)
701 {
702         const struct cred *old_cred;
703         struct pid *old_pid;
704
705         if (sk < peersk) {
706                 spin_lock(&sk->sk_peer_lock);
707                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
708         } else {
709                 spin_lock(&peersk->sk_peer_lock);
710                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
711         }
712         old_pid = sk->sk_peer_pid;
713         old_cred = sk->sk_peer_cred;
714         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
715         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
716
717         spin_unlock(&sk->sk_peer_lock);
718         spin_unlock(&peersk->sk_peer_lock);
719
720         put_pid(old_pid);
721         put_cred(old_cred);
722 }
723
724 static int unix_listen(struct socket *sock, int backlog)
725 {
726         int err;
727         struct sock *sk = sock->sk;
728         struct unix_sock *u = unix_sk(sk);
729
730         err = -EOPNOTSUPP;
731         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
732                 goto out;       /* Only stream/seqpacket sockets accept */
733         err = -EINVAL;
734         if (!u->addr)
735                 goto out;       /* No listens on an unbound socket */
736         unix_state_lock(sk);
737         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
738                 goto out_unlock;
739         if (backlog > sk->sk_max_ack_backlog)
740                 wake_up_interruptible_all(&u->peer_wait);
741         sk->sk_max_ack_backlog  = backlog;
742         sk->sk_state            = TCP_LISTEN;
743         /* set credentials so connect can copy them */
744         init_peercred(sk);
745         err = 0;
746
747 out_unlock:
748         unix_state_unlock(sk);
749 out:
750         return err;
751 }
752
753 static int unix_release(struct socket *);
754 static int unix_bind(struct socket *, struct sockaddr *, int);
755 static int unix_stream_connect(struct socket *, struct sockaddr *,
756                                int addr_len, int flags);
757 static int unix_socketpair(struct socket *, struct socket *);
758 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
759 static int unix_getname(struct socket *, struct sockaddr *, int);
760 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
761 static __poll_t unix_dgram_poll(struct file *, struct socket *,
762                                     poll_table *);
763 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
764 #ifdef CONFIG_COMPAT
765 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
766 #endif
767 static int unix_shutdown(struct socket *, int);
768 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
769 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
770 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
771                                        struct pipe_inode_info *, size_t size,
772                                        unsigned int flags);
773 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
774 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
775 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
776 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
777 static int unix_dgram_connect(struct socket *, struct sockaddr *,
778                               int, int);
779 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
780 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
781                                   int);
782
783 #ifdef CONFIG_PROC_FS
784 static int unix_count_nr_fds(struct sock *sk)
785 {
786         struct sk_buff *skb;
787         struct unix_sock *u;
788         int nr_fds = 0;
789
790         spin_lock(&sk->sk_receive_queue.lock);
791         skb = skb_peek(&sk->sk_receive_queue);
792         while (skb) {
793                 u = unix_sk(skb->sk);
794                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
795                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
796         }
797         spin_unlock(&sk->sk_receive_queue.lock);
798
799         return nr_fds;
800 }
801
802 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
803 {
804         struct sock *sk = sock->sk;
805         unsigned char s_state;
806         struct unix_sock *u;
807         int nr_fds = 0;
808
809         if (sk) {
810                 s_state = READ_ONCE(sk->sk_state);
811                 u = unix_sk(sk);
812
813                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
814                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
815                  * SOCK_DGRAM is ordinary. So, no lock is needed.
816                  */
817                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
818                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
819                 else if (s_state == TCP_LISTEN)
820                         nr_fds = unix_count_nr_fds(sk);
821
822                 seq_printf(m, "scm_fds: %u\n", nr_fds);
823         }
824 }
825 #else
826 #define unix_show_fdinfo NULL
827 #endif
828
829 static const struct proto_ops unix_stream_ops = {
830         .family =       PF_UNIX,
831         .owner =        THIS_MODULE,
832         .release =      unix_release,
833         .bind =         unix_bind,
834         .connect =      unix_stream_connect,
835         .socketpair =   unix_socketpair,
836         .accept =       unix_accept,
837         .getname =      unix_getname,
838         .poll =         unix_poll,
839         .ioctl =        unix_ioctl,
840 #ifdef CONFIG_COMPAT
841         .compat_ioctl = unix_compat_ioctl,
842 #endif
843         .listen =       unix_listen,
844         .shutdown =     unix_shutdown,
845         .sendmsg =      unix_stream_sendmsg,
846         .recvmsg =      unix_stream_recvmsg,
847         .read_skb =     unix_stream_read_skb,
848         .mmap =         sock_no_mmap,
849         .splice_read =  unix_stream_splice_read,
850         .set_peek_off = sk_set_peek_off,
851         .show_fdinfo =  unix_show_fdinfo,
852 };
853
854 static const struct proto_ops unix_dgram_ops = {
855         .family =       PF_UNIX,
856         .owner =        THIS_MODULE,
857         .release =      unix_release,
858         .bind =         unix_bind,
859         .connect =      unix_dgram_connect,
860         .socketpair =   unix_socketpair,
861         .accept =       sock_no_accept,
862         .getname =      unix_getname,
863         .poll =         unix_dgram_poll,
864         .ioctl =        unix_ioctl,
865 #ifdef CONFIG_COMPAT
866         .compat_ioctl = unix_compat_ioctl,
867 #endif
868         .listen =       sock_no_listen,
869         .shutdown =     unix_shutdown,
870         .sendmsg =      unix_dgram_sendmsg,
871         .read_skb =     unix_read_skb,
872         .recvmsg =      unix_dgram_recvmsg,
873         .mmap =         sock_no_mmap,
874         .set_peek_off = sk_set_peek_off,
875         .show_fdinfo =  unix_show_fdinfo,
876 };
877
878 static const struct proto_ops unix_seqpacket_ops = {
879         .family =       PF_UNIX,
880         .owner =        THIS_MODULE,
881         .release =      unix_release,
882         .bind =         unix_bind,
883         .connect =      unix_stream_connect,
884         .socketpair =   unix_socketpair,
885         .accept =       unix_accept,
886         .getname =      unix_getname,
887         .poll =         unix_dgram_poll,
888         .ioctl =        unix_ioctl,
889 #ifdef CONFIG_COMPAT
890         .compat_ioctl = unix_compat_ioctl,
891 #endif
892         .listen =       unix_listen,
893         .shutdown =     unix_shutdown,
894         .sendmsg =      unix_seqpacket_sendmsg,
895         .recvmsg =      unix_seqpacket_recvmsg,
896         .mmap =         sock_no_mmap,
897         .set_peek_off = sk_set_peek_off,
898         .show_fdinfo =  unix_show_fdinfo,
899 };
900
901 static void unix_close(struct sock *sk, long timeout)
902 {
903         /* Nothing to do here, unix socket does not need a ->close().
904          * This is merely for sockmap.
905          */
906 }
907
908 static void unix_unhash(struct sock *sk)
909 {
910         /* Nothing to do here, unix socket does not need a ->unhash().
911          * This is merely for sockmap.
912          */
913 }
914
915 static bool unix_bpf_bypass_getsockopt(int level, int optname)
916 {
917         if (level == SOL_SOCKET) {
918                 switch (optname) {
919                 case SO_PEERPIDFD:
920                         return true;
921                 default:
922                         return false;
923                 }
924         }
925
926         return false;
927 }
928
929 struct proto unix_dgram_proto = {
930         .name                   = "UNIX",
931         .owner                  = THIS_MODULE,
932         .obj_size               = sizeof(struct unix_sock),
933         .close                  = unix_close,
934         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
935 #ifdef CONFIG_BPF_SYSCALL
936         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
937 #endif
938 };
939
940 struct proto unix_stream_proto = {
941         .name                   = "UNIX-STREAM",
942         .owner                  = THIS_MODULE,
943         .obj_size               = sizeof(struct unix_sock),
944         .close                  = unix_close,
945         .unhash                 = unix_unhash,
946         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
947 #ifdef CONFIG_BPF_SYSCALL
948         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
949 #endif
950 };
951
952 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
953 {
954         struct unix_sock *u;
955         struct sock *sk;
956         int err;
957
958         atomic_long_inc(&unix_nr_socks);
959         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
960                 err = -ENFILE;
961                 goto err;
962         }
963
964         if (type == SOCK_STREAM)
965                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
966         else /*dgram and  seqpacket */
967                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
968
969         if (!sk) {
970                 err = -ENOMEM;
971                 goto err;
972         }
973
974         sock_init_data(sock, sk);
975
976         sk->sk_hash             = unix_unbound_hash(sk);
977         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
978         sk->sk_write_space      = unix_write_space;
979         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
980         sk->sk_destruct         = unix_sock_destructor;
981         u = unix_sk(sk);
982         u->listener = NULL;
983         u->vertex = NULL;
984         u->path.dentry = NULL;
985         u->path.mnt = NULL;
986         spin_lock_init(&u->lock);
987         mutex_init(&u->iolock); /* single task reading lock */
988         mutex_init(&u->bindlock); /* single task binding lock */
989         init_waitqueue_head(&u->peer_wait);
990         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
991         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
992         unix_insert_unbound_socket(net, sk);
993
994         sock_prot_inuse_add(net, sk->sk_prot, 1);
995
996         return sk;
997
998 err:
999         atomic_long_dec(&unix_nr_socks);
1000         return ERR_PTR(err);
1001 }
1002
1003 static int unix_create(struct net *net, struct socket *sock, int protocol,
1004                        int kern)
1005 {
1006         struct sock *sk;
1007
1008         if (protocol && protocol != PF_UNIX)
1009                 return -EPROTONOSUPPORT;
1010
1011         sock->state = SS_UNCONNECTED;
1012
1013         switch (sock->type) {
1014         case SOCK_STREAM:
1015                 sock->ops = &unix_stream_ops;
1016                 break;
1017                 /*
1018                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1019                  *      nothing uses it.
1020                  */
1021         case SOCK_RAW:
1022                 sock->type = SOCK_DGRAM;
1023                 fallthrough;
1024         case SOCK_DGRAM:
1025                 sock->ops = &unix_dgram_ops;
1026                 break;
1027         case SOCK_SEQPACKET:
1028                 sock->ops = &unix_seqpacket_ops;
1029                 break;
1030         default:
1031                 return -ESOCKTNOSUPPORT;
1032         }
1033
1034         sk = unix_create1(net, sock, kern, sock->type);
1035         if (IS_ERR(sk))
1036                 return PTR_ERR(sk);
1037
1038         return 0;
1039 }
1040
1041 static int unix_release(struct socket *sock)
1042 {
1043         struct sock *sk = sock->sk;
1044
1045         if (!sk)
1046                 return 0;
1047
1048         sk->sk_prot->close(sk, 0);
1049         unix_release_sock(sk, 0);
1050         sock->sk = NULL;
1051
1052         return 0;
1053 }
1054
1055 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1056                                   int type)
1057 {
1058         struct inode *inode;
1059         struct path path;
1060         struct sock *sk;
1061         int err;
1062
1063         unix_mkname_bsd(sunaddr, addr_len);
1064         err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1065         if (err)
1066                 goto fail;
1067
1068         err = path_permission(&path, MAY_WRITE);
1069         if (err)
1070                 goto path_put;
1071
1072         err = -ECONNREFUSED;
1073         inode = d_backing_inode(path.dentry);
1074         if (!S_ISSOCK(inode->i_mode))
1075                 goto path_put;
1076
1077         sk = unix_find_socket_byinode(inode);
1078         if (!sk)
1079                 goto path_put;
1080
1081         err = -EPROTOTYPE;
1082         if (sk->sk_type == type)
1083                 touch_atime(&path);
1084         else
1085                 goto sock_put;
1086
1087         path_put(&path);
1088
1089         return sk;
1090
1091 sock_put:
1092         sock_put(sk);
1093 path_put:
1094         path_put(&path);
1095 fail:
1096         return ERR_PTR(err);
1097 }
1098
1099 static struct sock *unix_find_abstract(struct net *net,
1100                                        struct sockaddr_un *sunaddr,
1101                                        int addr_len, int type)
1102 {
1103         unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1104         struct dentry *dentry;
1105         struct sock *sk;
1106
1107         sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1108         if (!sk)
1109                 return ERR_PTR(-ECONNREFUSED);
1110
1111         dentry = unix_sk(sk)->path.dentry;
1112         if (dentry)
1113                 touch_atime(&unix_sk(sk)->path);
1114
1115         return sk;
1116 }
1117
1118 static struct sock *unix_find_other(struct net *net,
1119                                     struct sockaddr_un *sunaddr,
1120                                     int addr_len, int type)
1121 {
1122         struct sock *sk;
1123
1124         if (sunaddr->sun_path[0])
1125                 sk = unix_find_bsd(sunaddr, addr_len, type);
1126         else
1127                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1128
1129         return sk;
1130 }
1131
1132 static int unix_autobind(struct sock *sk)
1133 {
1134         unsigned int new_hash, old_hash = sk->sk_hash;
1135         struct unix_sock *u = unix_sk(sk);
1136         struct net *net = sock_net(sk);
1137         struct unix_address *addr;
1138         u32 lastnum, ordernum;
1139         int err;
1140
1141         err = mutex_lock_interruptible(&u->bindlock);
1142         if (err)
1143                 return err;
1144
1145         if (u->addr)
1146                 goto out;
1147
1148         err = -ENOMEM;
1149         addr = kzalloc(sizeof(*addr) +
1150                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1151         if (!addr)
1152                 goto out;
1153
1154         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1155         addr->name->sun_family = AF_UNIX;
1156         refcount_set(&addr->refcnt, 1);
1157
1158         ordernum = get_random_u32();
1159         lastnum = ordernum & 0xFFFFF;
1160 retry:
1161         ordernum = (ordernum + 1) & 0xFFFFF;
1162         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1163
1164         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1165         unix_table_double_lock(net, old_hash, new_hash);
1166
1167         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1168                 unix_table_double_unlock(net, old_hash, new_hash);
1169
1170                 /* __unix_find_socket_byname() may take long time if many names
1171                  * are already in use.
1172                  */
1173                 cond_resched();
1174
1175                 if (ordernum == lastnum) {
1176                         /* Give up if all names seems to be in use. */
1177                         err = -ENOSPC;
1178                         unix_release_addr(addr);
1179                         goto out;
1180                 }
1181
1182                 goto retry;
1183         }
1184
1185         __unix_set_addr_hash(net, sk, addr, new_hash);
1186         unix_table_double_unlock(net, old_hash, new_hash);
1187         err = 0;
1188
1189 out:    mutex_unlock(&u->bindlock);
1190         return err;
1191 }
1192
1193 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1194                          int addr_len)
1195 {
1196         umode_t mode = S_IFSOCK |
1197                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1198         unsigned int new_hash, old_hash = sk->sk_hash;
1199         struct unix_sock *u = unix_sk(sk);
1200         struct net *net = sock_net(sk);
1201         struct mnt_idmap *idmap;
1202         struct unix_address *addr;
1203         struct dentry *dentry;
1204         struct path parent;
1205         int err;
1206
1207         addr_len = unix_mkname_bsd(sunaddr, addr_len);
1208         addr = unix_create_addr(sunaddr, addr_len);
1209         if (!addr)
1210                 return -ENOMEM;
1211
1212         /*
1213          * Get the parent directory, calculate the hash for last
1214          * component.
1215          */
1216         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1217         if (IS_ERR(dentry)) {
1218                 err = PTR_ERR(dentry);
1219                 goto out;
1220         }
1221
1222         /*
1223          * All right, let's create it.
1224          */
1225         idmap = mnt_idmap(parent.mnt);
1226         err = security_path_mknod(&parent, dentry, mode, 0);
1227         if (!err)
1228                 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1229         if (err)
1230                 goto out_path;
1231         err = mutex_lock_interruptible(&u->bindlock);
1232         if (err)
1233                 goto out_unlink;
1234         if (u->addr)
1235                 goto out_unlock;
1236
1237         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1238         unix_table_double_lock(net, old_hash, new_hash);
1239         u->path.mnt = mntget(parent.mnt);
1240         u->path.dentry = dget(dentry);
1241         __unix_set_addr_hash(net, sk, addr, new_hash);
1242         unix_table_double_unlock(net, old_hash, new_hash);
1243         unix_insert_bsd_socket(sk);
1244         mutex_unlock(&u->bindlock);
1245         done_path_create(&parent, dentry);
1246         return 0;
1247
1248 out_unlock:
1249         mutex_unlock(&u->bindlock);
1250         err = -EINVAL;
1251 out_unlink:
1252         /* failed after successful mknod?  unlink what we'd created... */
1253         vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1254 out_path:
1255         done_path_create(&parent, dentry);
1256 out:
1257         unix_release_addr(addr);
1258         return err == -EEXIST ? -EADDRINUSE : err;
1259 }
1260
1261 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1262                               int addr_len)
1263 {
1264         unsigned int new_hash, old_hash = sk->sk_hash;
1265         struct unix_sock *u = unix_sk(sk);
1266         struct net *net = sock_net(sk);
1267         struct unix_address *addr;
1268         int err;
1269
1270         addr = unix_create_addr(sunaddr, addr_len);
1271         if (!addr)
1272                 return -ENOMEM;
1273
1274         err = mutex_lock_interruptible(&u->bindlock);
1275         if (err)
1276                 goto out;
1277
1278         if (u->addr) {
1279                 err = -EINVAL;
1280                 goto out_mutex;
1281         }
1282
1283         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1284         unix_table_double_lock(net, old_hash, new_hash);
1285
1286         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1287                 goto out_spin;
1288
1289         __unix_set_addr_hash(net, sk, addr, new_hash);
1290         unix_table_double_unlock(net, old_hash, new_hash);
1291         mutex_unlock(&u->bindlock);
1292         return 0;
1293
1294 out_spin:
1295         unix_table_double_unlock(net, old_hash, new_hash);
1296         err = -EADDRINUSE;
1297 out_mutex:
1298         mutex_unlock(&u->bindlock);
1299 out:
1300         unix_release_addr(addr);
1301         return err;
1302 }
1303
1304 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1305 {
1306         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1307         struct sock *sk = sock->sk;
1308         int err;
1309
1310         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1311             sunaddr->sun_family == AF_UNIX)
1312                 return unix_autobind(sk);
1313
1314         err = unix_validate_addr(sunaddr, addr_len);
1315         if (err)
1316                 return err;
1317
1318         if (sunaddr->sun_path[0])
1319                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1320         else
1321                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1322
1323         return err;
1324 }
1325
1326 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1327 {
1328         if (unlikely(sk1 == sk2) || !sk2) {
1329                 unix_state_lock(sk1);
1330                 return;
1331         }
1332         if (sk1 > sk2)
1333                 swap(sk1, sk2);
1334
1335         unix_state_lock(sk1);
1336         unix_state_lock_nested(sk2, U_LOCK_SECOND);
1337 }
1338
1339 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1340 {
1341         if (unlikely(sk1 == sk2) || !sk2) {
1342                 unix_state_unlock(sk1);
1343                 return;
1344         }
1345         unix_state_unlock(sk1);
1346         unix_state_unlock(sk2);
1347 }
1348
1349 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1350                               int alen, int flags)
1351 {
1352         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1353         struct sock *sk = sock->sk;
1354         struct sock *other;
1355         int err;
1356
1357         err = -EINVAL;
1358         if (alen < offsetofend(struct sockaddr, sa_family))
1359                 goto out;
1360
1361         if (addr->sa_family != AF_UNSPEC) {
1362                 err = unix_validate_addr(sunaddr, alen);
1363                 if (err)
1364                         goto out;
1365
1366                 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1367                 if (err)
1368                         goto out;
1369
1370                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1371                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1372                     !unix_sk(sk)->addr) {
1373                         err = unix_autobind(sk);
1374                         if (err)
1375                                 goto out;
1376                 }
1377
1378 restart:
1379                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1380                 if (IS_ERR(other)) {
1381                         err = PTR_ERR(other);
1382                         goto out;
1383                 }
1384
1385                 unix_state_double_lock(sk, other);
1386
1387                 /* Apparently VFS overslept socket death. Retry. */
1388                 if (sock_flag(other, SOCK_DEAD)) {
1389                         unix_state_double_unlock(sk, other);
1390                         sock_put(other);
1391                         goto restart;
1392                 }
1393
1394                 err = -EPERM;
1395                 if (!unix_may_send(sk, other))
1396                         goto out_unlock;
1397
1398                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1399                 if (err)
1400                         goto out_unlock;
1401
1402                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1403         } else {
1404                 /*
1405                  *      1003.1g breaking connected state with AF_UNSPEC
1406                  */
1407                 other = NULL;
1408                 unix_state_double_lock(sk, other);
1409         }
1410
1411         /*
1412          * If it was connected, reconnect.
1413          */
1414         if (unix_peer(sk)) {
1415                 struct sock *old_peer = unix_peer(sk);
1416
1417                 unix_peer(sk) = other;
1418                 if (!other)
1419                         sk->sk_state = TCP_CLOSE;
1420                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1421
1422                 unix_state_double_unlock(sk, other);
1423
1424                 if (other != old_peer)
1425                         unix_dgram_disconnected(sk, old_peer);
1426                 sock_put(old_peer);
1427         } else {
1428                 unix_peer(sk) = other;
1429                 unix_state_double_unlock(sk, other);
1430         }
1431
1432         return 0;
1433
1434 out_unlock:
1435         unix_state_double_unlock(sk, other);
1436         sock_put(other);
1437 out:
1438         return err;
1439 }
1440
1441 static long unix_wait_for_peer(struct sock *other, long timeo)
1442         __releases(&unix_sk(other)->lock)
1443 {
1444         struct unix_sock *u = unix_sk(other);
1445         int sched;
1446         DEFINE_WAIT(wait);
1447
1448         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1449
1450         sched = !sock_flag(other, SOCK_DEAD) &&
1451                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1452                 unix_recvq_full_lockless(other);
1453
1454         unix_state_unlock(other);
1455
1456         if (sched)
1457                 timeo = schedule_timeout(timeo);
1458
1459         finish_wait(&u->peer_wait, &wait);
1460         return timeo;
1461 }
1462
1463 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1464                                int addr_len, int flags)
1465 {
1466         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1467         struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1468         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1469         struct net *net = sock_net(sk);
1470         struct sk_buff *skb = NULL;
1471         long timeo;
1472         int err;
1473         int st;
1474
1475         err = unix_validate_addr(sunaddr, addr_len);
1476         if (err)
1477                 goto out;
1478
1479         err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1480         if (err)
1481                 goto out;
1482
1483         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1484              test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1485                 err = unix_autobind(sk);
1486                 if (err)
1487                         goto out;
1488         }
1489
1490         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1491
1492         /* First of all allocate resources.
1493            If we will make it after state is locked,
1494            we will have to recheck all again in any case.
1495          */
1496
1497         /* create new sock for complete connection */
1498         newsk = unix_create1(net, NULL, 0, sock->type);
1499         if (IS_ERR(newsk)) {
1500                 err = PTR_ERR(newsk);
1501                 newsk = NULL;
1502                 goto out;
1503         }
1504
1505         err = -ENOMEM;
1506
1507         /* Allocate skb for sending to listening sock */
1508         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1509         if (skb == NULL)
1510                 goto out;
1511
1512 restart:
1513         /*  Find listening sock. */
1514         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1515         if (IS_ERR(other)) {
1516                 err = PTR_ERR(other);
1517                 other = NULL;
1518                 goto out;
1519         }
1520
1521         /* Latch state of peer */
1522         unix_state_lock(other);
1523
1524         /* Apparently VFS overslept socket death. Retry. */
1525         if (sock_flag(other, SOCK_DEAD)) {
1526                 unix_state_unlock(other);
1527                 sock_put(other);
1528                 goto restart;
1529         }
1530
1531         err = -ECONNREFUSED;
1532         if (other->sk_state != TCP_LISTEN)
1533                 goto out_unlock;
1534         if (other->sk_shutdown & RCV_SHUTDOWN)
1535                 goto out_unlock;
1536
1537         if (unix_recvq_full(other)) {
1538                 err = -EAGAIN;
1539                 if (!timeo)
1540                         goto out_unlock;
1541
1542                 timeo = unix_wait_for_peer(other, timeo);
1543
1544                 err = sock_intr_errno(timeo);
1545                 if (signal_pending(current))
1546                         goto out;
1547                 sock_put(other);
1548                 goto restart;
1549         }
1550
1551         /* Latch our state.
1552
1553            It is tricky place. We need to grab our state lock and cannot
1554            drop lock on peer. It is dangerous because deadlock is
1555            possible. Connect to self case and simultaneous
1556            attempt to connect are eliminated by checking socket
1557            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1558            check this before attempt to grab lock.
1559
1560            Well, and we have to recheck the state after socket locked.
1561          */
1562         st = sk->sk_state;
1563
1564         switch (st) {
1565         case TCP_CLOSE:
1566                 /* This is ok... continue with connect */
1567                 break;
1568         case TCP_ESTABLISHED:
1569                 /* Socket is already connected */
1570                 err = -EISCONN;
1571                 goto out_unlock;
1572         default:
1573                 err = -EINVAL;
1574                 goto out_unlock;
1575         }
1576
1577         unix_state_lock_nested(sk, U_LOCK_SECOND);
1578
1579         if (sk->sk_state != st) {
1580                 unix_state_unlock(sk);
1581                 unix_state_unlock(other);
1582                 sock_put(other);
1583                 goto restart;
1584         }
1585
1586         err = security_unix_stream_connect(sk, other, newsk);
1587         if (err) {
1588                 unix_state_unlock(sk);
1589                 goto out_unlock;
1590         }
1591
1592         /* The way is open! Fastly set all the necessary fields... */
1593
1594         sock_hold(sk);
1595         unix_peer(newsk)        = sk;
1596         newsk->sk_state         = TCP_ESTABLISHED;
1597         newsk->sk_type          = sk->sk_type;
1598         init_peercred(newsk);
1599         newu = unix_sk(newsk);
1600         newu->listener = other;
1601         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1602         otheru = unix_sk(other);
1603
1604         /* copy address information from listening to new sock
1605          *
1606          * The contents of *(otheru->addr) and otheru->path
1607          * are seen fully set up here, since we have found
1608          * otheru in hash under its lock.  Insertion into the
1609          * hash chain we'd found it in had been done in an
1610          * earlier critical area protected by the chain's lock,
1611          * the same one where we'd set *(otheru->addr) contents,
1612          * as well as otheru->path and otheru->addr itself.
1613          *
1614          * Using smp_store_release() here to set newu->addr
1615          * is enough to make those stores, as well as stores
1616          * to newu->path visible to anyone who gets newu->addr
1617          * by smp_load_acquire().  IOW, the same warranties
1618          * as for unix_sock instances bound in unix_bind() or
1619          * in unix_autobind().
1620          */
1621         if (otheru->path.dentry) {
1622                 path_get(&otheru->path);
1623                 newu->path = otheru->path;
1624         }
1625         refcount_inc(&otheru->addr->refcnt);
1626         smp_store_release(&newu->addr, otheru->addr);
1627
1628         /* Set credentials */
1629         copy_peercred(sk, other);
1630
1631         sock->state     = SS_CONNECTED;
1632         sk->sk_state    = TCP_ESTABLISHED;
1633         sock_hold(newsk);
1634
1635         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1636         unix_peer(sk)   = newsk;
1637
1638         unix_state_unlock(sk);
1639
1640         /* take ten and send info to listening sock */
1641         spin_lock(&other->sk_receive_queue.lock);
1642         __skb_queue_tail(&other->sk_receive_queue, skb);
1643         spin_unlock(&other->sk_receive_queue.lock);
1644         unix_state_unlock(other);
1645         other->sk_data_ready(other);
1646         sock_put(other);
1647         return 0;
1648
1649 out_unlock:
1650         if (other)
1651                 unix_state_unlock(other);
1652
1653 out:
1654         kfree_skb(skb);
1655         if (newsk)
1656                 unix_release_sock(newsk, 0);
1657         if (other)
1658                 sock_put(other);
1659         return err;
1660 }
1661
1662 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1663 {
1664         struct sock *ska = socka->sk, *skb = sockb->sk;
1665
1666         /* Join our sockets back to back */
1667         sock_hold(ska);
1668         sock_hold(skb);
1669         unix_peer(ska) = skb;
1670         unix_peer(skb) = ska;
1671         init_peercred(ska);
1672         init_peercred(skb);
1673
1674         ska->sk_state = TCP_ESTABLISHED;
1675         skb->sk_state = TCP_ESTABLISHED;
1676         socka->state  = SS_CONNECTED;
1677         sockb->state  = SS_CONNECTED;
1678         return 0;
1679 }
1680
1681 static void unix_sock_inherit_flags(const struct socket *old,
1682                                     struct socket *new)
1683 {
1684         if (test_bit(SOCK_PASSCRED, &old->flags))
1685                 set_bit(SOCK_PASSCRED, &new->flags);
1686         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1687                 set_bit(SOCK_PASSPIDFD, &new->flags);
1688         if (test_bit(SOCK_PASSSEC, &old->flags))
1689                 set_bit(SOCK_PASSSEC, &new->flags);
1690 }
1691
1692 static int unix_accept(struct socket *sock, struct socket *newsock,
1693                        struct proto_accept_arg *arg)
1694 {
1695         struct sock *sk = sock->sk;
1696         struct sk_buff *skb;
1697         struct sock *tsk;
1698
1699         arg->err = -EOPNOTSUPP;
1700         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1701                 goto out;
1702
1703         arg->err = -EINVAL;
1704         if (sk->sk_state != TCP_LISTEN)
1705                 goto out;
1706
1707         /* If socket state is TCP_LISTEN it cannot change (for now...),
1708          * so that no locks are necessary.
1709          */
1710
1711         skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1712                                 &arg->err);
1713         if (!skb) {
1714                 /* This means receive shutdown. */
1715                 if (arg->err == 0)
1716                         arg->err = -EINVAL;
1717                 goto out;
1718         }
1719
1720         tsk = skb->sk;
1721         skb_free_datagram(sk, skb);
1722         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1723
1724         /* attach accepted sock to socket */
1725         unix_state_lock(tsk);
1726         unix_update_edges(unix_sk(tsk));
1727         newsock->state = SS_CONNECTED;
1728         unix_sock_inherit_flags(sock, newsock);
1729         sock_graft(tsk, newsock);
1730         unix_state_unlock(tsk);
1731         return 0;
1732
1733 out:
1734         return arg->err;
1735 }
1736
1737
1738 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1739 {
1740         struct sock *sk = sock->sk;
1741         struct unix_address *addr;
1742         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1743         int err = 0;
1744
1745         if (peer) {
1746                 sk = unix_peer_get(sk);
1747
1748                 err = -ENOTCONN;
1749                 if (!sk)
1750                         goto out;
1751                 err = 0;
1752         } else {
1753                 sock_hold(sk);
1754         }
1755
1756         addr = smp_load_acquire(&unix_sk(sk)->addr);
1757         if (!addr) {
1758                 sunaddr->sun_family = AF_UNIX;
1759                 sunaddr->sun_path[0] = 0;
1760                 err = offsetof(struct sockaddr_un, sun_path);
1761         } else {
1762                 err = addr->len;
1763                 memcpy(sunaddr, addr->name, addr->len);
1764
1765                 if (peer)
1766                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1767                                                CGROUP_UNIX_GETPEERNAME);
1768                 else
1769                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1770                                                CGROUP_UNIX_GETSOCKNAME);
1771         }
1772         sock_put(sk);
1773 out:
1774         return err;
1775 }
1776
1777 /* The "user->unix_inflight" variable is protected by the garbage
1778  * collection lock, and we just read it locklessly here. If you go
1779  * over the limit, there might be a tiny race in actually noticing
1780  * it across threads. Tough.
1781  */
1782 static inline bool too_many_unix_fds(struct task_struct *p)
1783 {
1784         struct user_struct *user = current_user();
1785
1786         if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1787                 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1788         return false;
1789 }
1790
1791 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1792 {
1793         if (too_many_unix_fds(current))
1794                 return -ETOOMANYREFS;
1795
1796         UNIXCB(skb).fp = scm->fp;
1797         scm->fp = NULL;
1798
1799         if (unix_prepare_fpl(UNIXCB(skb).fp))
1800                 return -ENOMEM;
1801
1802         return 0;
1803 }
1804
1805 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1806 {
1807         scm->fp = UNIXCB(skb).fp;
1808         UNIXCB(skb).fp = NULL;
1809
1810         unix_destroy_fpl(scm->fp);
1811 }
1812
1813 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1814 {
1815         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1816 }
1817
1818 static void unix_destruct_scm(struct sk_buff *skb)
1819 {
1820         struct scm_cookie scm;
1821
1822         memset(&scm, 0, sizeof(scm));
1823         scm.pid  = UNIXCB(skb).pid;
1824         if (UNIXCB(skb).fp)
1825                 unix_detach_fds(&scm, skb);
1826
1827         /* Alas, it calls VFS */
1828         /* So fscking what? fput() had been SMP-safe since the last Summer */
1829         scm_destroy(&scm);
1830         sock_wfree(skb);
1831 }
1832
1833 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1834 {
1835         int err = 0;
1836
1837         UNIXCB(skb).pid  = get_pid(scm->pid);
1838         UNIXCB(skb).uid = scm->creds.uid;
1839         UNIXCB(skb).gid = scm->creds.gid;
1840         UNIXCB(skb).fp = NULL;
1841         unix_get_secdata(scm, skb);
1842         if (scm->fp && send_fds)
1843                 err = unix_attach_fds(scm, skb);
1844
1845         skb->destructor = unix_destruct_scm;
1846         return err;
1847 }
1848
1849 static bool unix_passcred_enabled(const struct socket *sock,
1850                                   const struct sock *other)
1851 {
1852         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1853                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1854                !other->sk_socket ||
1855                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1856                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1857 }
1858
1859 /*
1860  * Some apps rely on write() giving SCM_CREDENTIALS
1861  * We include credentials if source or destination socket
1862  * asserted SOCK_PASSCRED.
1863  */
1864 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1865                             const struct sock *other)
1866 {
1867         if (UNIXCB(skb).pid)
1868                 return;
1869         if (unix_passcred_enabled(sock, other)) {
1870                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1871                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1872         }
1873 }
1874
1875 static bool unix_skb_scm_eq(struct sk_buff *skb,
1876                             struct scm_cookie *scm)
1877 {
1878         return UNIXCB(skb).pid == scm->pid &&
1879                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1880                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1881                unix_secdata_eq(scm, skb);
1882 }
1883
1884 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1885 {
1886         struct scm_fp_list *fp = UNIXCB(skb).fp;
1887         struct unix_sock *u = unix_sk(sk);
1888
1889         if (unlikely(fp && fp->count)) {
1890                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1891                 unix_add_edges(fp, u);
1892         }
1893 }
1894
1895 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1896 {
1897         struct scm_fp_list *fp = UNIXCB(skb).fp;
1898         struct unix_sock *u = unix_sk(sk);
1899
1900         if (unlikely(fp && fp->count)) {
1901                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1902                 unix_del_edges(fp);
1903         }
1904 }
1905
1906 /*
1907  *      Send AF_UNIX data.
1908  */
1909
1910 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1911                               size_t len)
1912 {
1913         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1914         struct sock *sk = sock->sk, *other = NULL;
1915         struct unix_sock *u = unix_sk(sk);
1916         struct scm_cookie scm;
1917         struct sk_buff *skb;
1918         int data_len = 0;
1919         int sk_locked;
1920         long timeo;
1921         int err;
1922
1923         err = scm_send(sock, msg, &scm, false);
1924         if (err < 0)
1925                 return err;
1926
1927         wait_for_unix_gc(scm.fp);
1928
1929         err = -EOPNOTSUPP;
1930         if (msg->msg_flags&MSG_OOB)
1931                 goto out;
1932
1933         if (msg->msg_namelen) {
1934                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1935                 if (err)
1936                         goto out;
1937
1938                 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1939                                                             msg->msg_name,
1940                                                             &msg->msg_namelen,
1941                                                             NULL);
1942                 if (err)
1943                         goto out;
1944         } else {
1945                 sunaddr = NULL;
1946                 err = -ENOTCONN;
1947                 other = unix_peer_get(sk);
1948                 if (!other)
1949                         goto out;
1950         }
1951
1952         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1953              test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1954                 err = unix_autobind(sk);
1955                 if (err)
1956                         goto out;
1957         }
1958
1959         err = -EMSGSIZE;
1960         if (len > sk->sk_sndbuf - 32)
1961                 goto out;
1962
1963         if (len > SKB_MAX_ALLOC) {
1964                 data_len = min_t(size_t,
1965                                  len - SKB_MAX_ALLOC,
1966                                  MAX_SKB_FRAGS * PAGE_SIZE);
1967                 data_len = PAGE_ALIGN(data_len);
1968
1969                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1970         }
1971
1972         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1973                                    msg->msg_flags & MSG_DONTWAIT, &err,
1974                                    PAGE_ALLOC_COSTLY_ORDER);
1975         if (skb == NULL)
1976                 goto out;
1977
1978         err = unix_scm_to_skb(&scm, skb, true);
1979         if (err < 0)
1980                 goto out_free;
1981
1982         skb_put(skb, len - data_len);
1983         skb->data_len = data_len;
1984         skb->len = len;
1985         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1986         if (err)
1987                 goto out_free;
1988
1989         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1990
1991 restart:
1992         if (!other) {
1993                 err = -ECONNRESET;
1994                 if (sunaddr == NULL)
1995                         goto out_free;
1996
1997                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1998                                         sk->sk_type);
1999                 if (IS_ERR(other)) {
2000                         err = PTR_ERR(other);
2001                         other = NULL;
2002                         goto out_free;
2003                 }
2004         }
2005
2006         if (sk_filter(other, skb) < 0) {
2007                 /* Toss the packet but do not return any error to the sender */
2008                 err = len;
2009                 goto out_free;
2010         }
2011
2012         sk_locked = 0;
2013         unix_state_lock(other);
2014 restart_locked:
2015         err = -EPERM;
2016         if (!unix_may_send(sk, other))
2017                 goto out_unlock;
2018
2019         if (unlikely(sock_flag(other, SOCK_DEAD))) {
2020                 /*
2021                  *      Check with 1003.1g - what should
2022                  *      datagram error
2023                  */
2024                 unix_state_unlock(other);
2025                 sock_put(other);
2026
2027                 if (!sk_locked)
2028                         unix_state_lock(sk);
2029
2030                 err = 0;
2031                 if (sk->sk_type == SOCK_SEQPACKET) {
2032                         /* We are here only when racing with unix_release_sock()
2033                          * is clearing @other. Never change state to TCP_CLOSE
2034                          * unlike SOCK_DGRAM wants.
2035                          */
2036                         unix_state_unlock(sk);
2037                         err = -EPIPE;
2038                 } else if (unix_peer(sk) == other) {
2039                         unix_peer(sk) = NULL;
2040                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2041
2042                         sk->sk_state = TCP_CLOSE;
2043                         unix_state_unlock(sk);
2044
2045                         unix_dgram_disconnected(sk, other);
2046                         sock_put(other);
2047                         err = -ECONNREFUSED;
2048                 } else {
2049                         unix_state_unlock(sk);
2050                 }
2051
2052                 other = NULL;
2053                 if (err)
2054                         goto out_free;
2055                 goto restart;
2056         }
2057
2058         err = -EPIPE;
2059         if (other->sk_shutdown & RCV_SHUTDOWN)
2060                 goto out_unlock;
2061
2062         if (sk->sk_type != SOCK_SEQPACKET) {
2063                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2064                 if (err)
2065                         goto out_unlock;
2066         }
2067
2068         /* other == sk && unix_peer(other) != sk if
2069          * - unix_peer(sk) == NULL, destination address bound to sk
2070          * - unix_peer(sk) == sk by time of get but disconnected before lock
2071          */
2072         if (other != sk &&
2073             unlikely(unix_peer(other) != sk &&
2074             unix_recvq_full_lockless(other))) {
2075                 if (timeo) {
2076                         timeo = unix_wait_for_peer(other, timeo);
2077
2078                         err = sock_intr_errno(timeo);
2079                         if (signal_pending(current))
2080                                 goto out_free;
2081
2082                         goto restart;
2083                 }
2084
2085                 if (!sk_locked) {
2086                         unix_state_unlock(other);
2087                         unix_state_double_lock(sk, other);
2088                 }
2089
2090                 if (unix_peer(sk) != other ||
2091                     unix_dgram_peer_wake_me(sk, other)) {
2092                         err = -EAGAIN;
2093                         sk_locked = 1;
2094                         goto out_unlock;
2095                 }
2096
2097                 if (!sk_locked) {
2098                         sk_locked = 1;
2099                         goto restart_locked;
2100                 }
2101         }
2102
2103         if (unlikely(sk_locked))
2104                 unix_state_unlock(sk);
2105
2106         if (sock_flag(other, SOCK_RCVTSTAMP))
2107                 __net_timestamp(skb);
2108         maybe_add_creds(skb, sock, other);
2109         scm_stat_add(other, skb);
2110         skb_queue_tail(&other->sk_receive_queue, skb);
2111         unix_state_unlock(other);
2112         other->sk_data_ready(other);
2113         sock_put(other);
2114         scm_destroy(&scm);
2115         return len;
2116
2117 out_unlock:
2118         if (sk_locked)
2119                 unix_state_unlock(sk);
2120         unix_state_unlock(other);
2121 out_free:
2122         kfree_skb(skb);
2123 out:
2124         if (other)
2125                 sock_put(other);
2126         scm_destroy(&scm);
2127         return err;
2128 }
2129
2130 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2131  * bytes, and a minimum of a full page.
2132  */
2133 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2134
2135 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2136 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2137                      struct scm_cookie *scm, bool fds_sent)
2138 {
2139         struct unix_sock *ousk = unix_sk(other);
2140         struct sk_buff *skb;
2141         int err = 0;
2142
2143         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2144
2145         if (!skb)
2146                 return err;
2147
2148         err = unix_scm_to_skb(scm, skb, !fds_sent);
2149         if (err < 0) {
2150                 kfree_skb(skb);
2151                 return err;
2152         }
2153         skb_put(skb, 1);
2154         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2155
2156         if (err) {
2157                 kfree_skb(skb);
2158                 return err;
2159         }
2160
2161         unix_state_lock(other);
2162
2163         if (sock_flag(other, SOCK_DEAD) ||
2164             (other->sk_shutdown & RCV_SHUTDOWN)) {
2165                 unix_state_unlock(other);
2166                 kfree_skb(skb);
2167                 return -EPIPE;
2168         }
2169
2170         maybe_add_creds(skb, sock, other);
2171         skb_get(skb);
2172
2173         scm_stat_add(other, skb);
2174
2175         spin_lock(&other->sk_receive_queue.lock);
2176         if (ousk->oob_skb)
2177                 consume_skb(ousk->oob_skb);
2178         WRITE_ONCE(ousk->oob_skb, skb);
2179         __skb_queue_tail(&other->sk_receive_queue, skb);
2180         spin_unlock(&other->sk_receive_queue.lock);
2181
2182         sk_send_sigurg(other);
2183         unix_state_unlock(other);
2184         other->sk_data_ready(other);
2185
2186         return err;
2187 }
2188 #endif
2189
2190 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2191                                size_t len)
2192 {
2193         struct sock *sk = sock->sk;
2194         struct sock *other = NULL;
2195         int err, size;
2196         struct sk_buff *skb;
2197         int sent = 0;
2198         struct scm_cookie scm;
2199         bool fds_sent = false;
2200         int data_len;
2201
2202         err = scm_send(sock, msg, &scm, false);
2203         if (err < 0)
2204                 return err;
2205
2206         wait_for_unix_gc(scm.fp);
2207
2208         err = -EOPNOTSUPP;
2209         if (msg->msg_flags & MSG_OOB) {
2210 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2211                 if (len)
2212                         len--;
2213                 else
2214 #endif
2215                         goto out_err;
2216         }
2217
2218         if (msg->msg_namelen) {
2219                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2220                 goto out_err;
2221         } else {
2222                 err = -ENOTCONN;
2223                 other = unix_peer(sk);
2224                 if (!other)
2225                         goto out_err;
2226         }
2227
2228         if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2229                 goto pipe_err;
2230
2231         while (sent < len) {
2232                 size = len - sent;
2233
2234                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2235                         skb = sock_alloc_send_pskb(sk, 0, 0,
2236                                                    msg->msg_flags & MSG_DONTWAIT,
2237                                                    &err, 0);
2238                 } else {
2239                         /* Keep two messages in the pipe so it schedules better */
2240                         size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2241
2242                         /* allow fallback to order-0 allocations */
2243                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2244
2245                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2246
2247                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2248
2249                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2250                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2251                                                    get_order(UNIX_SKB_FRAGS_SZ));
2252                 }
2253                 if (!skb)
2254                         goto out_err;
2255
2256                 /* Only send the fds in the first buffer */
2257                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2258                 if (err < 0) {
2259                         kfree_skb(skb);
2260                         goto out_err;
2261                 }
2262                 fds_sent = true;
2263
2264                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2265                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2266                                                    sk->sk_allocation);
2267                         if (err < 0) {
2268                                 kfree_skb(skb);
2269                                 goto out_err;
2270                         }
2271                         size = err;
2272                         refcount_add(size, &sk->sk_wmem_alloc);
2273                 } else {
2274                         skb_put(skb, size - data_len);
2275                         skb->data_len = data_len;
2276                         skb->len = size;
2277                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2278                         if (err) {
2279                                 kfree_skb(skb);
2280                                 goto out_err;
2281                         }
2282                 }
2283
2284                 unix_state_lock(other);
2285
2286                 if (sock_flag(other, SOCK_DEAD) ||
2287                     (other->sk_shutdown & RCV_SHUTDOWN))
2288                         goto pipe_err_free;
2289
2290                 maybe_add_creds(skb, sock, other);
2291                 scm_stat_add(other, skb);
2292                 skb_queue_tail(&other->sk_receive_queue, skb);
2293                 unix_state_unlock(other);
2294                 other->sk_data_ready(other);
2295                 sent += size;
2296         }
2297
2298 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2299         if (msg->msg_flags & MSG_OOB) {
2300                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2301                 if (err)
2302                         goto out_err;
2303                 sent++;
2304         }
2305 #endif
2306
2307         scm_destroy(&scm);
2308
2309         return sent;
2310
2311 pipe_err_free:
2312         unix_state_unlock(other);
2313         kfree_skb(skb);
2314 pipe_err:
2315         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2316                 send_sig(SIGPIPE, current, 0);
2317         err = -EPIPE;
2318 out_err:
2319         scm_destroy(&scm);
2320         return sent ? : err;
2321 }
2322
2323 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2324                                   size_t len)
2325 {
2326         int err;
2327         struct sock *sk = sock->sk;
2328
2329         err = sock_error(sk);
2330         if (err)
2331                 return err;
2332
2333         if (sk->sk_state != TCP_ESTABLISHED)
2334                 return -ENOTCONN;
2335
2336         if (msg->msg_namelen)
2337                 msg->msg_namelen = 0;
2338
2339         return unix_dgram_sendmsg(sock, msg, len);
2340 }
2341
2342 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2343                                   size_t size, int flags)
2344 {
2345         struct sock *sk = sock->sk;
2346
2347         if (sk->sk_state != TCP_ESTABLISHED)
2348                 return -ENOTCONN;
2349
2350         return unix_dgram_recvmsg(sock, msg, size, flags);
2351 }
2352
2353 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2354 {
2355         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2356
2357         if (addr) {
2358                 msg->msg_namelen = addr->len;
2359                 memcpy(msg->msg_name, addr->name, addr->len);
2360         }
2361 }
2362
2363 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2364                          int flags)
2365 {
2366         struct scm_cookie scm;
2367         struct socket *sock = sk->sk_socket;
2368         struct unix_sock *u = unix_sk(sk);
2369         struct sk_buff *skb, *last;
2370         long timeo;
2371         int skip;
2372         int err;
2373
2374         err = -EOPNOTSUPP;
2375         if (flags&MSG_OOB)
2376                 goto out;
2377
2378         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2379
2380         do {
2381                 mutex_lock(&u->iolock);
2382
2383                 skip = sk_peek_offset(sk, flags);
2384                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2385                                               &skip, &err, &last);
2386                 if (skb) {
2387                         if (!(flags & MSG_PEEK))
2388                                 scm_stat_del(sk, skb);
2389                         break;
2390                 }
2391
2392                 mutex_unlock(&u->iolock);
2393
2394                 if (err != -EAGAIN)
2395                         break;
2396         } while (timeo &&
2397                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2398                                               &err, &timeo, last));
2399
2400         if (!skb) { /* implies iolock unlocked */
2401                 unix_state_lock(sk);
2402                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2403                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2404                     (sk->sk_shutdown & RCV_SHUTDOWN))
2405                         err = 0;
2406                 unix_state_unlock(sk);
2407                 goto out;
2408         }
2409
2410         if (wq_has_sleeper(&u->peer_wait))
2411                 wake_up_interruptible_sync_poll(&u->peer_wait,
2412                                                 EPOLLOUT | EPOLLWRNORM |
2413                                                 EPOLLWRBAND);
2414
2415         if (msg->msg_name) {
2416                 unix_copy_addr(msg, skb->sk);
2417
2418                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2419                                                       msg->msg_name,
2420                                                       &msg->msg_namelen);
2421         }
2422
2423         if (size > skb->len - skip)
2424                 size = skb->len - skip;
2425         else if (size < skb->len - skip)
2426                 msg->msg_flags |= MSG_TRUNC;
2427
2428         err = skb_copy_datagram_msg(skb, skip, msg, size);
2429         if (err)
2430                 goto out_free;
2431
2432         if (sock_flag(sk, SOCK_RCVTSTAMP))
2433                 __sock_recv_timestamp(msg, sk, skb);
2434
2435         memset(&scm, 0, sizeof(scm));
2436
2437         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2438         unix_set_secdata(&scm, skb);
2439
2440         if (!(flags & MSG_PEEK)) {
2441                 if (UNIXCB(skb).fp)
2442                         unix_detach_fds(&scm, skb);
2443
2444                 sk_peek_offset_bwd(sk, skb->len);
2445         } else {
2446                 /* It is questionable: on PEEK we could:
2447                    - do not return fds - good, but too simple 8)
2448                    - return fds, and do not return them on read (old strategy,
2449                      apparently wrong)
2450                    - clone fds (I chose it for now, it is the most universal
2451                      solution)
2452
2453                    POSIX 1003.1g does not actually define this clearly
2454                    at all. POSIX 1003.1g doesn't define a lot of things
2455                    clearly however!
2456
2457                 */
2458
2459                 sk_peek_offset_fwd(sk, size);
2460
2461                 if (UNIXCB(skb).fp)
2462                         unix_peek_fds(&scm, skb);
2463         }
2464         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2465
2466         scm_recv_unix(sock, msg, &scm, flags);
2467
2468 out_free:
2469         skb_free_datagram(sk, skb);
2470         mutex_unlock(&u->iolock);
2471 out:
2472         return err;
2473 }
2474
2475 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2476                               int flags)
2477 {
2478         struct sock *sk = sock->sk;
2479
2480 #ifdef CONFIG_BPF_SYSCALL
2481         const struct proto *prot = READ_ONCE(sk->sk_prot);
2482
2483         if (prot != &unix_dgram_proto)
2484                 return prot->recvmsg(sk, msg, size, flags, NULL);
2485 #endif
2486         return __unix_dgram_recvmsg(sk, msg, size, flags);
2487 }
2488
2489 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2490 {
2491         struct unix_sock *u = unix_sk(sk);
2492         struct sk_buff *skb;
2493         int err;
2494
2495         mutex_lock(&u->iolock);
2496         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2497         mutex_unlock(&u->iolock);
2498         if (!skb)
2499                 return err;
2500
2501         return recv_actor(sk, skb);
2502 }
2503
2504 /*
2505  *      Sleep until more data has arrived. But check for races..
2506  */
2507 static long unix_stream_data_wait(struct sock *sk, long timeo,
2508                                   struct sk_buff *last, unsigned int last_len,
2509                                   bool freezable)
2510 {
2511         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2512         struct sk_buff *tail;
2513         DEFINE_WAIT(wait);
2514
2515         unix_state_lock(sk);
2516
2517         for (;;) {
2518                 prepare_to_wait(sk_sleep(sk), &wait, state);
2519
2520                 tail = skb_peek_tail(&sk->sk_receive_queue);
2521                 if (tail != last ||
2522                     (tail && tail->len != last_len) ||
2523                     sk->sk_err ||
2524                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2525                     signal_pending(current) ||
2526                     !timeo)
2527                         break;
2528
2529                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2530                 unix_state_unlock(sk);
2531                 timeo = schedule_timeout(timeo);
2532                 unix_state_lock(sk);
2533
2534                 if (sock_flag(sk, SOCK_DEAD))
2535                         break;
2536
2537                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2538         }
2539
2540         finish_wait(sk_sleep(sk), &wait);
2541         unix_state_unlock(sk);
2542         return timeo;
2543 }
2544
2545 static unsigned int unix_skb_len(const struct sk_buff *skb)
2546 {
2547         return skb->len - UNIXCB(skb).consumed;
2548 }
2549
2550 struct unix_stream_read_state {
2551         int (*recv_actor)(struct sk_buff *, int, int,
2552                           struct unix_stream_read_state *);
2553         struct socket *socket;
2554         struct msghdr *msg;
2555         struct pipe_inode_info *pipe;
2556         size_t size;
2557         int flags;
2558         unsigned int splice_flags;
2559 };
2560
2561 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2562 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2563 {
2564         struct socket *sock = state->socket;
2565         struct sock *sk = sock->sk;
2566         struct unix_sock *u = unix_sk(sk);
2567         int chunk = 1;
2568         struct sk_buff *oob_skb;
2569
2570         mutex_lock(&u->iolock);
2571         unix_state_lock(sk);
2572         spin_lock(&sk->sk_receive_queue.lock);
2573
2574         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2575                 spin_unlock(&sk->sk_receive_queue.lock);
2576                 unix_state_unlock(sk);
2577                 mutex_unlock(&u->iolock);
2578                 return -EINVAL;
2579         }
2580
2581         oob_skb = u->oob_skb;
2582
2583         if (!(state->flags & MSG_PEEK))
2584                 WRITE_ONCE(u->oob_skb, NULL);
2585         else
2586                 skb_get(oob_skb);
2587
2588         spin_unlock(&sk->sk_receive_queue.lock);
2589         unix_state_unlock(sk);
2590
2591         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2592
2593         if (!(state->flags & MSG_PEEK))
2594                 UNIXCB(oob_skb).consumed += 1;
2595
2596         consume_skb(oob_skb);
2597
2598         mutex_unlock(&u->iolock);
2599
2600         if (chunk < 0)
2601                 return -EFAULT;
2602
2603         state->msg->msg_flags |= MSG_OOB;
2604         return 1;
2605 }
2606
2607 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2608                                   int flags, int copied)
2609 {
2610         struct unix_sock *u = unix_sk(sk);
2611
2612         if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2613                 skb_unlink(skb, &sk->sk_receive_queue);
2614                 consume_skb(skb);
2615                 skb = NULL;
2616         } else {
2617                 struct sk_buff *unlinked_skb = NULL;
2618
2619                 spin_lock(&sk->sk_receive_queue.lock);
2620
2621                 if (skb == u->oob_skb) {
2622                         if (copied) {
2623                                 skb = NULL;
2624                         } else if (sock_flag(sk, SOCK_URGINLINE)) {
2625                                 if (!(flags & MSG_PEEK)) {
2626                                         WRITE_ONCE(u->oob_skb, NULL);
2627                                         consume_skb(skb);
2628                                 }
2629                         } else if (flags & MSG_PEEK) {
2630                                 skb = NULL;
2631                         } else {
2632                                 __skb_unlink(skb, &sk->sk_receive_queue);
2633                                 WRITE_ONCE(u->oob_skb, NULL);
2634                                 unlinked_skb = skb;
2635                                 skb = skb_peek(&sk->sk_receive_queue);
2636                         }
2637                 }
2638
2639                 spin_unlock(&sk->sk_receive_queue.lock);
2640
2641                 if (unlinked_skb) {
2642                         WARN_ON_ONCE(skb_unref(unlinked_skb));
2643                         kfree_skb(unlinked_skb);
2644                 }
2645         }
2646         return skb;
2647 }
2648 #endif
2649
2650 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2651 {
2652         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2653                 return -ENOTCONN;
2654
2655         return unix_read_skb(sk, recv_actor);
2656 }
2657
2658 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2659                                     bool freezable)
2660 {
2661         struct scm_cookie scm;
2662         struct socket *sock = state->socket;
2663         struct sock *sk = sock->sk;
2664         struct unix_sock *u = unix_sk(sk);
2665         int copied = 0;
2666         int flags = state->flags;
2667         int noblock = flags & MSG_DONTWAIT;
2668         bool check_creds = false;
2669         int target;
2670         int err = 0;
2671         long timeo;
2672         int skip;
2673         size_t size = state->size;
2674         unsigned int last_len;
2675
2676         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2677                 err = -EINVAL;
2678                 goto out;
2679         }
2680
2681         if (unlikely(flags & MSG_OOB)) {
2682                 err = -EOPNOTSUPP;
2683 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2684                 err = unix_stream_recv_urg(state);
2685 #endif
2686                 goto out;
2687         }
2688
2689         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2690         timeo = sock_rcvtimeo(sk, noblock);
2691
2692         memset(&scm, 0, sizeof(scm));
2693
2694         /* Lock the socket to prevent queue disordering
2695          * while sleeps in memcpy_tomsg
2696          */
2697         mutex_lock(&u->iolock);
2698
2699         skip = max(sk_peek_offset(sk, flags), 0);
2700
2701         do {
2702                 int chunk;
2703                 bool drop_skb;
2704                 struct sk_buff *skb, *last;
2705
2706 redo:
2707                 unix_state_lock(sk);
2708                 if (sock_flag(sk, SOCK_DEAD)) {
2709                         err = -ECONNRESET;
2710                         goto unlock;
2711                 }
2712                 last = skb = skb_peek(&sk->sk_receive_queue);
2713                 last_len = last ? last->len : 0;
2714
2715 again:
2716 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2717                 if (skb) {
2718                         skb = manage_oob(skb, sk, flags, copied);
2719                         if (!skb && copied) {
2720                                 unix_state_unlock(sk);
2721                                 break;
2722                         }
2723                 }
2724 #endif
2725                 if (skb == NULL) {
2726                         if (copied >= target)
2727                                 goto unlock;
2728
2729                         /*
2730                          *      POSIX 1003.1g mandates this order.
2731                          */
2732
2733                         err = sock_error(sk);
2734                         if (err)
2735                                 goto unlock;
2736                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2737                                 goto unlock;
2738
2739                         unix_state_unlock(sk);
2740                         if (!timeo) {
2741                                 err = -EAGAIN;
2742                                 break;
2743                         }
2744
2745                         mutex_unlock(&u->iolock);
2746
2747                         timeo = unix_stream_data_wait(sk, timeo, last,
2748                                                       last_len, freezable);
2749
2750                         if (signal_pending(current)) {
2751                                 err = sock_intr_errno(timeo);
2752                                 scm_destroy(&scm);
2753                                 goto out;
2754                         }
2755
2756                         mutex_lock(&u->iolock);
2757                         goto redo;
2758 unlock:
2759                         unix_state_unlock(sk);
2760                         break;
2761                 }
2762
2763                 while (skip >= unix_skb_len(skb)) {
2764                         skip -= unix_skb_len(skb);
2765                         last = skb;
2766                         last_len = skb->len;
2767                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2768                         if (!skb)
2769                                 goto again;
2770                 }
2771
2772                 unix_state_unlock(sk);
2773
2774                 if (check_creds) {
2775                         /* Never glue messages from different writers */
2776                         if (!unix_skb_scm_eq(skb, &scm))
2777                                 break;
2778                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2779                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2780                         /* Copy credentials */
2781                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2782                         unix_set_secdata(&scm, skb);
2783                         check_creds = true;
2784                 }
2785
2786                 /* Copy address just once */
2787                 if (state->msg && state->msg->msg_name) {
2788                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2789                                          state->msg->msg_name);
2790                         unix_copy_addr(state->msg, skb->sk);
2791
2792                         BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2793                                                               state->msg->msg_name,
2794                                                               &state->msg->msg_namelen);
2795
2796                         sunaddr = NULL;
2797                 }
2798
2799                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2800                 skb_get(skb);
2801                 chunk = state->recv_actor(skb, skip, chunk, state);
2802                 drop_skb = !unix_skb_len(skb);
2803                 /* skb is only safe to use if !drop_skb */
2804                 consume_skb(skb);
2805                 if (chunk < 0) {
2806                         if (copied == 0)
2807                                 copied = -EFAULT;
2808                         break;
2809                 }
2810                 copied += chunk;
2811                 size -= chunk;
2812
2813                 if (drop_skb) {
2814                         /* the skb was touched by a concurrent reader;
2815                          * we should not expect anything from this skb
2816                          * anymore and assume it invalid - we can be
2817                          * sure it was dropped from the socket queue
2818                          *
2819                          * let's report a short read
2820                          */
2821                         err = 0;
2822                         break;
2823                 }
2824
2825                 /* Mark read part of skb as used */
2826                 if (!(flags & MSG_PEEK)) {
2827                         UNIXCB(skb).consumed += chunk;
2828
2829                         sk_peek_offset_bwd(sk, chunk);
2830
2831                         if (UNIXCB(skb).fp) {
2832                                 scm_stat_del(sk, skb);
2833                                 unix_detach_fds(&scm, skb);
2834                         }
2835
2836                         if (unix_skb_len(skb))
2837                                 break;
2838
2839                         skb_unlink(skb, &sk->sk_receive_queue);
2840                         consume_skb(skb);
2841
2842                         if (scm.fp)
2843                                 break;
2844                 } else {
2845                         /* It is questionable, see note in unix_dgram_recvmsg.
2846                          */
2847                         if (UNIXCB(skb).fp)
2848                                 unix_peek_fds(&scm, skb);
2849
2850                         sk_peek_offset_fwd(sk, chunk);
2851
2852                         if (UNIXCB(skb).fp)
2853                                 break;
2854
2855                         skip = 0;
2856                         last = skb;
2857                         last_len = skb->len;
2858                         unix_state_lock(sk);
2859                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2860                         if (skb)
2861                                 goto again;
2862                         unix_state_unlock(sk);
2863                         break;
2864                 }
2865         } while (size);
2866
2867         mutex_unlock(&u->iolock);
2868         if (state->msg)
2869                 scm_recv_unix(sock, state->msg, &scm, flags);
2870         else
2871                 scm_destroy(&scm);
2872 out:
2873         return copied ? : err;
2874 }
2875
2876 static int unix_stream_read_actor(struct sk_buff *skb,
2877                                   int skip, int chunk,
2878                                   struct unix_stream_read_state *state)
2879 {
2880         int ret;
2881
2882         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2883                                     state->msg, chunk);
2884         return ret ?: chunk;
2885 }
2886
2887 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2888                           size_t size, int flags)
2889 {
2890         struct unix_stream_read_state state = {
2891                 .recv_actor = unix_stream_read_actor,
2892                 .socket = sk->sk_socket,
2893                 .msg = msg,
2894                 .size = size,
2895                 .flags = flags
2896         };
2897
2898         return unix_stream_read_generic(&state, true);
2899 }
2900
2901 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2902                                size_t size, int flags)
2903 {
2904         struct unix_stream_read_state state = {
2905                 .recv_actor = unix_stream_read_actor,
2906                 .socket = sock,
2907                 .msg = msg,
2908                 .size = size,
2909                 .flags = flags
2910         };
2911
2912 #ifdef CONFIG_BPF_SYSCALL
2913         struct sock *sk = sock->sk;
2914         const struct proto *prot = READ_ONCE(sk->sk_prot);
2915
2916         if (prot != &unix_stream_proto)
2917                 return prot->recvmsg(sk, msg, size, flags, NULL);
2918 #endif
2919         return unix_stream_read_generic(&state, true);
2920 }
2921
2922 static int unix_stream_splice_actor(struct sk_buff *skb,
2923                                     int skip, int chunk,
2924                                     struct unix_stream_read_state *state)
2925 {
2926         return skb_splice_bits(skb, state->socket->sk,
2927                                UNIXCB(skb).consumed + skip,
2928                                state->pipe, chunk, state->splice_flags);
2929 }
2930
2931 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2932                                        struct pipe_inode_info *pipe,
2933                                        size_t size, unsigned int flags)
2934 {
2935         struct unix_stream_read_state state = {
2936                 .recv_actor = unix_stream_splice_actor,
2937                 .socket = sock,
2938                 .pipe = pipe,
2939                 .size = size,
2940                 .splice_flags = flags,
2941         };
2942
2943         if (unlikely(*ppos))
2944                 return -ESPIPE;
2945
2946         if (sock->file->f_flags & O_NONBLOCK ||
2947             flags & SPLICE_F_NONBLOCK)
2948                 state.flags = MSG_DONTWAIT;
2949
2950         return unix_stream_read_generic(&state, false);
2951 }
2952
2953 static int unix_shutdown(struct socket *sock, int mode)
2954 {
2955         struct sock *sk = sock->sk;
2956         struct sock *other;
2957
2958         if (mode < SHUT_RD || mode > SHUT_RDWR)
2959                 return -EINVAL;
2960         /* This maps:
2961          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2962          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2963          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2964          */
2965         ++mode;
2966
2967         unix_state_lock(sk);
2968         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2969         other = unix_peer(sk);
2970         if (other)
2971                 sock_hold(other);
2972         unix_state_unlock(sk);
2973         sk->sk_state_change(sk);
2974
2975         if (other &&
2976                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2977
2978                 int peer_mode = 0;
2979                 const struct proto *prot = READ_ONCE(other->sk_prot);
2980
2981                 if (prot->unhash)
2982                         prot->unhash(other);
2983                 if (mode&RCV_SHUTDOWN)
2984                         peer_mode |= SEND_SHUTDOWN;
2985                 if (mode&SEND_SHUTDOWN)
2986                         peer_mode |= RCV_SHUTDOWN;
2987                 unix_state_lock(other);
2988                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2989                 unix_state_unlock(other);
2990                 other->sk_state_change(other);
2991                 if (peer_mode == SHUTDOWN_MASK)
2992                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2993                 else if (peer_mode & RCV_SHUTDOWN)
2994                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2995         }
2996         if (other)
2997                 sock_put(other);
2998
2999         return 0;
3000 }
3001
3002 long unix_inq_len(struct sock *sk)
3003 {
3004         struct sk_buff *skb;
3005         long amount = 0;
3006
3007         if (sk->sk_state == TCP_LISTEN)
3008                 return -EINVAL;
3009
3010         spin_lock(&sk->sk_receive_queue.lock);
3011         if (sk->sk_type == SOCK_STREAM ||
3012             sk->sk_type == SOCK_SEQPACKET) {
3013                 skb_queue_walk(&sk->sk_receive_queue, skb)
3014                         amount += unix_skb_len(skb);
3015         } else {
3016                 skb = skb_peek(&sk->sk_receive_queue);
3017                 if (skb)
3018                         amount = skb->len;
3019         }
3020         spin_unlock(&sk->sk_receive_queue.lock);
3021
3022         return amount;
3023 }
3024 EXPORT_SYMBOL_GPL(unix_inq_len);
3025
3026 long unix_outq_len(struct sock *sk)
3027 {
3028         return sk_wmem_alloc_get(sk);
3029 }
3030 EXPORT_SYMBOL_GPL(unix_outq_len);
3031
3032 static int unix_open_file(struct sock *sk)
3033 {
3034         struct path path;
3035         struct file *f;
3036         int fd;
3037
3038         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3039                 return -EPERM;
3040
3041         if (!smp_load_acquire(&unix_sk(sk)->addr))
3042                 return -ENOENT;
3043
3044         path = unix_sk(sk)->path;
3045         if (!path.dentry)
3046                 return -ENOENT;
3047
3048         path_get(&path);
3049
3050         fd = get_unused_fd_flags(O_CLOEXEC);
3051         if (fd < 0)
3052                 goto out;
3053
3054         f = dentry_open(&path, O_PATH, current_cred());
3055         if (IS_ERR(f)) {
3056                 put_unused_fd(fd);
3057                 fd = PTR_ERR(f);
3058                 goto out;
3059         }
3060
3061         fd_install(fd, f);
3062 out:
3063         path_put(&path);
3064
3065         return fd;
3066 }
3067
3068 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3069 {
3070         struct sock *sk = sock->sk;
3071         long amount = 0;
3072         int err;
3073
3074         switch (cmd) {
3075         case SIOCOUTQ:
3076                 amount = unix_outq_len(sk);
3077                 err = put_user(amount, (int __user *)arg);
3078                 break;
3079         case SIOCINQ:
3080                 amount = unix_inq_len(sk);
3081                 if (amount < 0)
3082                         err = amount;
3083                 else
3084                         err = put_user(amount, (int __user *)arg);
3085                 break;
3086         case SIOCUNIXFILE:
3087                 err = unix_open_file(sk);
3088                 break;
3089 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3090         case SIOCATMARK:
3091                 {
3092                         struct sk_buff *skb;
3093                         int answ = 0;
3094
3095                         skb = skb_peek(&sk->sk_receive_queue);
3096                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3097                                 answ = 1;
3098                         err = put_user(answ, (int __user *)arg);
3099                 }
3100                 break;
3101 #endif
3102         default:
3103                 err = -ENOIOCTLCMD;
3104                 break;
3105         }
3106         return err;
3107 }
3108
3109 #ifdef CONFIG_COMPAT
3110 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3111 {
3112         return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3113 }
3114 #endif
3115
3116 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3117 {
3118         struct sock *sk = sock->sk;
3119         __poll_t mask;
3120         u8 shutdown;
3121
3122         sock_poll_wait(file, sock, wait);
3123         mask = 0;
3124         shutdown = READ_ONCE(sk->sk_shutdown);
3125
3126         /* exceptional events? */
3127         if (READ_ONCE(sk->sk_err))
3128                 mask |= EPOLLERR;
3129         if (shutdown == SHUTDOWN_MASK)
3130                 mask |= EPOLLHUP;
3131         if (shutdown & RCV_SHUTDOWN)
3132                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3133
3134         /* readable? */
3135         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3136                 mask |= EPOLLIN | EPOLLRDNORM;
3137         if (sk_is_readable(sk))
3138                 mask |= EPOLLIN | EPOLLRDNORM;
3139 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3140         if (READ_ONCE(unix_sk(sk)->oob_skb))
3141                 mask |= EPOLLPRI;
3142 #endif
3143
3144         /* Connection-based need to check for termination and startup */
3145         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3146             sk->sk_state == TCP_CLOSE)
3147                 mask |= EPOLLHUP;
3148
3149         /*
3150          * we set writable also when the other side has shut down the
3151          * connection. This prevents stuck sockets.
3152          */
3153         if (unix_writable(sk))
3154                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3155
3156         return mask;
3157 }
3158
3159 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3160                                     poll_table *wait)
3161 {
3162         struct sock *sk = sock->sk, *other;
3163         unsigned int writable;
3164         __poll_t mask;
3165         u8 shutdown;
3166
3167         sock_poll_wait(file, sock, wait);
3168         mask = 0;
3169         shutdown = READ_ONCE(sk->sk_shutdown);
3170
3171         /* exceptional events? */
3172         if (READ_ONCE(sk->sk_err) ||
3173             !skb_queue_empty_lockless(&sk->sk_error_queue))
3174                 mask |= EPOLLERR |
3175                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3176
3177         if (shutdown & RCV_SHUTDOWN)
3178                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3179         if (shutdown == SHUTDOWN_MASK)
3180                 mask |= EPOLLHUP;
3181
3182         /* readable? */
3183         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3184                 mask |= EPOLLIN | EPOLLRDNORM;
3185         if (sk_is_readable(sk))
3186                 mask |= EPOLLIN | EPOLLRDNORM;
3187
3188         /* Connection-based need to check for termination and startup */
3189         if (sk->sk_type == SOCK_SEQPACKET) {
3190                 if (sk->sk_state == TCP_CLOSE)
3191                         mask |= EPOLLHUP;
3192                 /* connection hasn't started yet? */
3193                 if (sk->sk_state == TCP_SYN_SENT)
3194                         return mask;
3195         }
3196
3197         /* No write status requested, avoid expensive OUT tests. */
3198         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3199                 return mask;
3200
3201         writable = unix_writable(sk);
3202         if (writable) {
3203                 unix_state_lock(sk);
3204
3205                 other = unix_peer(sk);
3206                 if (other && unix_peer(other) != sk &&
3207                     unix_recvq_full_lockless(other) &&
3208                     unix_dgram_peer_wake_me(sk, other))
3209                         writable = 0;
3210
3211                 unix_state_unlock(sk);
3212         }
3213
3214         if (writable)
3215                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3216         else
3217                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3218
3219         return mask;
3220 }
3221
3222 #ifdef CONFIG_PROC_FS
3223
3224 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3225
3226 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3227 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3228 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3229
3230 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3231 {
3232         unsigned long offset = get_offset(*pos);
3233         unsigned long bucket = get_bucket(*pos);
3234         unsigned long count = 0;
3235         struct sock *sk;
3236
3237         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3238              sk; sk = sk_next(sk)) {
3239                 if (++count == offset)
3240                         break;
3241         }
3242
3243         return sk;
3244 }
3245
3246 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3247 {
3248         unsigned long bucket = get_bucket(*pos);
3249         struct net *net = seq_file_net(seq);
3250         struct sock *sk;
3251
3252         while (bucket < UNIX_HASH_SIZE) {
3253                 spin_lock(&net->unx.table.locks[bucket]);
3254
3255                 sk = unix_from_bucket(seq, pos);
3256                 if (sk)
3257                         return sk;
3258
3259                 spin_unlock(&net->unx.table.locks[bucket]);
3260
3261                 *pos = set_bucket_offset(++bucket, 1);
3262         }
3263
3264         return NULL;
3265 }
3266
3267 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3268                                   loff_t *pos)
3269 {
3270         unsigned long bucket = get_bucket(*pos);
3271
3272         sk = sk_next(sk);
3273         if (sk)
3274                 return sk;
3275
3276
3277         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3278
3279         *pos = set_bucket_offset(++bucket, 1);
3280
3281         return unix_get_first(seq, pos);
3282 }
3283
3284 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3285 {
3286         if (!*pos)
3287                 return SEQ_START_TOKEN;
3288
3289         return unix_get_first(seq, pos);
3290 }
3291
3292 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3293 {
3294         ++*pos;
3295
3296         if (v == SEQ_START_TOKEN)
3297                 return unix_get_first(seq, pos);
3298
3299         return unix_get_next(seq, v, pos);
3300 }
3301
3302 static void unix_seq_stop(struct seq_file *seq, void *v)
3303 {
3304         struct sock *sk = v;
3305
3306         if (sk)
3307                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3308 }
3309
3310 static int unix_seq_show(struct seq_file *seq, void *v)
3311 {
3312
3313         if (v == SEQ_START_TOKEN)
3314                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3315                          "Inode Path\n");
3316         else {
3317                 struct sock *s = v;
3318                 struct unix_sock *u = unix_sk(s);
3319                 unix_state_lock(s);
3320
3321                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3322                         s,
3323                         refcount_read(&s->sk_refcnt),
3324                         0,
3325                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3326                         s->sk_type,
3327                         s->sk_socket ?
3328                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3329                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3330                         sock_i_ino(s));
3331
3332                 if (u->addr) {  // under a hash table lock here
3333                         int i, len;
3334                         seq_putc(seq, ' ');
3335
3336                         i = 0;
3337                         len = u->addr->len -
3338                                 offsetof(struct sockaddr_un, sun_path);
3339                         if (u->addr->name->sun_path[0]) {
3340                                 len--;
3341                         } else {
3342                                 seq_putc(seq, '@');
3343                                 i++;
3344                         }
3345                         for ( ; i < len; i++)
3346                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3347                                          '@');
3348                 }
3349                 unix_state_unlock(s);
3350                 seq_putc(seq, '\n');
3351         }
3352
3353         return 0;
3354 }
3355
3356 static const struct seq_operations unix_seq_ops = {
3357         .start  = unix_seq_start,
3358         .next   = unix_seq_next,
3359         .stop   = unix_seq_stop,
3360         .show   = unix_seq_show,
3361 };
3362
3363 #ifdef CONFIG_BPF_SYSCALL
3364 struct bpf_unix_iter_state {
3365         struct seq_net_private p;
3366         unsigned int cur_sk;
3367         unsigned int end_sk;
3368         unsigned int max_sk;
3369         struct sock **batch;
3370         bool st_bucket_done;
3371 };
3372
3373 struct bpf_iter__unix {
3374         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3375         __bpf_md_ptr(struct unix_sock *, unix_sk);
3376         uid_t uid __aligned(8);
3377 };
3378
3379 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3380                               struct unix_sock *unix_sk, uid_t uid)
3381 {
3382         struct bpf_iter__unix ctx;
3383
3384         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3385         ctx.meta = meta;
3386         ctx.unix_sk = unix_sk;
3387         ctx.uid = uid;
3388         return bpf_iter_run_prog(prog, &ctx);
3389 }
3390
3391 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3392
3393 {
3394         struct bpf_unix_iter_state *iter = seq->private;
3395         unsigned int expected = 1;
3396         struct sock *sk;
3397
3398         sock_hold(start_sk);
3399         iter->batch[iter->end_sk++] = start_sk;
3400
3401         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3402                 if (iter->end_sk < iter->max_sk) {
3403                         sock_hold(sk);
3404                         iter->batch[iter->end_sk++] = sk;
3405                 }
3406
3407                 expected++;
3408         }
3409
3410         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3411
3412         return expected;
3413 }
3414
3415 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3416 {
3417         while (iter->cur_sk < iter->end_sk)
3418                 sock_put(iter->batch[iter->cur_sk++]);
3419 }
3420
3421 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3422                                        unsigned int new_batch_sz)
3423 {
3424         struct sock **new_batch;
3425
3426         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3427                              GFP_USER | __GFP_NOWARN);
3428         if (!new_batch)
3429                 return -ENOMEM;
3430
3431         bpf_iter_unix_put_batch(iter);
3432         kvfree(iter->batch);
3433         iter->batch = new_batch;
3434         iter->max_sk = new_batch_sz;
3435
3436         return 0;
3437 }
3438
3439 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3440                                         loff_t *pos)
3441 {
3442         struct bpf_unix_iter_state *iter = seq->private;
3443         unsigned int expected;
3444         bool resized = false;
3445         struct sock *sk;
3446
3447         if (iter->st_bucket_done)
3448                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3449
3450 again:
3451         /* Get a new batch */
3452         iter->cur_sk = 0;
3453         iter->end_sk = 0;
3454
3455         sk = unix_get_first(seq, pos);
3456         if (!sk)
3457                 return NULL; /* Done */
3458
3459         expected = bpf_iter_unix_hold_batch(seq, sk);
3460
3461         if (iter->end_sk == expected) {
3462                 iter->st_bucket_done = true;
3463                 return sk;
3464         }
3465
3466         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3467                 resized = true;
3468                 goto again;
3469         }
3470
3471         return sk;
3472 }
3473
3474 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3475 {
3476         if (!*pos)
3477                 return SEQ_START_TOKEN;
3478
3479         /* bpf iter does not support lseek, so it always
3480          * continue from where it was stop()-ped.
3481          */
3482         return bpf_iter_unix_batch(seq, pos);
3483 }
3484
3485 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3486 {
3487         struct bpf_unix_iter_state *iter = seq->private;
3488         struct sock *sk;
3489
3490         /* Whenever seq_next() is called, the iter->cur_sk is
3491          * done with seq_show(), so advance to the next sk in
3492          * the batch.
3493          */
3494         if (iter->cur_sk < iter->end_sk)
3495                 sock_put(iter->batch[iter->cur_sk++]);
3496
3497         ++*pos;
3498
3499         if (iter->cur_sk < iter->end_sk)
3500                 sk = iter->batch[iter->cur_sk];
3501         else
3502                 sk = bpf_iter_unix_batch(seq, pos);
3503
3504         return sk;
3505 }
3506
3507 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3508 {
3509         struct bpf_iter_meta meta;
3510         struct bpf_prog *prog;
3511         struct sock *sk = v;
3512         uid_t uid;
3513         bool slow;
3514         int ret;
3515
3516         if (v == SEQ_START_TOKEN)
3517                 return 0;
3518
3519         slow = lock_sock_fast(sk);
3520
3521         if (unlikely(sk_unhashed(sk))) {
3522                 ret = SEQ_SKIP;
3523                 goto unlock;
3524         }
3525
3526         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3527         meta.seq = seq;
3528         prog = bpf_iter_get_info(&meta, false);
3529         ret = unix_prog_seq_show(prog, &meta, v, uid);
3530 unlock:
3531         unlock_sock_fast(sk, slow);
3532         return ret;
3533 }
3534
3535 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3536 {
3537         struct bpf_unix_iter_state *iter = seq->private;
3538         struct bpf_iter_meta meta;
3539         struct bpf_prog *prog;
3540
3541         if (!v) {
3542                 meta.seq = seq;
3543                 prog = bpf_iter_get_info(&meta, true);
3544                 if (prog)
3545                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3546         }
3547
3548         if (iter->cur_sk < iter->end_sk)
3549                 bpf_iter_unix_put_batch(iter);
3550 }
3551
3552 static const struct seq_operations bpf_iter_unix_seq_ops = {
3553         .start  = bpf_iter_unix_seq_start,
3554         .next   = bpf_iter_unix_seq_next,
3555         .stop   = bpf_iter_unix_seq_stop,
3556         .show   = bpf_iter_unix_seq_show,
3557 };
3558 #endif
3559 #endif
3560
3561 static const struct net_proto_family unix_family_ops = {
3562         .family = PF_UNIX,
3563         .create = unix_create,
3564         .owner  = THIS_MODULE,
3565 };
3566
3567
3568 static int __net_init unix_net_init(struct net *net)
3569 {
3570         int i;
3571
3572         net->unx.sysctl_max_dgram_qlen = 10;
3573         if (unix_sysctl_register(net))
3574                 goto out;
3575
3576 #ifdef CONFIG_PROC_FS
3577         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3578                              sizeof(struct seq_net_private)))
3579                 goto err_sysctl;
3580 #endif
3581
3582         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3583                                               sizeof(spinlock_t), GFP_KERNEL);
3584         if (!net->unx.table.locks)
3585                 goto err_proc;
3586
3587         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3588                                                 sizeof(struct hlist_head),
3589                                                 GFP_KERNEL);
3590         if (!net->unx.table.buckets)
3591                 goto free_locks;
3592
3593         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3594                 spin_lock_init(&net->unx.table.locks[i]);
3595                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3596         }
3597
3598         return 0;
3599
3600 free_locks:
3601         kvfree(net->unx.table.locks);
3602 err_proc:
3603 #ifdef CONFIG_PROC_FS
3604         remove_proc_entry("unix", net->proc_net);
3605 err_sysctl:
3606 #endif
3607         unix_sysctl_unregister(net);
3608 out:
3609         return -ENOMEM;
3610 }
3611
3612 static void __net_exit unix_net_exit(struct net *net)
3613 {
3614         kvfree(net->unx.table.buckets);
3615         kvfree(net->unx.table.locks);
3616         unix_sysctl_unregister(net);
3617         remove_proc_entry("unix", net->proc_net);
3618 }
3619
3620 static struct pernet_operations unix_net_ops = {
3621         .init = unix_net_init,
3622         .exit = unix_net_exit,
3623 };
3624
3625 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3626 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3627                      struct unix_sock *unix_sk, uid_t uid)
3628
3629 #define INIT_BATCH_SZ 16
3630
3631 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3632 {
3633         struct bpf_unix_iter_state *iter = priv_data;
3634         int err;
3635
3636         err = bpf_iter_init_seq_net(priv_data, aux);
3637         if (err)
3638                 return err;
3639
3640         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3641         if (err) {
3642                 bpf_iter_fini_seq_net(priv_data);
3643                 return err;
3644         }
3645
3646         return 0;
3647 }
3648
3649 static void bpf_iter_fini_unix(void *priv_data)
3650 {
3651         struct bpf_unix_iter_state *iter = priv_data;
3652
3653         bpf_iter_fini_seq_net(priv_data);
3654         kvfree(iter->batch);
3655 }
3656
3657 static const struct bpf_iter_seq_info unix_seq_info = {
3658         .seq_ops                = &bpf_iter_unix_seq_ops,
3659         .init_seq_private       = bpf_iter_init_unix,
3660         .fini_seq_private       = bpf_iter_fini_unix,
3661         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3662 };
3663
3664 static const struct bpf_func_proto *
3665 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3666                              const struct bpf_prog *prog)
3667 {
3668         switch (func_id) {
3669         case BPF_FUNC_setsockopt:
3670                 return &bpf_sk_setsockopt_proto;
3671         case BPF_FUNC_getsockopt:
3672                 return &bpf_sk_getsockopt_proto;
3673         default:
3674                 return NULL;
3675         }
3676 }
3677
3678 static struct bpf_iter_reg unix_reg_info = {
3679         .target                 = "unix",
3680         .ctx_arg_info_size      = 1,
3681         .ctx_arg_info           = {
3682                 { offsetof(struct bpf_iter__unix, unix_sk),
3683                   PTR_TO_BTF_ID_OR_NULL },
3684         },
3685         .get_func_proto         = bpf_iter_unix_get_func_proto,
3686         .seq_info               = &unix_seq_info,
3687 };
3688
3689 static void __init bpf_iter_register(void)
3690 {
3691         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3692         if (bpf_iter_reg_target(&unix_reg_info))
3693                 pr_warn("Warning: could not register bpf iterator unix\n");
3694 }
3695 #endif
3696
3697 static int __init af_unix_init(void)
3698 {
3699         int i, rc = -1;
3700
3701         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3702
3703         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3704                 spin_lock_init(&bsd_socket_locks[i]);
3705                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3706         }
3707
3708         rc = proto_register(&unix_dgram_proto, 1);
3709         if (rc != 0) {
3710                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3711                 goto out;
3712         }
3713
3714         rc = proto_register(&unix_stream_proto, 1);
3715         if (rc != 0) {
3716                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3717                 proto_unregister(&unix_dgram_proto);
3718                 goto out;
3719         }
3720
3721         sock_register(&unix_family_ops);
3722         register_pernet_subsys(&unix_net_ops);
3723         unix_bpf_build_proto();
3724
3725 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3726         bpf_iter_register();
3727 #endif
3728
3729 out:
3730         return rc;
3731 }
3732
3733 /* Later than subsys_initcall() because we depend on stuff initialised there */
3734 fs_initcall(af_unix_init);
This page took 0.245679 seconds and 4 git commands to generate.