net/unix/af_unix.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:        Implementation of BSD Unix domain sockets.
4  *
5  * Authors:     Alan Cox, <[email protected]>
6  *
7  * Fixes:
8  *              Linus Torvalds  :       Assorted bug cures.
9  *              Niibe Yutaka    :       async I/O support.
10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
11  *              Alan Cox        :       Limit size of allocated blocks.
12  *              Alan Cox        :       Fixed the stupid socketpair bug.
13  *              Alan Cox        :       BSD compatibility fine tuning.
14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
15  *              Alan Cox        :       Sorted out a proper draft version of
16  *                                      file descriptor passing hacked up from
17  *                                      Mike Shaver's work.
18  *              Marty Leisner   :       Fixes to fd passing
19  *              Nick Nevin      :       recvmsg bugfix.
20  *              Alan Cox        :       Started proper garbage collector
21  *              Heiko Eißfeldt  :       Missing verify_area check
22  *              Alan Cox        :       Started POSIXisms
23  *              Andreas Schwab  :       Replace inode by dentry for proper
24  *                                      reference counting
25  *              Kirk Petersen   :       Made this a module
26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
27  *                                      Lots of bug fixes.
28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
29  *                                      by the above two patches.
30  *           Andrea Arcangeli   :       If possible we block in connect(2)
31  *                                      if the max backlog of the listen socket
32  *                                      has been reached. This won't break
33  *                                      old apps and it will avoid a huge amount
34  *                                      of socks being hashed (this is for unix_gc()
35  *                                      performance reasons).
36  *                                      Security fix that limits the max
37  *                                      number of socks to 2*max_files and
38  *                                      the number of skbs queueable in the
39  *                                      dgram receiver.
40  *              Artur Skawina   :       Hash function optimizations
41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
42  *            Malcolm Beattie   :       Set peercred for socketpair
43  *           Michal Ostrowski   :       Module initialization cleanup.
44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
45  *                                      the core infrastructure is doing that
46  *                                      for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *      [TO FIX]
51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
52  *              other the moment one end closes.
53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *      [NOT TO FIX]
56  *      accept() returns a path name even if the connecting socket has closed
57  *              in the meantime (BSD loses the path and gives up).
58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *      BSD af_unix apparently has connect forgetting to block properly.
62  *              (need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *      Bug fixes and improvements.
66  *              - client shutdown killed server socket.
67  *              - removed all useless cli/sti pairs.
68  *
69  *      Semantic changes/extensions.
70  *              - generic control message passing.
71  *              - SCM_CREDENTIALS control message.
72  *              - "Abstract" (not FS based) socket bindings.
73  *                Abstract names are sequences of bytes (not zero terminated)
74  *                starting with a 0 byte, so that this name space does not intersect
75  *                with BSD names.
76  */
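/* Illustrative sketch of how such an abstract address is typically built from
 * user space (fd is assumed to be an AF_UNIX socket created elsewhere; the
 * name "\0example" is made up).  The leading 0 byte marks the name as
 * abstract and its length is conveyed only through the address length, so no
 * NUL terminator is needed:
 *
 *	struct sockaddr_un addr = { .sun_family = AF_UNIX };
 *
 *	memcpy(addr.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&addr,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */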
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132         unsigned long hash = (unsigned long)sk;
133
134         hash ^= hash >> 16;
135         hash ^= hash >> 8;
136         hash ^= sk->sk_type;
137
138         return hash & UNIX_HASH_MOD;
139 }
140
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143         return i->i_ino & UNIX_HASH_MOD;
144 }
145
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147                                        int addr_len, int type)
148 {
149         __wsum csum = csum_partial(sunaddr, addr_len, 0);
150         unsigned int hash;
151
152         hash = (__force unsigned int)csum_fold(csum);
153         hash ^= hash >> 8;
154         hash ^= type;
155
156         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
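/* Taken together, the helpers above split the hash space in two: unbound
 * sockets and filesystem (BSD) sockets hash into [0, UNIX_HASH_MOD], while
 * abstract sockets hash into [UNIX_HASH_MOD + 1, UNIX_HASH_MOD * 2 + 1].
 * Bound BSD sockets are additionally linked into bsd_socket_buckets, keyed
 * by inode number, so that unix_find_socket_byinode() can find them.
 */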
158
159 static void unix_table_double_lock(struct net *net,
160                                    unsigned int hash1, unsigned int hash2)
161 {
162         if (hash1 == hash2) {
163                 spin_lock(&net->unx.table.locks[hash1]);
164                 return;
165         }
166
167         if (hash1 > hash2)
168                 swap(hash1, hash2);
169
170         spin_lock(&net->unx.table.locks[hash1]);
171         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173
174 static void unix_table_double_unlock(struct net *net,
175                                      unsigned int hash1, unsigned int hash2)
176 {
177         if (hash1 == hash2) {
178                 spin_unlock(&net->unx.table.locks[hash1]);
179                 return;
180         }
181
182         spin_unlock(&net->unx.table.locks[hash1]);
183         spin_unlock(&net->unx.table.locks[hash2]);
184 }
185
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189         UNIXCB(skb).secid = scm->secid;
190 }
191
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194         scm->secid = UNIXCB(skb).secid;
195 }
196
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199         return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210         return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213
214 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
215 {
216         return unix_peer(osk) == sk;
217 }
218
219 static inline int unix_may_send(struct sock *sk, struct sock *osk)
220 {
221         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
222 }
223
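/* These helpers check whether a socket's receive queue has grown beyond its
 * backlog (sk_max_ack_backlog, initialised from sysctl_max_dgram_qlen in
 * unix_create1() and set by unix_listen() for listeners).  They are used for
 * dgram flow control and to decide when a stream connect() has to wait.  The
 * _lockless variant is for callers that do not hold the receive queue lock.
 */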
224 static inline int unix_recvq_full(const struct sock *sk)
225 {
226         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
227 }
228
229 static inline int unix_recvq_full_lockless(const struct sock *sk)
230 {
231         return skb_queue_len_lockless(&sk->sk_receive_queue) >
232                 READ_ONCE(sk->sk_max_ack_backlog);
233 }
234
235 struct sock *unix_peer_get(struct sock *s)
236 {
237         struct sock *peer;
238
239         unix_state_lock(s);
240         peer = unix_peer(s);
241         if (peer)
242                 sock_hold(peer);
243         unix_state_unlock(s);
244         return peer;
245 }
246 EXPORT_SYMBOL_GPL(unix_peer_get);
247
248 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
249                                              int addr_len)
250 {
251         struct unix_address *addr;
252
253         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
254         if (!addr)
255                 return NULL;
256
257         refcount_set(&addr->refcnt, 1);
258         addr->len = addr_len;
259         memcpy(addr->name, sunaddr, addr_len);
260
261         return addr;
262 }
263
264 static inline void unix_release_addr(struct unix_address *addr)
265 {
266         if (refcount_dec_and_test(&addr->refcnt))
267                 kfree(addr);
268 }
269
270 /*
271  *      Check unix socket name:
272  *              - should not be zero length.
273  *              - if it does not start with a zero byte, it should be NUL terminated (FS object)
274  *              - if it starts with a zero byte, it is an abstract name.
275  */
276
277 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
278 {
279         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
280             addr_len > sizeof(*sunaddr))
281                 return -EINVAL;
282
283         if (sunaddr->sun_family != AF_UNIX)
284                 return -EINVAL;
285
286         return 0;
287 }
288
289 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
290 {
291         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
292         short offset = offsetof(struct sockaddr_storage, __data);
293
294         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
295
296         /* This may look like an off by one error but it is a bit more
297          * subtle.  108 is the longest valid AF_UNIX path for a binding.
298          * sun_path[108] doesn't as such exist.  However in kernel space
299          * we are guaranteed that it is a valid memory location in our
300          * kernel address buffer because syscall functions always pass
301          * a pointer of struct sockaddr_storage which has a bigger buffer
302          * than 108.  Also, we must terminate sun_path for strlen() in
303          * getname_kernel().
304          */
305         addr->__data[addr_len - offset] = 0;
306
307         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
308          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
309          * know the actual buffer.
310          */
311         return strlen(addr->__data) + offset + 1;
312 }
313
314 static void __unix_remove_socket(struct sock *sk)
315 {
316         sk_del_node_init(sk);
317 }
318
319 static void __unix_insert_socket(struct net *net, struct sock *sk)
320 {
321         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
322         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
323 }
324
325 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
326                                  struct unix_address *addr, unsigned int hash)
327 {
328         __unix_remove_socket(sk);
329         smp_store_release(&unix_sk(sk)->addr, addr);
330
331         sk->sk_hash = hash;
332         __unix_insert_socket(net, sk);
333 }
334
335 static void unix_remove_socket(struct net *net, struct sock *sk)
336 {
337         spin_lock(&net->unx.table.locks[sk->sk_hash]);
338         __unix_remove_socket(sk);
339         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
340 }
341
342 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
343 {
344         spin_lock(&net->unx.table.locks[sk->sk_hash]);
345         __unix_insert_socket(net, sk);
346         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
347 }
348
349 static void unix_insert_bsd_socket(struct sock *sk)
350 {
351         spin_lock(&bsd_socket_locks[sk->sk_hash]);
352         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
353         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
354 }
355
356 static void unix_remove_bsd_socket(struct sock *sk)
357 {
358         if (!hlist_unhashed(&sk->sk_bind_node)) {
359                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
360                 __sk_del_bind_node(sk);
361                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
362
363                 sk_node_init(&sk->sk_bind_node);
364         }
365 }
366
367 static struct sock *__unix_find_socket_byname(struct net *net,
368                                               struct sockaddr_un *sunname,
369                                               int len, unsigned int hash)
370 {
371         struct sock *s;
372
373         sk_for_each(s, &net->unx.table.buckets[hash]) {
374                 struct unix_sock *u = unix_sk(s);
375
376                 if (u->addr->len == len &&
377                     !memcmp(u->addr->name, sunname, len))
378                         return s;
379         }
380         return NULL;
381 }
382
383 static inline struct sock *unix_find_socket_byname(struct net *net,
384                                                    struct sockaddr_un *sunname,
385                                                    int len, unsigned int hash)
386 {
387         struct sock *s;
388
389         spin_lock(&net->unx.table.locks[hash]);
390         s = __unix_find_socket_byname(net, sunname, len, hash);
391         if (s)
392                 sock_hold(s);
393         spin_unlock(&net->unx.table.locks[hash]);
394         return s;
395 }
396
397 static struct sock *unix_find_socket_byinode(struct inode *i)
398 {
399         unsigned int hash = unix_bsd_hash(i);
400         struct sock *s;
401
402         spin_lock(&bsd_socket_locks[hash]);
403         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
404                 struct dentry *dentry = unix_sk(s)->path.dentry;
405
406                 if (dentry && d_backing_inode(dentry) == i) {
407                         sock_hold(s);
408                         spin_unlock(&bsd_socket_locks[hash]);
409                         return s;
410                 }
411         }
412         spin_unlock(&bsd_socket_locks[hash]);
413         return NULL;
414 }
415
416 /* Support code for asymmetrically connected dgram sockets
417  *
418  * If a datagram socket is connected to a socket not itself connected
419  * to the first socket (eg, /dev/log), clients may only enqueue more
420  * messages if the present receive queue of the server socket is not
421  * "too large". This means there's a second writeability condition
422  * poll and sendmsg need to test. The dgram recv code will do a wake
423  * up on the peer_wait wait queue of a socket upon reception of a
424  * datagram which needs to be propagated to sleeping would-be writers
425  * since these might not have sent anything so far. This can't be
426  * accomplished via poll_wait because the lifetime of the server
427  * socket might be less than that of its clients if these break their
428  * association with it or if the server socket is closed while clients
429  * are still connected to it and there's no way to inform "a polling
430  * implementation" that it should let go of a certain wait queue.
431  *
432  * In order to propagate a wake up, a wait_queue_entry_t of the client
433  * socket is enqueued on the peer_wait queue of the server socket
434  * whose wake function does a wake_up on the ordinary client socket
435  * wait queue. This connection is established whenever a write (or
436  * poll for write) hits the flow control condition and is broken when the
437  * association to the server socket is dissolved or after a wake up
438  * was relayed.
439  */
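/* In the functions below, unix_dgram_peer_wake_connect() enqueues the
 * client's wait_queue_entry on the peer's peer_wait queue,
 * unix_dgram_peer_wake_relay() is the wake function that forwards the
 * wake-up to the client's own wait queue and removes the entry, and
 * unix_dgram_peer_wake_disconnect() tears the association down again.
 */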
440
441 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
442                                       void *key)
443 {
444         struct unix_sock *u;
445         wait_queue_head_t *u_sleep;
446
447         u = container_of(q, struct unix_sock, peer_wake);
448
449         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
450                             q);
451         u->peer_wake.private = NULL;
452
453         /* relaying can only happen while the wq still exists */
454         u_sleep = sk_sleep(&u->sk);
455         if (u_sleep)
456                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
457
458         return 0;
459 }
460
461 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
462 {
463         struct unix_sock *u, *u_other;
464         int rc;
465
466         u = unix_sk(sk);
467         u_other = unix_sk(other);
468         rc = 0;
469         spin_lock(&u_other->peer_wait.lock);
470
471         if (!u->peer_wake.private) {
472                 u->peer_wake.private = other;
473                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
474
475                 rc = 1;
476         }
477
478         spin_unlock(&u_other->peer_wait.lock);
479         return rc;
480 }
481
482 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
483                                             struct sock *other)
484 {
485         struct unix_sock *u, *u_other;
486
487         u = unix_sk(sk);
488         u_other = unix_sk(other);
489         spin_lock(&u_other->peer_wait.lock);
490
491         if (u->peer_wake.private == other) {
492                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
493                 u->peer_wake.private = NULL;
494         }
495
496         spin_unlock(&u_other->peer_wait.lock);
497 }
498
499 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
500                                                    struct sock *other)
501 {
502         unix_dgram_peer_wake_disconnect(sk, other);
503         wake_up_interruptible_poll(sk_sleep(sk),
504                                    EPOLLOUT |
505                                    EPOLLWRNORM |
506                                    EPOLLWRBAND);
507 }
508
509 /* preconditions:
510  *      - unix_peer(sk) == other
511  *      - association is stable
512  */
513 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
514 {
515         int connected;
516
517         connected = unix_dgram_peer_wake_connect(sk, other);
518
519         /* If other is SOCK_DEAD, we want to make sure we signal
520          * POLLOUT, such that a subsequent write() can get a
521          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
522          * to other and it's full, we will hang waiting for POLLOUT.
523          */
524         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
525                 return 1;
526
527         if (connected)
528                 unix_dgram_peer_wake_disconnect(sk, other);
529
530         return 0;
531 }
532
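/* A socket counts as writable when it is not listening and its outstanding
 * write memory is at most a quarter of sk_sndbuf.
 */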
533 static int unix_writable(const struct sock *sk)
534 {
535         return sk->sk_state != TCP_LISTEN &&
536                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
537 }
538
539 static void unix_write_space(struct sock *sk)
540 {
541         struct socket_wq *wq;
542
543         rcu_read_lock();
544         if (unix_writable(sk)) {
545                 wq = rcu_dereference(sk->sk_wq);
546                 if (skwq_has_sleeper(wq))
547                         wake_up_interruptible_sync_poll(&wq->wait,
548                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
549                 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
550         }
551         rcu_read_unlock();
552 }
553
554 /* When a dgram socket disconnects (or changes its peer), we clear its receive
555  * queue of packets that arrived from the previous peer. First, this allows
556  * flow control to be based only on wmem_alloc; second, a sk connected to a
557  * peer may receive messages only from that peer. */
558 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
559 {
560         if (!skb_queue_empty(&sk->sk_receive_queue)) {
561                 skb_queue_purge(&sk->sk_receive_queue);
562                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
563
564                 /* If one link of a bidirectional dgram pipe is disconnected,
565                  * we signal an error. Messages are lost. Do not do this
566                  * when the peer was not connected to us.
567                  */
568                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
569                         WRITE_ONCE(other->sk_err, ECONNRESET);
570                         sk_error_report(other);
571                 }
572         }
573         other->sk_state = TCP_CLOSE;
574 }
575
576 static void unix_sock_destructor(struct sock *sk)
577 {
578         struct unix_sock *u = unix_sk(sk);
579
580         skb_queue_purge(&sk->sk_receive_queue);
581
582         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
583         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
584         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
585         if (!sock_flag(sk, SOCK_DEAD)) {
586                 pr_info("Attempt to release alive unix socket: %p\n", sk);
587                 return;
588         }
589
590         if (u->addr)
591                 unix_release_addr(u->addr);
592
593         atomic_long_dec(&unix_nr_socks);
594         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
595 #ifdef UNIX_REFCNT_DEBUG
596         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
597                 atomic_long_read(&unix_nr_socks));
598 #endif
599 }
600
601 static void unix_release_sock(struct sock *sk, int embrion)
602 {
603         struct unix_sock *u = unix_sk(sk);
604         struct sock *skpair;
605         struct sk_buff *skb;
606         struct path path;
607         int state;
608
609         unix_remove_socket(sock_net(sk), sk);
610         unix_remove_bsd_socket(sk);
611
612         /* Clear state */
613         unix_state_lock(sk);
614         sock_orphan(sk);
615         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
616         path         = u->path;
617         u->path.dentry = NULL;
618         u->path.mnt = NULL;
619         state = sk->sk_state;
620         sk->sk_state = TCP_CLOSE;
621
622         skpair = unix_peer(sk);
623         unix_peer(sk) = NULL;
624
625         unix_state_unlock(sk);
626
627 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
628         if (u->oob_skb) {
629                 kfree_skb(u->oob_skb);
630                 u->oob_skb = NULL;
631         }
632 #endif
633
634         wake_up_interruptible_all(&u->peer_wait);
635
636         if (skpair != NULL) {
637                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
638                         unix_state_lock(skpair);
639                         /* No more writes */
640                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
641                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
642                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
643                         unix_state_unlock(skpair);
644                         skpair->sk_state_change(skpair);
645                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
646                 }
647
648                 unix_dgram_peer_wake_disconnect(sk, skpair);
649                 sock_put(skpair); /* It may now die */
650         }
651
652         /* Try to flush out this socket. Throw out buffers at least */
653
654         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
655                 if (state == TCP_LISTEN)
656                         unix_release_sock(skb->sk, 1);
657                 /* passed fds are erased in the kfree_skb hook        */
658                 UNIXCB(skb).consumed = skb->len;
659                 kfree_skb(skb);
660         }
661
662         if (path.dentry)
663                 path_put(&path);
664
665         sock_put(sk);
666
667         /* ---- Socket is dead now and most probably destroyed ---- */
668
669         /*
670          * Fixme: BSD difference: In BSD all sockets connected to us get
671          *        ECONNRESET and we die on the spot. In Linux we behave
672          *        like files and pipes do and wait for the last
673          *        dereference.
674          *
675          * Can't we simply set sock->err?
676          *
677          *        What the above comment does talk about? --ANK(980817)
678          */
679
680         if (READ_ONCE(unix_tot_inflight))
681                 unix_gc();              /* Garbage collect fds */
682 }
683
684 static void init_peercred(struct sock *sk)
685 {
686         const struct cred *old_cred;
687         struct pid *old_pid;
688
689         spin_lock(&sk->sk_peer_lock);
690         old_pid = sk->sk_peer_pid;
691         old_cred = sk->sk_peer_cred;
692         sk->sk_peer_pid  = get_pid(task_tgid(current));
693         sk->sk_peer_cred = get_current_cred();
694         spin_unlock(&sk->sk_peer_lock);
695
696         put_pid(old_pid);
697         put_cred(old_cred);
698 }
699
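/* Inherit the peer credentials of peersk (the listener, in the
 * unix_stream_connect() case).  The two sk_peer_locks are taken in pointer
 * order so that concurrent calls on the same pair of sockets use a
 * consistent lock hierarchy and cannot deadlock.
 */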
700 static void copy_peercred(struct sock *sk, struct sock *peersk)
701 {
702         const struct cred *old_cred;
703         struct pid *old_pid;
704
705         if (sk < peersk) {
706                 spin_lock(&sk->sk_peer_lock);
707                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
708         } else {
709                 spin_lock(&peersk->sk_peer_lock);
710                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
711         }
712         old_pid = sk->sk_peer_pid;
713         old_cred = sk->sk_peer_cred;
714         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
715         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
716
717         spin_unlock(&sk->sk_peer_lock);
718         spin_unlock(&peersk->sk_peer_lock);
719
720         put_pid(old_pid);
721         put_cred(old_cred);
722 }
723
724 static int unix_listen(struct socket *sock, int backlog)
725 {
726         int err;
727         struct sock *sk = sock->sk;
728         struct unix_sock *u = unix_sk(sk);
729
730         err = -EOPNOTSUPP;
731         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
732                 goto out;       /* Only stream/seqpacket sockets accept */
733         err = -EINVAL;
734         if (!u->addr)
735                 goto out;       /* No listens on an unbound socket */
736         unix_state_lock(sk);
737         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
738                 goto out_unlock;
739         if (backlog > sk->sk_max_ack_backlog)
740                 wake_up_interruptible_all(&u->peer_wait);
741         sk->sk_max_ack_backlog  = backlog;
742         sk->sk_state            = TCP_LISTEN;
743         /* set credentials so connect can copy them */
744         init_peercred(sk);
745         err = 0;
746
747 out_unlock:
748         unix_state_unlock(sk);
749 out:
750         return err;
751 }
752
753 static int unix_release(struct socket *);
754 static int unix_bind(struct socket *, struct sockaddr *, int);
755 static int unix_stream_connect(struct socket *, struct sockaddr *,
756                                int addr_len, int flags);
757 static int unix_socketpair(struct socket *, struct socket *);
758 static int unix_accept(struct socket *, struct socket *, int, bool);
759 static int unix_getname(struct socket *, struct sockaddr *, int);
760 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
761 static __poll_t unix_dgram_poll(struct file *, struct socket *,
762                                     poll_table *);
763 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
764 #ifdef CONFIG_COMPAT
765 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
766 #endif
767 static int unix_shutdown(struct socket *, int);
768 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
769 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
770 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
771                                        struct pipe_inode_info *, size_t size,
772                                        unsigned int flags);
773 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
774 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
775 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
776 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
777 static int unix_dgram_connect(struct socket *, struct sockaddr *,
778                               int, int);
779 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
780 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
781                                   int);
782
783 #ifdef CONFIG_PROC_FS
784 static int unix_count_nr_fds(struct sock *sk)
785 {
786         struct sk_buff *skb;
787         struct unix_sock *u;
788         int nr_fds = 0;
789
790         spin_lock(&sk->sk_receive_queue.lock);
791         skb = skb_peek(&sk->sk_receive_queue);
792         while (skb) {
793                 u = unix_sk(skb->sk);
794                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
795                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
796         }
797         spin_unlock(&sk->sk_receive_queue.lock);
798
799         return nr_fds;
800 }
801
802 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
803 {
804         struct sock *sk = sock->sk;
805         unsigned char s_state;
806         struct unix_sock *u;
807         int nr_fds = 0;
808
809         if (sk) {
810                 s_state = READ_ONCE(sk->sk_state);
811                 u = unix_sk(sk);
812
813                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
814                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
815                  * SOCK_DGRAM is ordinary. So, no lock is needed.
816                  */
817                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
818                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
819                 else if (s_state == TCP_LISTEN)
820                         nr_fds = unix_count_nr_fds(sk);
821
822                 seq_printf(m, "scm_fds: %u\n", nr_fds);
823         }
824 }
825 #else
826 #define unix_show_fdinfo NULL
827 #endif
828
829 static const struct proto_ops unix_stream_ops = {
830         .family =       PF_UNIX,
831         .owner =        THIS_MODULE,
832         .release =      unix_release,
833         .bind =         unix_bind,
834         .connect =      unix_stream_connect,
835         .socketpair =   unix_socketpair,
836         .accept =       unix_accept,
837         .getname =      unix_getname,
838         .poll =         unix_poll,
839         .ioctl =        unix_ioctl,
840 #ifdef CONFIG_COMPAT
841         .compat_ioctl = unix_compat_ioctl,
842 #endif
843         .listen =       unix_listen,
844         .shutdown =     unix_shutdown,
845         .sendmsg =      unix_stream_sendmsg,
846         .recvmsg =      unix_stream_recvmsg,
847         .read_skb =     unix_stream_read_skb,
848         .mmap =         sock_no_mmap,
849         .splice_read =  unix_stream_splice_read,
850         .set_peek_off = sk_set_peek_off,
851         .show_fdinfo =  unix_show_fdinfo,
852 };
853
854 static const struct proto_ops unix_dgram_ops = {
855         .family =       PF_UNIX,
856         .owner =        THIS_MODULE,
857         .release =      unix_release,
858         .bind =         unix_bind,
859         .connect =      unix_dgram_connect,
860         .socketpair =   unix_socketpair,
861         .accept =       sock_no_accept,
862         .getname =      unix_getname,
863         .poll =         unix_dgram_poll,
864         .ioctl =        unix_ioctl,
865 #ifdef CONFIG_COMPAT
866         .compat_ioctl = unix_compat_ioctl,
867 #endif
868         .listen =       sock_no_listen,
869         .shutdown =     unix_shutdown,
870         .sendmsg =      unix_dgram_sendmsg,
871         .read_skb =     unix_read_skb,
872         .recvmsg =      unix_dgram_recvmsg,
873         .mmap =         sock_no_mmap,
874         .set_peek_off = sk_set_peek_off,
875         .show_fdinfo =  unix_show_fdinfo,
876 };
877
878 static const struct proto_ops unix_seqpacket_ops = {
879         .family =       PF_UNIX,
880         .owner =        THIS_MODULE,
881         .release =      unix_release,
882         .bind =         unix_bind,
883         .connect =      unix_stream_connect,
884         .socketpair =   unix_socketpair,
885         .accept =       unix_accept,
886         .getname =      unix_getname,
887         .poll =         unix_dgram_poll,
888         .ioctl =        unix_ioctl,
889 #ifdef CONFIG_COMPAT
890         .compat_ioctl = unix_compat_ioctl,
891 #endif
892         .listen =       unix_listen,
893         .shutdown =     unix_shutdown,
894         .sendmsg =      unix_seqpacket_sendmsg,
895         .recvmsg =      unix_seqpacket_recvmsg,
896         .mmap =         sock_no_mmap,
897         .set_peek_off = sk_set_peek_off,
898         .show_fdinfo =  unix_show_fdinfo,
899 };
900
901 static void unix_close(struct sock *sk, long timeout)
902 {
903         /* Nothing to do here, unix socket does not need a ->close().
904          * This is merely for sockmap.
905          */
906 }
907
908 static void unix_unhash(struct sock *sk)
909 {
910         /* Nothing to do here, unix socket does not need a ->unhash().
911          * This is merely for sockmap.
912          */
913 }
914
915 static bool unix_bpf_bypass_getsockopt(int level, int optname)
916 {
917         if (level == SOL_SOCKET) {
918                 switch (optname) {
919                 case SO_PEERPIDFD:
920                         return true;
921                 default:
922                         return false;
923                 }
924         }
925
926         return false;
927 }
928
929 struct proto unix_dgram_proto = {
930         .name                   = "UNIX",
931         .owner                  = THIS_MODULE,
932         .obj_size               = sizeof(struct unix_sock),
933         .close                  = unix_close,
934         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
935 #ifdef CONFIG_BPF_SYSCALL
936         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
937 #endif
938 };
939
940 struct proto unix_stream_proto = {
941         .name                   = "UNIX-STREAM",
942         .owner                  = THIS_MODULE,
943         .obj_size               = sizeof(struct unix_sock),
944         .close                  = unix_close,
945         .unhash                 = unix_unhash,
946         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
947 #ifdef CONFIG_BPF_SYSCALL
948         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
949 #endif
950 };
951
952 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
953 {
954         struct unix_sock *u;
955         struct sock *sk;
956         int err;
957
958         atomic_long_inc(&unix_nr_socks);
959         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
960                 err = -ENFILE;
961                 goto err;
962         }
963
964         if (type == SOCK_STREAM)
965                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
966         else /* dgram and seqpacket */
967                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
968
969         if (!sk) {
970                 err = -ENOMEM;
971                 goto err;
972         }
973
974         sock_init_data(sock, sk);
975
976         sk->sk_hash             = unix_unbound_hash(sk);
977         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
978         sk->sk_write_space      = unix_write_space;
979         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
980         sk->sk_destruct         = unix_sock_destructor;
981         u = unix_sk(sk);
982         u->listener = NULL;
983         u->vertex = NULL;
984         u->path.dentry = NULL;
985         u->path.mnt = NULL;
986         spin_lock_init(&u->lock);
987         mutex_init(&u->iolock); /* single task reading lock */
988         mutex_init(&u->bindlock); /* single task binding lock */
989         init_waitqueue_head(&u->peer_wait);
990         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
991         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
992         unix_insert_unbound_socket(net, sk);
993
994         sock_prot_inuse_add(net, sk->sk_prot, 1);
995
996         return sk;
997
998 err:
999         atomic_long_dec(&unix_nr_socks);
1000         return ERR_PTR(err);
1001 }
1002
1003 static int unix_create(struct net *net, struct socket *sock, int protocol,
1004                        int kern)
1005 {
1006         struct sock *sk;
1007
1008         if (protocol && protocol != PF_UNIX)
1009                 return -EPROTONOSUPPORT;
1010
1011         sock->state = SS_UNCONNECTED;
1012
1013         switch (sock->type) {
1014         case SOCK_STREAM:
1015                 sock->ops = &unix_stream_ops;
1016                 break;
1017                 /*
1018                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1019                  *      nothing uses it.
1020                  */
1021         case SOCK_RAW:
1022                 sock->type = SOCK_DGRAM;
1023                 fallthrough;
1024         case SOCK_DGRAM:
1025                 sock->ops = &unix_dgram_ops;
1026                 break;
1027         case SOCK_SEQPACKET:
1028                 sock->ops = &unix_seqpacket_ops;
1029                 break;
1030         default:
1031                 return -ESOCKTNOSUPPORT;
1032         }
1033
1034         sk = unix_create1(net, sock, kern, sock->type);
1035         if (IS_ERR(sk))
1036                 return PTR_ERR(sk);
1037
1038         return 0;
1039 }
1040
1041 static int unix_release(struct socket *sock)
1042 {
1043         struct sock *sk = sock->sk;
1044
1045         if (!sk)
1046                 return 0;
1047
1048         sk->sk_prot->close(sk, 0);
1049         unix_release_sock(sk, 0);
1050         sock->sk = NULL;
1051
1052         return 0;
1053 }
1054
1055 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1056                                   int type)
1057 {
1058         struct inode *inode;
1059         struct path path;
1060         struct sock *sk;
1061         int err;
1062
1063         unix_mkname_bsd(sunaddr, addr_len);
1064         err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1065         if (err)
1066                 goto fail;
1067
1068         err = path_permission(&path, MAY_WRITE);
1069         if (err)
1070                 goto path_put;
1071
1072         err = -ECONNREFUSED;
1073         inode = d_backing_inode(path.dentry);
1074         if (!S_ISSOCK(inode->i_mode))
1075                 goto path_put;
1076
1077         sk = unix_find_socket_byinode(inode);
1078         if (!sk)
1079                 goto path_put;
1080
1081         err = -EPROTOTYPE;
1082         if (sk->sk_type == type)
1083                 touch_atime(&path);
1084         else
1085                 goto sock_put;
1086
1087         path_put(&path);
1088
1089         return sk;
1090
1091 sock_put:
1092         sock_put(sk);
1093 path_put:
1094         path_put(&path);
1095 fail:
1096         return ERR_PTR(err);
1097 }
1098
1099 static struct sock *unix_find_abstract(struct net *net,
1100                                        struct sockaddr_un *sunaddr,
1101                                        int addr_len, int type)
1102 {
1103         unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1104         struct dentry *dentry;
1105         struct sock *sk;
1106
1107         sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1108         if (!sk)
1109                 return ERR_PTR(-ECONNREFUSED);
1110
1111         dentry = unix_sk(sk)->path.dentry;
1112         if (dentry)
1113                 touch_atime(&unix_sk(sk)->path);
1114
1115         return sk;
1116 }
1117
1118 static struct sock *unix_find_other(struct net *net,
1119                                     struct sockaddr_un *sunaddr,
1120                                     int addr_len, int type)
1121 {
1122         struct sock *sk;
1123
1124         if (sunaddr->sun_path[0])
1125                 sk = unix_find_bsd(sunaddr, addr_len, type);
1126         else
1127                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1128
1129         return sk;
1130 }
1131
1132 static int unix_autobind(struct sock *sk)
1133 {
1134         unsigned int new_hash, old_hash = sk->sk_hash;
1135         struct unix_sock *u = unix_sk(sk);
1136         struct net *net = sock_net(sk);
1137         struct unix_address *addr;
1138         u32 lastnum, ordernum;
1139         int err;
1140
1141         err = mutex_lock_interruptible(&u->bindlock);
1142         if (err)
1143                 return err;
1144
1145         if (u->addr)
1146                 goto out;
1147
1148         err = -ENOMEM;
1149         addr = kzalloc(sizeof(*addr) +
1150                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1151         if (!addr)
1152                 goto out;
1153
1154         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1155         addr->name->sun_family = AF_UNIX;
1156         refcount_set(&addr->refcnt, 1);
1157
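        /* Pick a random 20-bit starting point and probe names of the form
         * "\0xxxxx" (five hex digits) until an unused one is found, giving
         * up with -ENOSPC once the whole space has been scanned.
         */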
1158         ordernum = get_random_u32();
1159         lastnum = ordernum & 0xFFFFF;
1160 retry:
1161         ordernum = (ordernum + 1) & 0xFFFFF;
1162         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1163
1164         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1165         unix_table_double_lock(net, old_hash, new_hash);
1166
1167         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1168                 unix_table_double_unlock(net, old_hash, new_hash);
1169
1170                 /* __unix_find_socket_byname() may take a long time if many names
1171                  * are already in use.
1172                  */
1173                 cond_resched();
1174
1175                 if (ordernum == lastnum) {
1176                         /* Give up if all names seem to be in use. */
1177                         err = -ENOSPC;
1178                         unix_release_addr(addr);
1179                         goto out;
1180                 }
1181
1182                 goto retry;
1183         }
1184
1185         __unix_set_addr_hash(net, sk, addr, new_hash);
1186         unix_table_double_unlock(net, old_hash, new_hash);
1187         err = 0;
1188
1189 out:    mutex_unlock(&u->bindlock);
1190         return err;
1191 }
1192
1193 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1194                          int addr_len)
1195 {
1196         umode_t mode = S_IFSOCK |
1197                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1198         unsigned int new_hash, old_hash = sk->sk_hash;
1199         struct unix_sock *u = unix_sk(sk);
1200         struct net *net = sock_net(sk);
1201         struct mnt_idmap *idmap;
1202         struct unix_address *addr;
1203         struct dentry *dentry;
1204         struct path parent;
1205         int err;
1206
1207         addr_len = unix_mkname_bsd(sunaddr, addr_len);
1208         addr = unix_create_addr(sunaddr, addr_len);
1209         if (!addr)
1210                 return -ENOMEM;
1211
1212         /*
1213          * Get the parent directory, calculate the hash for the last
1214          * component.
1215          */
1216         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1217         if (IS_ERR(dentry)) {
1218                 err = PTR_ERR(dentry);
1219                 goto out;
1220         }
1221
1222         /*
1223          * All right, let's create it.
1224          */
1225         idmap = mnt_idmap(parent.mnt);
1226         err = security_path_mknod(&parent, dentry, mode, 0);
1227         if (!err)
1228                 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1229         if (err)
1230                 goto out_path;
1231         err = mutex_lock_interruptible(&u->bindlock);
1232         if (err)
1233                 goto out_unlink;
1234         if (u->addr)
1235                 goto out_unlock;
1236
1237         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1238         unix_table_double_lock(net, old_hash, new_hash);
1239         u->path.mnt = mntget(parent.mnt);
1240         u->path.dentry = dget(dentry);
1241         __unix_set_addr_hash(net, sk, addr, new_hash);
1242         unix_table_double_unlock(net, old_hash, new_hash);
1243         unix_insert_bsd_socket(sk);
1244         mutex_unlock(&u->bindlock);
1245         done_path_create(&parent, dentry);
1246         return 0;
1247
1248 out_unlock:
1249         mutex_unlock(&u->bindlock);
1250         err = -EINVAL;
1251 out_unlink:
1252         /* failed after successful mknod?  unlink what we'd created... */
1253         vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1254 out_path:
1255         done_path_create(&parent, dentry);
1256 out:
1257         unix_release_addr(addr);
1258         return err == -EEXIST ? -EADDRINUSE : err;
1259 }
1260
1261 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1262                               int addr_len)
1263 {
1264         unsigned int new_hash, old_hash = sk->sk_hash;
1265         struct unix_sock *u = unix_sk(sk);
1266         struct net *net = sock_net(sk);
1267         struct unix_address *addr;
1268         int err;
1269
1270         addr = unix_create_addr(sunaddr, addr_len);
1271         if (!addr)
1272                 return -ENOMEM;
1273
1274         err = mutex_lock_interruptible(&u->bindlock);
1275         if (err)
1276                 goto out;
1277
1278         if (u->addr) {
1279                 err = -EINVAL;
1280                 goto out_mutex;
1281         }
1282
1283         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1284         unix_table_double_lock(net, old_hash, new_hash);
1285
1286         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1287                 goto out_spin;
1288
1289         __unix_set_addr_hash(net, sk, addr, new_hash);
1290         unix_table_double_unlock(net, old_hash, new_hash);
1291         mutex_unlock(&u->bindlock);
1292         return 0;
1293
1294 out_spin:
1295         unix_table_double_unlock(net, old_hash, new_hash);
1296         err = -EADDRINUSE;
1297 out_mutex:
1298         mutex_unlock(&u->bindlock);
1299 out:
1300         unix_release_addr(addr);
1301         return err;
1302 }
1303
1304 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1305 {
1306         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1307         struct sock *sk = sock->sk;
1308         int err;
1309
1310         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1311             sunaddr->sun_family == AF_UNIX)
1312                 return unix_autobind(sk);
1313
1314         err = unix_validate_addr(sunaddr, addr_len);
1315         if (err)
1316                 return err;
1317
1318         if (sunaddr->sun_path[0])
1319                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1320         else
1321                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1322
1323         return err;
1324 }
1325
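/* Lock two sockets' state locks in pointer order (with a lockdep nesting
 * annotation for the second) so that concurrent double-locks on the same
 * pair cannot deadlock; unix_state_double_unlock() is the counterpart.
 */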
1326 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1327 {
1328         if (unlikely(sk1 == sk2) || !sk2) {
1329                 unix_state_lock(sk1);
1330                 return;
1331         }
1332         if (sk1 > sk2)
1333                 swap(sk1, sk2);
1334
1335         unix_state_lock(sk1);
1336         unix_state_lock_nested(sk2, U_LOCK_SECOND);
1337 }
1338
1339 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1340 {
1341         if (unlikely(sk1 == sk2) || !sk2) {
1342                 unix_state_unlock(sk1);
1343                 return;
1344         }
1345         unix_state_unlock(sk1);
1346         unix_state_unlock(sk2);
1347 }
1348
1349 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1350                               int alen, int flags)
1351 {
1352         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1353         struct sock *sk = sock->sk;
1354         struct sock *other;
1355         int err;
1356
1357         err = -EINVAL;
1358         if (alen < offsetofend(struct sockaddr, sa_family))
1359                 goto out;
1360
1361         if (addr->sa_family != AF_UNSPEC) {
1362                 err = unix_validate_addr(sunaddr, alen);
1363                 if (err)
1364                         goto out;
1365
1366                 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1367                 if (err)
1368                         goto out;
1369
1370                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1371                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1372                     !unix_sk(sk)->addr) {
1373                         err = unix_autobind(sk);
1374                         if (err)
1375                                 goto out;
1376                 }
1377
1378 restart:
1379                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1380                 if (IS_ERR(other)) {
1381                         err = PTR_ERR(other);
1382                         goto out;
1383                 }
1384
1385                 unix_state_double_lock(sk, other);
1386
1387                 /* Apparently VFS overslept socket death. Retry. */
1388                 if (sock_flag(other, SOCK_DEAD)) {
1389                         unix_state_double_unlock(sk, other);
1390                         sock_put(other);
1391                         goto restart;
1392                 }
1393
1394                 err = -EPERM;
1395                 if (!unix_may_send(sk, other))
1396                         goto out_unlock;
1397
1398                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1399                 if (err)
1400                         goto out_unlock;
1401
1402                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1403         } else {
1404                 /*
1405                  *      1003.1g breaking connected state with AF_UNSPEC
1406                  */
1407                 other = NULL;
1408                 unix_state_double_lock(sk, other);
1409         }
1410
1411         /*
1412          * If it was connected, reconnect.
1413          */
1414         if (unix_peer(sk)) {
1415                 struct sock *old_peer = unix_peer(sk);
1416
1417                 unix_peer(sk) = other;
1418                 if (!other)
1419                         sk->sk_state = TCP_CLOSE;
1420                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1421
1422                 unix_state_double_unlock(sk, other);
1423
1424                 if (other != old_peer)
1425                         unix_dgram_disconnected(sk, old_peer);
1426                 sock_put(old_peer);
1427         } else {
1428                 unix_peer(sk) = other;
1429                 unix_state_double_unlock(sk, other);
1430         }
1431
1432         return 0;
1433
1434 out_unlock:
1435         unix_state_double_unlock(sk, other);
1436         sock_put(other);
1437 out:
1438         return err;
1439 }
1440
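/* Called with the peer's state lock held (and released here, as the
 * __releases annotation says): sleep on the peer's peer_wait queue while it
 * is still alive, not shut down for receiving, and its receive queue is over
 * the backlog limit.  Returns the remaining timeout so the caller, such as
 * unix_stream_connect() below, can retry or give up.
 */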
1441 static long unix_wait_for_peer(struct sock *other, long timeo)
1442         __releases(&unix_sk(other)->lock)
1443 {
1444         struct unix_sock *u = unix_sk(other);
1445         int sched;
1446         DEFINE_WAIT(wait);
1447
1448         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1449
1450         sched = !sock_flag(other, SOCK_DEAD) &&
1451                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1452                 unix_recvq_full_lockless(other);
1453
1454         unix_state_unlock(other);
1455
1456         if (sched)
1457                 timeo = schedule_timeout(timeo);
1458
1459         finish_wait(&u->peer_wait, &wait);
1460         return timeo;
1461 }
1462
1463 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1464                                int addr_len, int flags)
1465 {
1466         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1467         struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1468         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1469         struct net *net = sock_net(sk);
1470         struct sk_buff *skb = NULL;
1471         long timeo;
1472         int err;
1473         int st;
1474
1475         err = unix_validate_addr(sunaddr, addr_len);
1476         if (err)
1477                 goto out;
1478
1479         err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1480         if (err)
1481                 goto out;
1482
1483         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1484              test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1485                 err = unix_autobind(sk);
1486                 if (err)
1487                         goto out;
1488         }
1489
1490         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1491
1492         /* First of all allocate resources.
1493            If we do this after the state is locked,
1494            we will have to recheck everything again in any case.
1495          */
1496
1497         /* create new sock for complete connection */
1498         newsk = unix_create1(net, NULL, 0, sock->type);
1499         if (IS_ERR(newsk)) {
1500                 err = PTR_ERR(newsk);
1501                 newsk = NULL;
1502                 goto out;
1503         }
1504
1505         err = -ENOMEM;
1506
1507         /* Allocate skb for sending to listening sock */
1508         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1509         if (skb == NULL)
1510                 goto out;
1511
1512 restart:
1513         /*  Find listening sock. */
1514         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1515         if (IS_ERR(other)) {
1516                 err = PTR_ERR(other);
1517                 other = NULL;
1518                 goto out;
1519         }
1520
1521         /* Latch state of peer */
1522         unix_state_lock(other);
1523
1524         /* Apparently VFS overslept socket death. Retry. */
1525         if (sock_flag(other, SOCK_DEAD)) {
1526                 unix_state_unlock(other);
1527                 sock_put(other);
1528                 goto restart;
1529         }
1530
1531         err = -ECONNREFUSED;
1532         if (other->sk_state != TCP_LISTEN)
1533                 goto out_unlock;
1534         if (other->sk_shutdown & RCV_SHUTDOWN)
1535                 goto out_unlock;
1536
1537         if (unix_recvq_full(other)) {
1538                 err = -EAGAIN;
1539                 if (!timeo)
1540                         goto out_unlock;
1541
1542                 timeo = unix_wait_for_peer(other, timeo);
1543
1544                 err = sock_intr_errno(timeo);
1545                 if (signal_pending(current))
1546                         goto out;
1547                 sock_put(other);
1548                 goto restart;
1549         }
1550
1551         /* Latch our state.
1552
1553            This is a tricky spot. We need to grab our own state lock
1554            without dropping the lock on the peer, which is dangerous
1555            because a deadlock is possible. The connect-to-self and
1556            simultaneous-connect cases are eliminated by checking the
1557            socket state: other is TCP_LISTEN, and if sk were TCP_LISTEN
1558            we would have caught it before trying to grab the lock.
1559
1560            We still have to recheck our state once the lock is held.
1561          */
1562         st = sk->sk_state;
1563
1564         switch (st) {
1565         case TCP_CLOSE:
1566                 /* This is ok... continue with connect */
1567                 break;
1568         case TCP_ESTABLISHED:
1569                 /* Socket is already connected */
1570                 err = -EISCONN;
1571                 goto out_unlock;
1572         default:
1573                 err = -EINVAL;
1574                 goto out_unlock;
1575         }
1576
1577         unix_state_lock_nested(sk, U_LOCK_SECOND);
1578
1579         if (sk->sk_state != st) {
1580                 unix_state_unlock(sk);
1581                 unix_state_unlock(other);
1582                 sock_put(other);
1583                 goto restart;
1584         }
1585
1586         err = security_unix_stream_connect(sk, other, newsk);
1587         if (err) {
1588                 unix_state_unlock(sk);
1589                 goto out_unlock;
1590         }
1591
1592         /* The way is open! Quickly set all the necessary fields... */
1593
1594         sock_hold(sk);
1595         unix_peer(newsk)        = sk;
1596         newsk->sk_state         = TCP_ESTABLISHED;
1597         newsk->sk_type          = sk->sk_type;
1598         init_peercred(newsk);
1599         newu = unix_sk(newsk);
1600         newu->listener = other;
1601         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1602         otheru = unix_sk(other);
1603
1604         /* copy address information from the listening sock to the new sock
1605          *
1606          * The contents of *(otheru->addr) and otheru->path
1607          * are seen fully set up here, since we have found
1608          * otheru in hash under its lock.  Insertion into the
1609          * hash chain we'd found it in had been done in an
1610          * earlier critical area protected by the chain's lock,
1611          * the same one where we'd set *(otheru->addr) contents,
1612          * as well as otheru->path and otheru->addr itself.
1613          *
1614          * Using smp_store_release() here to set newu->addr
1615          * is enough to make those stores, as well as stores
1616          * to newu->path visible to anyone who gets newu->addr
1617          * by smp_load_acquire().  IOW, the same guarantees
1618          * as for unix_sock instances bound in unix_bind() or
1619          * in unix_autobind().
1620          */
1621         if (otheru->path.dentry) {
1622                 path_get(&otheru->path);
1623                 newu->path = otheru->path;
1624         }
1625         refcount_inc(&otheru->addr->refcnt);
1626         smp_store_release(&newu->addr, otheru->addr);
1627
1628         /* Set credentials */
1629         copy_peercred(sk, other);
1630
1631         sock->state     = SS_CONNECTED;
1632         sk->sk_state    = TCP_ESTABLISHED;
1633         sock_hold(newsk);
1634
1635         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1636         unix_peer(sk)   = newsk;
1637
1638         unix_state_unlock(sk);
1639
1640         /* Queue the handshake skb and notify the listening sock */
1641         spin_lock(&other->sk_receive_queue.lock);
1642         __skb_queue_tail(&other->sk_receive_queue, skb);
1643         spin_unlock(&other->sk_receive_queue.lock);
1644         unix_state_unlock(other);
1645         other->sk_data_ready(other);
1646         sock_put(other);
1647         return 0;
1648
1649 out_unlock:
1650         if (other)
1651                 unix_state_unlock(other);
1652
1653 out:
1654         kfree_skb(skb);
1655         if (newsk)
1656                 unix_release_sock(newsk, 0);
1657         if (other)
1658                 sock_put(other);
1659         return err;
1660 }
1661
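/*
 * socketpair(2): wire two freshly created sockets directly to each
 * other.  No listener or handshake skb is involved; both ends are
 * simply marked established.
 */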
1662 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1663 {
1664         struct sock *ska = socka->sk, *skb = sockb->sk;
1665
1666         /* Join our sockets back to back */
1667         sock_hold(ska);
1668         sock_hold(skb);
1669         unix_peer(ska) = skb;
1670         unix_peer(skb) = ska;
1671         init_peercred(ska);
1672         init_peercred(skb);
1673
1674         ska->sk_state = TCP_ESTABLISHED;
1675         skb->sk_state = TCP_ESTABLISHED;
1676         socka->state  = SS_CONNECTED;
1677         sockb->state  = SS_CONNECTED;
1678         return 0;
1679 }
1680
1681 static void unix_sock_inherit_flags(const struct socket *old,
1682                                     struct socket *new)
1683 {
1684         if (test_bit(SOCK_PASSCRED, &old->flags))
1685                 set_bit(SOCK_PASSCRED, &new->flags);
1686         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1687                 set_bit(SOCK_PASSPIDFD, &new->flags);
1688         if (test_bit(SOCK_PASSSEC, &old->flags))
1689                 set_bit(SOCK_PASSSEC, &new->flags);
1690 }
1691
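/*
 * accept(2): each pending connection sits on the listener's receive
 * queue as one skb whose ->sk is the already-established peer sock
 * created in unix_stream_connect().  Dequeue one, wake any writer
 * waiting in unix_wait_for_peer(), and graft that sock onto newsock.
 */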
1692 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1693                        bool kern)
1694 {
1695         struct sock *sk = sock->sk;
1696         struct sk_buff *skb;
1697         struct sock *tsk;
1698         int err;
1699
1700         err = -EOPNOTSUPP;
1701         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1702                 goto out;
1703
1704         err = -EINVAL;
1705         if (sk->sk_state != TCP_LISTEN)
1706                 goto out;
1707
1708         /* If socket state is TCP_LISTEN it cannot change (for now...),
1709          * so that no locks are necessary.
1710          */
1711
1712         skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1713                                 &err);
1714         if (!skb) {
1715                 /* This means receive shutdown. */
1716                 if (err == 0)
1717                         err = -EINVAL;
1718                 goto out;
1719         }
1720
1721         tsk = skb->sk;
1722         skb_free_datagram(sk, skb);
1723         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1724
1725         /* attach accepted sock to socket */
1726         unix_state_lock(tsk);
1727         unix_update_edges(unix_sk(tsk));
1728         newsock->state = SS_CONNECTED;
1729         unix_sock_inherit_flags(sock, newsock);
1730         sock_graft(tsk, newsock);
1731         unix_state_unlock(tsk);
1732         return 0;
1733
1734 out:
1735         return err;
1736 }
1737
1738
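/*
 * getsockname(2)/getpeername(2).  An unbound socket reports only the
 * address family with an empty path; otherwise the bound (or autobound)
 * address is copied out and its length returned.
 */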
1739 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1740 {
1741         struct sock *sk = sock->sk;
1742         struct unix_address *addr;
1743         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1744         int err = 0;
1745
1746         if (peer) {
1747                 sk = unix_peer_get(sk);
1748
1749                 err = -ENOTCONN;
1750                 if (!sk)
1751                         goto out;
1752                 err = 0;
1753         } else {
1754                 sock_hold(sk);
1755         }
1756
1757         addr = smp_load_acquire(&unix_sk(sk)->addr);
1758         if (!addr) {
1759                 sunaddr->sun_family = AF_UNIX;
1760                 sunaddr->sun_path[0] = 0;
1761                 err = offsetof(struct sockaddr_un, sun_path);
1762         } else {
1763                 err = addr->len;
1764                 memcpy(sunaddr, addr->name, addr->len);
1765
1766                 if (peer)
1767                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1768                                                CGROUP_UNIX_GETPEERNAME);
1769                 else
1770                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1771                                                CGROUP_UNIX_GETSOCKNAME);
1772         }
1773         sock_put(sk);
1774 out:
1775         return err;
1776 }
1777
1778 /* The "user->unix_inflight" variable is protected by the garbage
1779  * collection lock, and we just read it locklessly here. If you go
1780  * over the limit, there might be a tiny race in actually noticing
1781  * it across threads. Tough.
1782  */
1783 static inline bool too_many_unix_fds(struct task_struct *p)
1784 {
1785         struct user_struct *user = current_user();
1786
1787         if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1788                 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1789         return false;
1790 }
1791
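/*
 * Attaching SCM_RIGHTS fds to an skb moves the file references from the
 * sender's scm cookie into the skb's control block so they stay pinned
 * while in flight; unix_prepare_fpl() additionally sets up the
 * garbage-collector bookkeeping for those in-flight files.
 */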
1792 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1793 {
1794         if (too_many_unix_fds(current))
1795                 return -ETOOMANYREFS;
1796
1797         UNIXCB(skb).fp = scm->fp;
1798         scm->fp = NULL;
1799
1800         if (unix_prepare_fpl(UNIXCB(skb).fp))
1801                 return -ENOMEM;
1802
1803         return 0;
1804 }
1805
1806 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1807 {
1808         scm->fp = UNIXCB(skb).fp;
1809         UNIXCB(skb).fp = NULL;
1810
1811         unix_destroy_fpl(scm->fp);
1812 }
1813
1814 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1815 {
1816         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1817 }
1818
1819 static void unix_destruct_scm(struct sk_buff *skb)
1820 {
1821         struct scm_cookie scm;
1822
1823         memset(&scm, 0, sizeof(scm));
1824         scm.pid  = UNIXCB(skb).pid;
1825         if (UNIXCB(skb).fp)
1826                 unix_detach_fds(&scm, skb);
1827
1828         /* Alas, this calls into the VFS */
1829         /* So what? fput() has been SMP-safe since last summer */
1830         scm_destroy(&scm);
1831         sock_wfree(skb);
1832 }
1833
1834 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1835 {
1836         int err = 0;
1837
1838         UNIXCB(skb).pid  = get_pid(scm->pid);
1839         UNIXCB(skb).uid = scm->creds.uid;
1840         UNIXCB(skb).gid = scm->creds.gid;
1841         UNIXCB(skb).fp = NULL;
1842         unix_get_secdata(scm, skb);
1843         if (scm->fp && send_fds)
1844                 err = unix_attach_fds(scm, skb);
1845
1846         skb->destructor = unix_destruct_scm;
1847         return err;
1848 }
1849
1850 static bool unix_passcred_enabled(const struct socket *sock,
1851                                   const struct sock *other)
1852 {
1853         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1854                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1855                !other->sk_socket ||
1856                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1857                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1858 }
1859
1860 /*
1861  * Some apps rely on write() giving SCM_CREDENTIALS.
1862  * We include credentials if the source or destination socket
1863  * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1864  */
1865 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1866                             const struct sock *other)
1867 {
1868         if (UNIXCB(skb).pid)
1869                 return;
1870         if (unix_passcred_enabled(sock, other)) {
1871                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1872                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1873         }
1874 }
1875
1876 static bool unix_skb_scm_eq(struct sk_buff *skb,
1877                             struct scm_cookie *scm)
1878 {
1879         return UNIXCB(skb).pid == scm->pid &&
1880                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1881                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1882                unix_secdata_eq(scm, skb);
1883 }
1884
1885 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1886 {
1887         struct scm_fp_list *fp = UNIXCB(skb).fp;
1888         struct unix_sock *u = unix_sk(sk);
1889
1890         if (unlikely(fp && fp->count)) {
1891                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1892                 unix_add_edges(fp, u);
1893         }
1894 }
1895
1896 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1897 {
1898         struct scm_fp_list *fp = UNIXCB(skb).fp;
1899         struct unix_sock *u = unix_sk(sk);
1900
1901         if (unlikely(fp && fp->count)) {
1902                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1903                 unix_del_edges(fp);
1904         }
1905 }
1906
1907 /*
1908  *      Send AF_UNIX data.
1909  */
1910
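/*
 * Datagram (and, via unix_seqpacket_sendmsg(), SEQPACKET) transmit path:
 * resolve the destination (the connected peer or an explicit address),
 * build one skb, and queue it straight onto the receiver's
 * sk_receive_queue, blocking or failing with -EAGAIN when the receiver's
 * backlog is full.
 */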
1911 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1912                               size_t len)
1913 {
1914         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1915         struct sock *sk = sock->sk, *other = NULL;
1916         struct unix_sock *u = unix_sk(sk);
1917         struct scm_cookie scm;
1918         struct sk_buff *skb;
1919         int data_len = 0;
1920         int sk_locked;
1921         long timeo;
1922         int err;
1923
1924         err = scm_send(sock, msg, &scm, false);
1925         if (err < 0)
1926                 return err;
1927
1928         wait_for_unix_gc(scm.fp);
1929
1930         err = -EOPNOTSUPP;
1931         if (msg->msg_flags&MSG_OOB)
1932                 goto out;
1933
1934         if (msg->msg_namelen) {
1935                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1936                 if (err)
1937                         goto out;
1938
1939                 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1940                                                             msg->msg_name,
1941                                                             &msg->msg_namelen,
1942                                                             NULL);
1943                 if (err)
1944                         goto out;
1945         } else {
1946                 sunaddr = NULL;
1947                 err = -ENOTCONN;
1948                 other = unix_peer_get(sk);
1949                 if (!other)
1950                         goto out;
1951         }
1952
1953         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1954              test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1955                 err = unix_autobind(sk);
1956                 if (err)
1957                         goto out;
1958         }
1959
1960         err = -EMSGSIZE;
1961         if (len > sk->sk_sndbuf - 32)
1962                 goto out;
1963
1964         if (len > SKB_MAX_ALLOC) {
1965                 data_len = min_t(size_t,
1966                                  len - SKB_MAX_ALLOC,
1967                                  MAX_SKB_FRAGS * PAGE_SIZE);
1968                 data_len = PAGE_ALIGN(data_len);
1969
1970                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1971         }
1972
1973         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1974                                    msg->msg_flags & MSG_DONTWAIT, &err,
1975                                    PAGE_ALLOC_COSTLY_ORDER);
1976         if (skb == NULL)
1977                 goto out;
1978
1979         err = unix_scm_to_skb(&scm, skb, true);
1980         if (err < 0)
1981                 goto out_free;
1982
1983         skb_put(skb, len - data_len);
1984         skb->data_len = data_len;
1985         skb->len = len;
1986         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1987         if (err)
1988                 goto out_free;
1989
1990         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1991
1992 restart:
1993         if (!other) {
1994                 err = -ECONNRESET;
1995                 if (sunaddr == NULL)
1996                         goto out_free;
1997
1998                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1999                                         sk->sk_type);
2000                 if (IS_ERR(other)) {
2001                         err = PTR_ERR(other);
2002                         other = NULL;
2003                         goto out_free;
2004                 }
2005         }
2006
2007         if (sk_filter(other, skb) < 0) {
2008                 /* Toss the packet but do not return any error to the sender */
2009                 err = len;
2010                 goto out_free;
2011         }
2012
2013         sk_locked = 0;
2014         unix_state_lock(other);
2015 restart_locked:
2016         err = -EPERM;
2017         if (!unix_may_send(sk, other))
2018                 goto out_unlock;
2019
2020         if (unlikely(sock_flag(other, SOCK_DEAD))) {
2021                 /*
2022                  *      Check with 1003.1g - what should a
2023                  *      datagram send return when the peer has died?
2024                  */
2025                 unix_state_unlock(other);
2026                 sock_put(other);
2027
2028                 if (!sk_locked)
2029                         unix_state_lock(sk);
2030
2031                 err = 0;
2032                 if (sk->sk_type == SOCK_SEQPACKET) {
2033                         /* We get here only when racing with unix_release_sock(),
2034                          * which is clearing @other. Unlike SOCK_DGRAM, never
2035                          * change the state to TCP_CLOSE.
2036                          */
2037                         unix_state_unlock(sk);
2038                         err = -EPIPE;
2039                 } else if (unix_peer(sk) == other) {
2040                         unix_peer(sk) = NULL;
2041                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2042
2043                         sk->sk_state = TCP_CLOSE;
2044                         unix_state_unlock(sk);
2045
2046                         unix_dgram_disconnected(sk, other);
2047                         sock_put(other);
2048                         err = -ECONNREFUSED;
2049                 } else {
2050                         unix_state_unlock(sk);
2051                 }
2052
2053                 other = NULL;
2054                 if (err)
2055                         goto out_free;
2056                 goto restart;
2057         }
2058
2059         err = -EPIPE;
2060         if (other->sk_shutdown & RCV_SHUTDOWN)
2061                 goto out_unlock;
2062
2063         if (sk->sk_type != SOCK_SEQPACKET) {
2064                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2065                 if (err)
2066                         goto out_unlock;
2067         }
2068
2069         /* other == sk && unix_peer(other) != sk can happen if
2070          * - unix_peer(sk) == NULL and the destination address is bound to sk, or
2071          * - unix_peer(sk) == sk at lookup time but it disconnected before we took the lock
2072          */
2073         if (other != sk &&
2074             unlikely(unix_peer(other) != sk &&
2075             unix_recvq_full_lockless(other))) {
2076                 if (timeo) {
2077                         timeo = unix_wait_for_peer(other, timeo);
2078
2079                         err = sock_intr_errno(timeo);
2080                         if (signal_pending(current))
2081                                 goto out_free;
2082
2083                         goto restart;
2084                 }
2085
2086                 if (!sk_locked) {
2087                         unix_state_unlock(other);
2088                         unix_state_double_lock(sk, other);
2089                 }
2090
2091                 if (unix_peer(sk) != other ||
2092                     unix_dgram_peer_wake_me(sk, other)) {
2093                         err = -EAGAIN;
2094                         sk_locked = 1;
2095                         goto out_unlock;
2096                 }
2097
2098                 if (!sk_locked) {
2099                         sk_locked = 1;
2100                         goto restart_locked;
2101                 }
2102         }
2103
2104         if (unlikely(sk_locked))
2105                 unix_state_unlock(sk);
2106
2107         if (sock_flag(other, SOCK_RCVTSTAMP))
2108                 __net_timestamp(skb);
2109         maybe_add_creds(skb, sock, other);
2110         scm_stat_add(other, skb);
2111         skb_queue_tail(&other->sk_receive_queue, skb);
2112         unix_state_unlock(other);
2113         other->sk_data_ready(other);
2114         sock_put(other);
2115         scm_destroy(&scm);
2116         return len;
2117
2118 out_unlock:
2119         if (sk_locked)
2120                 unix_state_unlock(sk);
2121         unix_state_unlock(other);
2122 out_free:
2123         kfree_skb(skb);
2124 out:
2125         if (other)
2126                 sock_put(other);
2127         scm_destroy(&scm);
2128         return err;
2129 }
2130
2131 /* We use paged skbs for stream sockets, limiting occupancy to 32768
2132  * bytes with a minimum of a full page.
2133  */
2134 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2135
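/*
 * MSG_OOB emulation for stream sockets: the single out-of-band byte is
 * carried in its own one-byte skb, queued like ordinary data but also
 * recorded in the receiver's ->oob_skb so the read side can find (or
 * skip) it, and SIGURG is sent to the receiver.
 */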
2136 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2137 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2138                      struct scm_cookie *scm, bool fds_sent)
2139 {
2140         struct unix_sock *ousk = unix_sk(other);
2141         struct sk_buff *skb;
2142         int err = 0;
2143
2144         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2145
2146         if (!skb)
2147                 return err;
2148
2149         err = unix_scm_to_skb(scm, skb, !fds_sent);
2150         if (err < 0) {
2151                 kfree_skb(skb);
2152                 return err;
2153         }
2154         skb_put(skb, 1);
2155         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2156
2157         if (err) {
2158                 kfree_skb(skb);
2159                 return err;
2160         }
2161
2162         unix_state_lock(other);
2163
2164         if (sock_flag(other, SOCK_DEAD) ||
2165             (other->sk_shutdown & RCV_SHUTDOWN)) {
2166                 unix_state_unlock(other);
2167                 kfree_skb(skb);
2168                 return -EPIPE;
2169         }
2170
2171         maybe_add_creds(skb, sock, other);
2172         skb_get(skb);
2173
2174         if (ousk->oob_skb)
2175                 consume_skb(ousk->oob_skb);
2176
2177         WRITE_ONCE(ousk->oob_skb, skb);
2178
2179         scm_stat_add(other, skb);
2180         skb_queue_tail(&other->sk_receive_queue, skb);
2181         sk_send_sigurg(other);
2182         unix_state_unlock(other);
2183         other->sk_data_ready(other);
2184
2185         return err;
2186 }
2187 #endif
2188
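/*
 * Stream transmit path: chop the payload into skbs (each at most roughly
 * half of sk_sndbuf so a couple of messages can be in flight), attach any
 * fds/credentials to the first one, and append each skb to the peer's
 * receive queue, waking the reader as we go.
 */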
2189 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2190                                size_t len)
2191 {
2192         struct sock *sk = sock->sk;
2193         struct sock *other = NULL;
2194         int err, size;
2195         struct sk_buff *skb;
2196         int sent = 0;
2197         struct scm_cookie scm;
2198         bool fds_sent = false;
2199         int data_len;
2200
2201         err = scm_send(sock, msg, &scm, false);
2202         if (err < 0)
2203                 return err;
2204
2205         wait_for_unix_gc(scm.fp);
2206
2207         err = -EOPNOTSUPP;
2208         if (msg->msg_flags & MSG_OOB) {
2209 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2210                 if (len)
2211                         len--;
2212                 else
2213 #endif
2214                         goto out_err;
2215         }
2216
2217         if (msg->msg_namelen) {
2218                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2219                 goto out_err;
2220         } else {
2221                 err = -ENOTCONN;
2222                 other = unix_peer(sk);
2223                 if (!other)
2224                         goto out_err;
2225         }
2226
2227         if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2228                 goto pipe_err;
2229
2230         while (sent < len) {
2231                 size = len - sent;
2232
2233                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2234                         skb = sock_alloc_send_pskb(sk, 0, 0,
2235                                                    msg->msg_flags & MSG_DONTWAIT,
2236                                                    &err, 0);
2237                 } else {
2238                         /* Keep two messages in the pipe so it schedules better */
2239                         size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2240
2241                         /* allow fallback to order-0 allocations */
2242                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2243
2244                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2245
2246                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2247
2248                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2249                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2250                                                    get_order(UNIX_SKB_FRAGS_SZ));
2251                 }
2252                 if (!skb)
2253                         goto out_err;
2254
2255                 /* Only send the fds in the first buffer */
2256                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2257                 if (err < 0) {
2258                         kfree_skb(skb);
2259                         goto out_err;
2260                 }
2261                 fds_sent = true;
2262
2263                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2264                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2265                                                    sk->sk_allocation);
2266                         if (err < 0) {
2267                                 kfree_skb(skb);
2268                                 goto out_err;
2269                         }
2270                         size = err;
2271                         refcount_add(size, &sk->sk_wmem_alloc);
2272                 } else {
2273                         skb_put(skb, size - data_len);
2274                         skb->data_len = data_len;
2275                         skb->len = size;
2276                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2277                         if (err) {
2278                                 kfree_skb(skb);
2279                                 goto out_err;
2280                         }
2281                 }
2282
2283                 unix_state_lock(other);
2284
2285                 if (sock_flag(other, SOCK_DEAD) ||
2286                     (other->sk_shutdown & RCV_SHUTDOWN))
2287                         goto pipe_err_free;
2288
2289                 maybe_add_creds(skb, sock, other);
2290                 scm_stat_add(other, skb);
2291                 skb_queue_tail(&other->sk_receive_queue, skb);
2292                 unix_state_unlock(other);
2293                 other->sk_data_ready(other);
2294                 sent += size;
2295         }
2296
2297 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2298         if (msg->msg_flags & MSG_OOB) {
2299                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2300                 if (err)
2301                         goto out_err;
2302                 sent++;
2303         }
2304 #endif
2305
2306         scm_destroy(&scm);
2307
2308         return sent;
2309
2310 pipe_err_free:
2311         unix_state_unlock(other);
2312         kfree_skb(skb);
2313 pipe_err:
2314         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2315                 send_sig(SIGPIPE, current, 0);
2316         err = -EPIPE;
2317 out_err:
2318         scm_destroy(&scm);
2319         return sent ? : err;
2320 }
2321
2322 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2323                                   size_t len)
2324 {
2325         int err;
2326         struct sock *sk = sock->sk;
2327
2328         err = sock_error(sk);
2329         if (err)
2330                 return err;
2331
2332         if (sk->sk_state != TCP_ESTABLISHED)
2333                 return -ENOTCONN;
2334
2335         if (msg->msg_namelen)
2336                 msg->msg_namelen = 0;
2337
2338         return unix_dgram_sendmsg(sock, msg, len);
2339 }
2340
2341 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2342                                   size_t size, int flags)
2343 {
2344         struct sock *sk = sock->sk;
2345
2346         if (sk->sk_state != TCP_ESTABLISHED)
2347                 return -ENOTCONN;
2348
2349         return unix_dgram_recvmsg(sock, msg, size, flags);
2350 }
2351
2352 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2353 {
2354         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2355
2356         if (addr) {
2357                 msg->msg_namelen = addr->len;
2358                 memcpy(msg->msg_name, addr->name, addr->len);
2359         }
2360 }
2361
2362 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2363                          int flags)
2364 {
2365         struct scm_cookie scm;
2366         struct socket *sock = sk->sk_socket;
2367         struct unix_sock *u = unix_sk(sk);
2368         struct sk_buff *skb, *last;
2369         long timeo;
2370         int skip;
2371         int err;
2372
2373         err = -EOPNOTSUPP;
2374         if (flags&MSG_OOB)
2375                 goto out;
2376
2377         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2378
2379         do {
2380                 mutex_lock(&u->iolock);
2381
2382                 skip = sk_peek_offset(sk, flags);
2383                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2384                                               &skip, &err, &last);
2385                 if (skb) {
2386                         if (!(flags & MSG_PEEK))
2387                                 scm_stat_del(sk, skb);
2388                         break;
2389                 }
2390
2391                 mutex_unlock(&u->iolock);
2392
2393                 if (err != -EAGAIN)
2394                         break;
2395         } while (timeo &&
2396                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2397                                               &err, &timeo, last));
2398
2399         if (!skb) { /* implies iolock unlocked */
2400                 unix_state_lock(sk);
2401                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2402                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2403                     (sk->sk_shutdown & RCV_SHUTDOWN))
2404                         err = 0;
2405                 unix_state_unlock(sk);
2406                 goto out;
2407         }
2408
2409         if (wq_has_sleeper(&u->peer_wait))
2410                 wake_up_interruptible_sync_poll(&u->peer_wait,
2411                                                 EPOLLOUT | EPOLLWRNORM |
2412                                                 EPOLLWRBAND);
2413
2414         if (msg->msg_name) {
2415                 unix_copy_addr(msg, skb->sk);
2416
2417                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2418                                                       msg->msg_name,
2419                                                       &msg->msg_namelen);
2420         }
2421
2422         if (size > skb->len - skip)
2423                 size = skb->len - skip;
2424         else if (size < skb->len - skip)
2425                 msg->msg_flags |= MSG_TRUNC;
2426
2427         err = skb_copy_datagram_msg(skb, skip, msg, size);
2428         if (err)
2429                 goto out_free;
2430
2431         if (sock_flag(sk, SOCK_RCVTSTAMP))
2432                 __sock_recv_timestamp(msg, sk, skb);
2433
2434         memset(&scm, 0, sizeof(scm));
2435
2436         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2437         unix_set_secdata(&scm, skb);
2438
2439         if (!(flags & MSG_PEEK)) {
2440                 if (UNIXCB(skb).fp)
2441                         unix_detach_fds(&scm, skb);
2442
2443                 sk_peek_offset_bwd(sk, skb->len);
2444         } else {
2445                 /* It is questionable: on PEEK we could:
2446                    - not return fds - good, but too simple 8)
2447                    - return fds, but not return them again on read (the old
2448                      strategy, apparently wrong)
2449                    - clone fds (chosen for now, as it is the most universal
2450                      solution)
2451
2452                    POSIX 1003.1g does not actually define this clearly
2453                    at all. Then again, POSIX 1003.1g doesn't define a lot
2454                    of things clearly!
2455
2456                 */
2457
2458                 sk_peek_offset_fwd(sk, size);
2459
2460                 if (UNIXCB(skb).fp)
2461                         unix_peek_fds(&scm, skb);
2462         }
2463         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2464
2465         scm_recv_unix(sock, msg, &scm, flags);
2466
2467 out_free:
2468         skb_free_datagram(sk, skb);
2469         mutex_unlock(&u->iolock);
2470 out:
2471         return err;
2472 }
2473
2474 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2475                               int flags)
2476 {
2477         struct sock *sk = sock->sk;
2478
2479 #ifdef CONFIG_BPF_SYSCALL
2480         const struct proto *prot = READ_ONCE(sk->sk_prot);
2481
2482         if (prot != &unix_dgram_proto)
2483                 return prot->recvmsg(sk, msg, size, flags, NULL);
2484 #endif
2485         return __unix_dgram_recvmsg(sk, msg, size, flags);
2486 }
2487
2488 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2489 {
2490         struct unix_sock *u = unix_sk(sk);
2491         struct sk_buff *skb;
2492         int err;
2493
2494         mutex_lock(&u->iolock);
2495         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2496         mutex_unlock(&u->iolock);
2497         if (!skb)
2498                 return err;
2499
2500         return recv_actor(sk, skb);
2501 }
2502
2503 /*
2504  *      Sleep until more data has arrived. But check for races...
2505  */
2506 static long unix_stream_data_wait(struct sock *sk, long timeo,
2507                                   struct sk_buff *last, unsigned int last_len,
2508                                   bool freezable)
2509 {
2510         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2511         struct sk_buff *tail;
2512         DEFINE_WAIT(wait);
2513
2514         unix_state_lock(sk);
2515
2516         for (;;) {
2517                 prepare_to_wait(sk_sleep(sk), &wait, state);
2518
2519                 tail = skb_peek_tail(&sk->sk_receive_queue);
2520                 if (tail != last ||
2521                     (tail && tail->len != last_len) ||
2522                     sk->sk_err ||
2523                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2524                     signal_pending(current) ||
2525                     !timeo)
2526                         break;
2527
2528                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2529                 unix_state_unlock(sk);
2530                 timeo = schedule_timeout(timeo);
2531                 unix_state_lock(sk);
2532
2533                 if (sock_flag(sk, SOCK_DEAD))
2534                         break;
2535
2536                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2537         }
2538
2539         finish_wait(sk_sleep(sk), &wait);
2540         unix_state_unlock(sk);
2541         return timeo;
2542 }
2543
2544 static unsigned int unix_skb_len(const struct sk_buff *skb)
2545 {
2546         return skb->len - UNIXCB(skb).consumed;
2547 }
2548
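/*
 * Common state for the generic stream receive loop.  The same loop
 * serves recvmsg(2) and splice(2); they differ only in the recv_actor
 * callback that copies a chunk of an skb to its destination (user
 * buffer or pipe).
 */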
2549 struct unix_stream_read_state {
2550         int (*recv_actor)(struct sk_buff *, int, int,
2551                           struct unix_stream_read_state *);
2552         struct socket *socket;
2553         struct msghdr *msg;
2554         struct pipe_inode_info *pipe;
2555         size_t size;
2556         int flags;
2557         unsigned int splice_flags;
2558 };
2559
2560 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2561 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2562 {
2563         struct socket *sock = state->socket;
2564         struct sock *sk = sock->sk;
2565         struct unix_sock *u = unix_sk(sk);
2566         int chunk = 1;
2567         struct sk_buff *oob_skb;
2568
2569         mutex_lock(&u->iolock);
2570         unix_state_lock(sk);
2571
2572         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2573                 unix_state_unlock(sk);
2574                 mutex_unlock(&u->iolock);
2575                 return -EINVAL;
2576         }
2577
2578         oob_skb = u->oob_skb;
2579
2580         if (!(state->flags & MSG_PEEK))
2581                 WRITE_ONCE(u->oob_skb, NULL);
2582         else
2583                 skb_get(oob_skb);
2584         unix_state_unlock(sk);
2585
2586         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2587
2588         if (!(state->flags & MSG_PEEK))
2589                 UNIXCB(oob_skb).consumed += 1;
2590
2591         consume_skb(oob_skb);
2592
2593         mutex_unlock(&u->iolock);
2594
2595         if (chunk < 0)
2596                 return -EFAULT;
2597
2598         state->msg->msg_flags |= MSG_OOB;
2599         return 1;
2600 }
2601
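/*
 * Decide what to do when the skb at the head of the queue may be the
 * out-of-band one: deliver it inline (SOCK_URGINLINE), stop the copy
 * before it, or unlink and drop it so the normal data behind it becomes
 * readable.
 */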
2602 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2603                                   int flags, int copied)
2604 {
2605         struct unix_sock *u = unix_sk(sk);
2606
2607         if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2608                 skb_unlink(skb, &sk->sk_receive_queue);
2609                 consume_skb(skb);
2610                 skb = NULL;
2611         } else {
2612                 if (skb == u->oob_skb) {
2613                         if (copied) {
2614                                 skb = NULL;
2615                         } else if (sock_flag(sk, SOCK_URGINLINE)) {
2616                                 if (!(flags & MSG_PEEK)) {
2617                                         WRITE_ONCE(u->oob_skb, NULL);
2618                                         consume_skb(skb);
2619                                 }
2620                         } else if (flags & MSG_PEEK) {
2621                                 skb = NULL;
2622                         } else {
2623                                 skb_unlink(skb, &sk->sk_receive_queue);
2624                                 WRITE_ONCE(u->oob_skb, NULL);
2625                                 if (!WARN_ON_ONCE(skb_unref(skb)))
2626                                         kfree_skb(skb);
2627                                 skb = skb_peek(&sk->sk_receive_queue);
2628                         }
2629                 }
2630         }
2631         return skb;
2632 }
2633 #endif
2634
2635 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2636 {
2637         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2638                 return -ENOTCONN;
2639
2640         return unix_read_skb(sk, recv_actor);
2641 }
2642
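/*
 * The core stream receive loop: walk the receive queue under u->iolock,
 * never mixing data from writers with different credentials into one
 * read, honouring MSG_PEEK and peek offsets, and sleeping in
 * unix_stream_data_wait() when the queue runs dry before the low-water
 * mark (SO_RCVLOWAT) is met.
 */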
2643 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2644                                     bool freezable)
2645 {
2646         struct scm_cookie scm;
2647         struct socket *sock = state->socket;
2648         struct sock *sk = sock->sk;
2649         struct unix_sock *u = unix_sk(sk);
2650         int copied = 0;
2651         int flags = state->flags;
2652         int noblock = flags & MSG_DONTWAIT;
2653         bool check_creds = false;
2654         int target;
2655         int err = 0;
2656         long timeo;
2657         int skip;
2658         size_t size = state->size;
2659         unsigned int last_len;
2660
2661         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2662                 err = -EINVAL;
2663                 goto out;
2664         }
2665
2666         if (unlikely(flags & MSG_OOB)) {
2667                 err = -EOPNOTSUPP;
2668 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2669                 err = unix_stream_recv_urg(state);
2670 #endif
2671                 goto out;
2672         }
2673
2674         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2675         timeo = sock_rcvtimeo(sk, noblock);
2676
2677         memset(&scm, 0, sizeof(scm));
2678
2679         /* Lock the socket to prevent the queue from being reordered
2680          * while we sleep in memcpy_to_msg()
2681          */
2682         mutex_lock(&u->iolock);
2683
2684         skip = max(sk_peek_offset(sk, flags), 0);
2685
2686         do {
2687                 int chunk;
2688                 bool drop_skb;
2689                 struct sk_buff *skb, *last;
2690
2691 redo:
2692                 unix_state_lock(sk);
2693                 if (sock_flag(sk, SOCK_DEAD)) {
2694                         err = -ECONNRESET;
2695                         goto unlock;
2696                 }
2697                 last = skb = skb_peek(&sk->sk_receive_queue);
2698                 last_len = last ? last->len : 0;
2699
2700 again:
2701 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2702                 if (skb) {
2703                         skb = manage_oob(skb, sk, flags, copied);
2704                         if (!skb && copied) {
2705                                 unix_state_unlock(sk);
2706                                 break;
2707                         }
2708                 }
2709 #endif
2710                 if (skb == NULL) {
2711                         if (copied >= target)
2712                                 goto unlock;
2713
2714                         /*
2715                          *      POSIX 1003.1g mandates this order.
2716                          */
2717
2718                         err = sock_error(sk);
2719                         if (err)
2720                                 goto unlock;
2721                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2722                                 goto unlock;
2723
2724                         unix_state_unlock(sk);
2725                         if (!timeo) {
2726                                 err = -EAGAIN;
2727                                 break;
2728                         }
2729
2730                         mutex_unlock(&u->iolock);
2731
2732                         timeo = unix_stream_data_wait(sk, timeo, last,
2733                                                       last_len, freezable);
2734
2735                         if (signal_pending(current)) {
2736                                 err = sock_intr_errno(timeo);
2737                                 scm_destroy(&scm);
2738                                 goto out;
2739                         }
2740
2741                         mutex_lock(&u->iolock);
2742                         goto redo;
2743 unlock:
2744                         unix_state_unlock(sk);
2745                         break;
2746                 }
2747
2748                 while (skip >= unix_skb_len(skb)) {
2749                         skip -= unix_skb_len(skb);
2750                         last = skb;
2751                         last_len = skb->len;
2752                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2753                         if (!skb)
2754                                 goto again;
2755                 }
2756
2757                 unix_state_unlock(sk);
2758
2759                 if (check_creds) {
2760                         /* Never glue messages from different writers */
2761                         if (!unix_skb_scm_eq(skb, &scm))
2762                                 break;
2763                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2764                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2765                         /* Copy credentials */
2766                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2767                         unix_set_secdata(&scm, skb);
2768                         check_creds = true;
2769                 }
2770
2771                 /* Copy address just once */
2772                 if (state->msg && state->msg->msg_name) {
2773                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2774                                          state->msg->msg_name);
2775                         unix_copy_addr(state->msg, skb->sk);
2776
2777                         BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2778                                                               state->msg->msg_name,
2779                                                               &state->msg->msg_namelen);
2780
2781                         sunaddr = NULL;
2782                 }
2783
2784                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2785                 skb_get(skb);
2786                 chunk = state->recv_actor(skb, skip, chunk, state);
2787                 drop_skb = !unix_skb_len(skb);
2788                 /* skb is only safe to use if !drop_skb */
2789                 consume_skb(skb);
2790                 if (chunk < 0) {
2791                         if (copied == 0)
2792                                 copied = -EFAULT;
2793                         break;
2794                 }
2795                 copied += chunk;
2796                 size -= chunk;
2797
2798                 if (drop_skb) {
2799                         /* the skb was touched by a concurrent reader;
2800                          * we should not expect anything from this skb
2801                          * anymore and assume it is invalid - we can be
2802                          * sure it was dropped from the socket queue
2803                          *
2804                          * let's report a short read
2805                          */
2806                         err = 0;
2807                         break;
2808                 }
2809
2810                 /* Mark read part of skb as used */
2811                 if (!(flags & MSG_PEEK)) {
2812                         UNIXCB(skb).consumed += chunk;
2813
2814                         sk_peek_offset_bwd(sk, chunk);
2815
2816                         if (UNIXCB(skb).fp) {
2817                                 scm_stat_del(sk, skb);
2818                                 unix_detach_fds(&scm, skb);
2819                         }
2820
2821                         if (unix_skb_len(skb))
2822                                 break;
2823
2824                         skb_unlink(skb, &sk->sk_receive_queue);
2825                         consume_skb(skb);
2826
2827                         if (scm.fp)
2828                                 break;
2829                 } else {
2830                         /* It is questionable, see note in unix_dgram_recvmsg.
2831                          */
2832                         if (UNIXCB(skb).fp)
2833                                 unix_peek_fds(&scm, skb);
2834
2835                         sk_peek_offset_fwd(sk, chunk);
2836
2837                         if (UNIXCB(skb).fp)
2838                                 break;
2839
2840                         skip = 0;
2841                         last = skb;
2842                         last_len = skb->len;
2843                         unix_state_lock(sk);
2844                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2845                         if (skb)
2846                                 goto again;
2847                         unix_state_unlock(sk);
2848                         break;
2849                 }
2850         } while (size);
2851
2852         mutex_unlock(&u->iolock);
2853         if (state->msg)
2854                 scm_recv_unix(sock, state->msg, &scm, flags);
2855         else
2856                 scm_destroy(&scm);
2857 out:
2858         return copied ? : err;
2859 }
2860
2861 static int unix_stream_read_actor(struct sk_buff *skb,
2862                                   int skip, int chunk,
2863                                   struct unix_stream_read_state *state)
2864 {
2865         int ret;
2866
2867         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2868                                     state->msg, chunk);
2869         return ret ?: chunk;
2870 }
2871
2872 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2873                           size_t size, int flags)
2874 {
2875         struct unix_stream_read_state state = {
2876                 .recv_actor = unix_stream_read_actor,
2877                 .socket = sk->sk_socket,
2878                 .msg = msg,
2879                 .size = size,
2880                 .flags = flags
2881         };
2882
2883         return unix_stream_read_generic(&state, true);
2884 }
2885
2886 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2887                                size_t size, int flags)
2888 {
2889         struct unix_stream_read_state state = {
2890                 .recv_actor = unix_stream_read_actor,
2891                 .socket = sock,
2892                 .msg = msg,
2893                 .size = size,
2894                 .flags = flags
2895         };
2896
2897 #ifdef CONFIG_BPF_SYSCALL
2898         struct sock *sk = sock->sk;
2899         const struct proto *prot = READ_ONCE(sk->sk_prot);
2900
2901         if (prot != &unix_stream_proto)
2902                 return prot->recvmsg(sk, msg, size, flags, NULL);
2903 #endif
2904         return unix_stream_read_generic(&state, true);
2905 }
2906
2907 static int unix_stream_splice_actor(struct sk_buff *skb,
2908                                     int skip, int chunk,
2909                                     struct unix_stream_read_state *state)
2910 {
2911         return skb_splice_bits(skb, state->socket->sk,
2912                                UNIXCB(skb).consumed + skip,
2913                                state->pipe, chunk, state->splice_flags);
2914 }
2915
2916 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2917                                        struct pipe_inode_info *pipe,
2918                                        size_t size, unsigned int flags)
2919 {
2920         struct unix_stream_read_state state = {
2921                 .recv_actor = unix_stream_splice_actor,
2922                 .socket = sock,
2923                 .pipe = pipe,
2924                 .size = size,
2925                 .splice_flags = flags,
2926         };
2927
2928         if (unlikely(*ppos))
2929                 return -ESPIPE;
2930
2931         if (sock->file->f_flags & O_NONBLOCK ||
2932             flags & SPLICE_F_NONBLOCK)
2933                 state.flags = MSG_DONTWAIT;
2934
2935         return unix_stream_read_generic(&state, false);
2936 }
2937
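/*
 * shutdown(2).  Record the shutdown bits on this socket and, for
 * connection-oriented sockets, mirror them onto the peer (our SHUT_WR is
 * the peer's receive shutdown and vice versa) so the other end wakes up
 * and sees EOF/EPIPE.
 */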
2938 static int unix_shutdown(struct socket *sock, int mode)
2939 {
2940         struct sock *sk = sock->sk;
2941         struct sock *other;
2942
2943         if (mode < SHUT_RD || mode > SHUT_RDWR)
2944                 return -EINVAL;
2945         /* This maps:
2946          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2947          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2948          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2949          */
2950         ++mode;
2951
2952         unix_state_lock(sk);
2953         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2954         other = unix_peer(sk);
2955         if (other)
2956                 sock_hold(other);
2957         unix_state_unlock(sk);
2958         sk->sk_state_change(sk);
2959
2960         if (other &&
2961                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2962
2963                 int peer_mode = 0;
2964                 const struct proto *prot = READ_ONCE(other->sk_prot);
2965
2966                 if (prot->unhash)
2967                         prot->unhash(other);
2968                 if (mode&RCV_SHUTDOWN)
2969                         peer_mode |= SEND_SHUTDOWN;
2970                 if (mode&SEND_SHUTDOWN)
2971                         peer_mode |= RCV_SHUTDOWN;
2972                 unix_state_lock(other);
2973                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2974                 unix_state_unlock(other);
2975                 other->sk_state_change(other);
2976                 if (peer_mode == SHUTDOWN_MASK)
2977                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2978                 else if (peer_mode & RCV_SHUTDOWN)
2979                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2980         }
2981         if (other)
2982                 sock_put(other);
2983
2984         return 0;
2985 }
2986
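/*
 * SIOCINQ: for stream/seqpacket sockets report the total number of
 * unread bytes queued; for datagram sockets report the size of the next
 * datagram only.
 */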
2987 long unix_inq_len(struct sock *sk)
2988 {
2989         struct sk_buff *skb;
2990         long amount = 0;
2991
2992         if (sk->sk_state == TCP_LISTEN)
2993                 return -EINVAL;
2994
2995         spin_lock(&sk->sk_receive_queue.lock);
2996         if (sk->sk_type == SOCK_STREAM ||
2997             sk->sk_type == SOCK_SEQPACKET) {
2998                 skb_queue_walk(&sk->sk_receive_queue, skb)
2999                         amount += unix_skb_len(skb);
3000         } else {
3001                 skb = skb_peek(&sk->sk_receive_queue);
3002                 if (skb)
3003                         amount = skb->len;
3004         }
3005         spin_unlock(&sk->sk_receive_queue.lock);
3006
3007         return amount;
3008 }
3009 EXPORT_SYMBOL_GPL(unix_inq_len);
3010
3011 long unix_outq_len(struct sock *sk)
3012 {
3013         return sk_wmem_alloc_get(sk);
3014 }
3015 EXPORT_SYMBOL_GPL(unix_outq_len);
3016
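/*
 * SIOCUNIXFILE: open the filesystem object this socket is bound to and
 * hand it back as an O_PATH file descriptor.  Requires CAP_NET_ADMIN in
 * the socket's network namespace and a socket bound to a real path.
 *
 * A minimal userspace sketch (hypothetical variable names):
 *
 *	int pfd = ioctl(sock_fd, SIOCUNIXFILE);
 *	if (pfd >= 0)
 *		fstatat(pfd, "", &st, AT_EMPTY_PATH);
 */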
3017 static int unix_open_file(struct sock *sk)
3018 {
3019         struct path path;
3020         struct file *f;
3021         int fd;
3022
3023         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3024                 return -EPERM;
3025
3026         if (!smp_load_acquire(&unix_sk(sk)->addr))
3027                 return -ENOENT;
3028
3029         path = unix_sk(sk)->path;
3030         if (!path.dentry)
3031                 return -ENOENT;
3032
3033         path_get(&path);
3034
3035         fd = get_unused_fd_flags(O_CLOEXEC);
3036         if (fd < 0)
3037                 goto out;
3038
3039         f = dentry_open(&path, O_PATH, current_cred());
3040         if (IS_ERR(f)) {
3041                 put_unused_fd(fd);
3042                 fd = PTR_ERR(f);
3043                 goto out;
3044         }
3045
3046         fd_install(fd, f);
3047 out:
3048         path_put(&path);
3049
3050         return fd;
3051 }
3052
3053 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3054 {
3055         struct sock *sk = sock->sk;
3056         long amount = 0;
3057         int err;
3058
3059         switch (cmd) {
3060         case SIOCOUTQ:
3061                 amount = unix_outq_len(sk);
3062                 err = put_user(amount, (int __user *)arg);
3063                 break;
3064         case SIOCINQ:
3065                 amount = unix_inq_len(sk);
3066                 if (amount < 0)
3067                         err = amount;
3068                 else
3069                         err = put_user(amount, (int __user *)arg);
3070                 break;
3071         case SIOCUNIXFILE:
3072                 err = unix_open_file(sk);
3073                 break;
3074 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3075         case SIOCATMARK:
3076                 {
3077                         struct sk_buff *skb;
3078                         int answ = 0;
3079
3080                         skb = skb_peek(&sk->sk_receive_queue);
3081                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3082                                 answ = 1;
3083                         err = put_user(answ, (int __user *)arg);
3084                 }
3085                 break;
3086 #endif
3087         default:
3088                 err = -ENOIOCTLCMD;
3089                 break;
3090         }
3091         return err;
3092 }
3093
3094 #ifdef CONFIG_COMPAT
3095 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3096 {
3097         return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3098 }
3099 #endif
3100
3101 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3102 {
3103         struct sock *sk = sock->sk;
3104         __poll_t mask;
3105         u8 shutdown;
3106
3107         sock_poll_wait(file, sock, wait);
3108         mask = 0;
3109         shutdown = READ_ONCE(sk->sk_shutdown);
3110
3111         /* exceptional events? */
3112         if (READ_ONCE(sk->sk_err))
3113                 mask |= EPOLLERR;
3114         if (shutdown == SHUTDOWN_MASK)
3115                 mask |= EPOLLHUP;
3116         if (shutdown & RCV_SHUTDOWN)
3117                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3118
3119         /* readable? */
3120         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3121                 mask |= EPOLLIN | EPOLLRDNORM;
3122         if (sk_is_readable(sk))
3123                 mask |= EPOLLIN | EPOLLRDNORM;
3124 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3125         if (READ_ONCE(unix_sk(sk)->oob_skb))
3126                 mask |= EPOLLPRI;
3127 #endif
3128
3129         /* Connection-based sockets need to check for termination and startup */
3130         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3131             sk->sk_state == TCP_CLOSE)
3132                 mask |= EPOLLHUP;
3133
3134         /*
3135          * We also report the socket as writable when the other side has shut
3136          * down the connection; this prevents sockets from getting stuck.
3137          */
3138         if (unix_writable(sk))
3139                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3140
3141         return mask;
3142 }
3143
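/*
 * poll() for datagram (and seqpacket) sockets.  The read side mirrors
 * unix_poll(), but a connected datagram socket is only reported
 * writable if the peer's receive queue has room; otherwise the caller
 * is queued on the peer's wakeup list via unix_dgram_peer_wake_me()
 * so it is woken once space frees up.
 */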
3144 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3145                                     poll_table *wait)
3146 {
3147         struct sock *sk = sock->sk, *other;
3148         unsigned int writable;
3149         __poll_t mask;
3150         u8 shutdown;
3151
3152         sock_poll_wait(file, sock, wait);
3153         mask = 0;
3154         shutdown = READ_ONCE(sk->sk_shutdown);
3155
3156         /* exceptional events? */
3157         if (READ_ONCE(sk->sk_err) ||
3158             !skb_queue_empty_lockless(&sk->sk_error_queue))
3159                 mask |= EPOLLERR |
3160                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3161
3162         if (shutdown & RCV_SHUTDOWN)
3163                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3164         if (shutdown == SHUTDOWN_MASK)
3165                 mask |= EPOLLHUP;
3166
3167         /* readable? */
3168         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3169                 mask |= EPOLLIN | EPOLLRDNORM;
3170         if (sk_is_readable(sk))
3171                 mask |= EPOLLIN | EPOLLRDNORM;
3172
3173         /* Connection-based sockets need to check for termination and startup */
3174         if (sk->sk_type == SOCK_SEQPACKET) {
3175                 if (sk->sk_state == TCP_CLOSE)
3176                         mask |= EPOLLHUP;
3177                 /* connection hasn't started yet? */
3178                 if (sk->sk_state == TCP_SYN_SENT)
3179                         return mask;
3180         }
3181
3182         /* No write status requested, avoid expensive OUT tests. */
3183         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3184                 return mask;
3185
3186         writable = unix_writable(sk);
3187         if (writable) {
3188                 unix_state_lock(sk);
3189
3190                 other = unix_peer(sk);
3191                 if (other && unix_peer(other) != sk &&
3192                     unix_recvq_full_lockless(other) &&
3193                     unix_dgram_peer_wake_me(sk, other))
3194                         writable = 0;
3195
3196                 unix_state_unlock(sk);
3197         }
3198
3199         if (writable)
3200                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3201         else
3202                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3203
3204         return mask;
3205 }
3206
3207 #ifdef CONFIG_PROC_FS
3208
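/*
 * /proc/net/unix iteration encodes its position in the seq_file *pos:
 * the upper bits select the hash bucket and the low BUCKET_SPACE bits
 * are the offset within that bucket.  A *pos of zero is reserved for
 * SEQ_START_TOKEN; offsets within a bucket start at 1.
 */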
3209 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3210
3211 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3212 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3213 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3214
3215 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3216 {
3217         unsigned long offset = get_offset(*pos);
3218         unsigned long bucket = get_bucket(*pos);
3219         unsigned long count = 0;
3220         struct sock *sk;
3221
3222         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3223              sk; sk = sk_next(sk)) {
3224                 if (++count == offset)
3225                         break;
3226         }
3227
3228         return sk;
3229 }
3230
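/*
 * Find the first socket at or after *pos, scanning buckets from the
 * one encoded in *pos onwards.  On success the matching bucket's
 * spinlock is left held; it is released by unix_get_next() when the
 * bucket is exhausted, or by unix_seq_stop().
 */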
3231 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3232 {
3233         unsigned long bucket = get_bucket(*pos);
3234         struct net *net = seq_file_net(seq);
3235         struct sock *sk;
3236
3237         while (bucket < UNIX_HASH_SIZE) {
3238                 spin_lock(&net->unx.table.locks[bucket]);
3239
3240                 sk = unix_from_bucket(seq, pos);
3241                 if (sk)
3242                         return sk;
3243
3244                 spin_unlock(&net->unx.table.locks[bucket]);
3245
3246                 *pos = set_bucket_offset(++bucket, 1);
3247         }
3248
3249         return NULL;
3250 }
3251
3252 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3253                                   loff_t *pos)
3254 {
3255         unsigned long bucket = get_bucket(*pos);
3256
3257         sk = sk_next(sk);
3258         if (sk)
3259                 return sk;
3260
3262         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3263
3264         *pos = set_bucket_offset(++bucket, 1);
3265
3266         return unix_get_first(seq, pos);
3267 }
3268
3269 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3270 {
3271         if (!*pos)
3272                 return SEQ_START_TOKEN;
3273
3274         return unix_get_first(seq, pos);
3275 }
3276
3277 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3278 {
3279         ++*pos;
3280
3281         if (v == SEQ_START_TOKEN)
3282                 return unix_get_first(seq, pos);
3283
3284         return unix_get_next(seq, v, pos);
3285 }
3286
3287 static void unix_seq_stop(struct seq_file *seq, void *v)
3288 {
3289         struct sock *sk = v;
3290
3291         if (sk)
3292                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3293 }
3294
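/*
 * Emit one /proc/net/unix line per socket: kernel address, refcount,
 * protocol (always 0), flags, type, pseudo socket state, inode number
 * and, for bound sockets, the path (abstract names are shown with '@'
 * in place of leading and embedded NUL bytes).  An illustrative line
 * might look like:
 *
 *   0000000000000000: 00000002 00000000 00010000 0001 01 12345 /run/foo.sock
 */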
3295 static int unix_seq_show(struct seq_file *seq, void *v)
3296 {
3298         if (v == SEQ_START_TOKEN)
3299                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3300                          "Inode Path\n");
3301         else {
3302                 struct sock *s = v;
3303                 struct unix_sock *u = unix_sk(s);
3304                 unix_state_lock(s);
3305
3306                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3307                         s,
3308                         refcount_read(&s->sk_refcnt),
3309                         0,
3310                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3311                         s->sk_type,
3312                         s->sk_socket ?
3313                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3314                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3315                         sock_i_ino(s));
3316
3317                 if (u->addr) {  // under a hash table lock here
3318                         int i, len;
3319                         seq_putc(seq, ' ');
3320
3321                         i = 0;
3322                         len = u->addr->len -
3323                                 offsetof(struct sockaddr_un, sun_path);
3324                         if (u->addr->name->sun_path[0]) {
3325                                 len--;
3326                         } else {
3327                                 seq_putc(seq, '@');
3328                                 i++;
3329                         }
3330                         for ( ; i < len; i++)
3331                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3332                                          '@');
3333                 }
3334                 unix_state_unlock(s);
3335                 seq_putc(seq, '\n');
3336         }
3337
3338         return 0;
3339 }
3340
3341 static const struct seq_operations unix_seq_ops = {
3342         .start  = unix_seq_start,
3343         .next   = unix_seq_next,
3344         .stop   = unix_seq_stop,
3345         .show   = unix_seq_show,
3346 };
3347
3348 #ifdef CONFIG_BPF_SYSCALL
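/*
 * State for the "unix" BPF socket iterator.  Sockets are batched one
 * hash bucket at a time: a reference is taken on every socket in the
 * bucket so the bucket lock can be dropped before the BPF program
 * runs (it may need to lock the socket, e.g. for bpf_setsockopt()).
 */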
3349 struct bpf_unix_iter_state {
3350         struct seq_net_private p;
3351         unsigned int cur_sk;
3352         unsigned int end_sk;
3353         unsigned int max_sk;
3354         struct sock **batch;
3355         bool st_bucket_done;
3356 };
3357
3358 struct bpf_iter__unix {
3359         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3360         __bpf_md_ptr(struct unix_sock *, unix_sk);
3361         uid_t uid __aligned(8);
3362 };
3363
3364 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3365                               struct unix_sock *unix_sk, uid_t uid)
3366 {
3367         struct bpf_iter__unix ctx;
3368
3369         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3370         ctx.meta = meta;
3371         ctx.unix_sk = unix_sk;
3372         ctx.uid = uid;
3373         return bpf_iter_run_prog(prog, &ctx);
3374 }
3375
3376 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3378 {
3379         struct bpf_unix_iter_state *iter = seq->private;
3380         unsigned int expected = 1;
3381         struct sock *sk;
3382
3383         sock_hold(start_sk);
3384         iter->batch[iter->end_sk++] = start_sk;
3385
3386         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3387                 if (iter->end_sk < iter->max_sk) {
3388                         sock_hold(sk);
3389                         iter->batch[iter->end_sk++] = sk;
3390                 }
3391
3392                 expected++;
3393         }
3394
3395         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3396
3397         return expected;
3398 }
3399
3400 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3401 {
3402         while (iter->cur_sk < iter->end_sk)
3403                 sock_put(iter->batch[iter->cur_sk++]);
3404 }
3405
3406 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3407                                        unsigned int new_batch_sz)
3408 {
3409         struct sock **new_batch;
3410
3411         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3412                              GFP_USER | __GFP_NOWARN);
3413         if (!new_batch)
3414                 return -ENOMEM;
3415
3416         bpf_iter_unix_put_batch(iter);
3417         kvfree(iter->batch);
3418         iter->batch = new_batch;
3419         iter->max_sk = new_batch_sz;
3420
3421         return 0;
3422 }
3423
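/*
 * Grab the next bucket's worth of sockets.  If the batch array turns
 * out to be too small for the whole bucket, grow it to 1.5x the
 * required size and retry once; if that fails, iterate over the
 * partial batch rather than giving up.
 */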
3424 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3425                                         loff_t *pos)
3426 {
3427         struct bpf_unix_iter_state *iter = seq->private;
3428         unsigned int expected;
3429         bool resized = false;
3430         struct sock *sk;
3431
3432         if (iter->st_bucket_done)
3433                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3434
3435 again:
3436         /* Get a new batch */
3437         iter->cur_sk = 0;
3438         iter->end_sk = 0;
3439
3440         sk = unix_get_first(seq, pos);
3441         if (!sk)
3442                 return NULL; /* Done */
3443
3444         expected = bpf_iter_unix_hold_batch(seq, sk);
3445
3446         if (iter->end_sk == expected) {
3447                 iter->st_bucket_done = true;
3448                 return sk;
3449         }
3450
3451         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3452                 resized = true;
3453                 goto again;
3454         }
3455
3456         return sk;
3457 }
3458
3459 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3460 {
3461         if (!*pos)
3462                 return SEQ_START_TOKEN;
3463
3464         /* bpf iter does not support lseek, so it always
3465          * continues from where it was stop()-ped.
3466          */
3467         return bpf_iter_unix_batch(seq, pos);
3468 }
3469
3470 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3471 {
3472         struct bpf_unix_iter_state *iter = seq->private;
3473         struct sock *sk;
3474
3475         /* Whenever seq_next() is called, the socket at iter->cur_sk is
3476          * done with seq_show(), so release it and advance to the next sk
3477          * in the batch.
3478          */
3479         if (iter->cur_sk < iter->end_sk)
3480                 sock_put(iter->batch[iter->cur_sk++]);
3481
3482         ++*pos;
3483
3484         if (iter->cur_sk < iter->end_sk)
3485                 sk = iter->batch[iter->cur_sk];
3486         else
3487                 sk = bpf_iter_unix_batch(seq, pos);
3488
3489         return sk;
3490 }
3491
3492 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3493 {
3494         struct bpf_iter_meta meta;
3495         struct bpf_prog *prog;
3496         struct sock *sk = v;
3497         uid_t uid;
3498         bool slow;
3499         int ret;
3500
3501         if (v == SEQ_START_TOKEN)
3502                 return 0;
3503
3504         slow = lock_sock_fast(sk);
3505
3506         if (unlikely(sk_unhashed(sk))) {
3507                 ret = SEQ_SKIP;
3508                 goto unlock;
3509         }
3510
3511         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3512         meta.seq = seq;
3513         prog = bpf_iter_get_info(&meta, false);
3514         ret = unix_prog_seq_show(prog, &meta, v, uid);
3515 unlock:
3516         unlock_sock_fast(sk, slow);
3517         return ret;
3518 }
3519
3520 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3521 {
3522         struct bpf_unix_iter_state *iter = seq->private;
3523         struct bpf_iter_meta meta;
3524         struct bpf_prog *prog;
3525
3526         if (!v) {
3527                 meta.seq = seq;
3528                 prog = bpf_iter_get_info(&meta, true);
3529                 if (prog)
3530                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3531         }
3532
3533         if (iter->cur_sk < iter->end_sk)
3534                 bpf_iter_unix_put_batch(iter);
3535 }
3536
3537 static const struct seq_operations bpf_iter_unix_seq_ops = {
3538         .start  = bpf_iter_unix_seq_start,
3539         .next   = bpf_iter_unix_seq_next,
3540         .stop   = bpf_iter_unix_seq_stop,
3541         .show   = bpf_iter_unix_seq_show,
3542 };
3543 #endif
3544 #endif
3545
3546 static const struct net_proto_family unix_family_ops = {
3547         .family = PF_UNIX,
3548         .create = unix_create,
3549         .owner  = THIS_MODULE,
3550 };
3551
3552
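/*
 * Per-network-namespace setup: register the sysctl and /proc/net/unix
 * entries and allocate this namespace's bound-socket hash table
 * (UNIX_HASH_SIZE buckets, each with its own spinlock).
 */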
3553 static int __net_init unix_net_init(struct net *net)
3554 {
3555         int i;
3556
3557         net->unx.sysctl_max_dgram_qlen = 10;
3558         if (unix_sysctl_register(net))
3559                 goto out;
3560
3561 #ifdef CONFIG_PROC_FS
3562         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3563                              sizeof(struct seq_net_private)))
3564                 goto err_sysctl;
3565 #endif
3566
3567         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3568                                               sizeof(spinlock_t), GFP_KERNEL);
3569         if (!net->unx.table.locks)
3570                 goto err_proc;
3571
3572         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3573                                                 sizeof(struct hlist_head),
3574                                                 GFP_KERNEL);
3575         if (!net->unx.table.buckets)
3576                 goto free_locks;
3577
3578         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3579                 spin_lock_init(&net->unx.table.locks[i]);
3580                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3581         }
3582
3583         return 0;
3584
3585 free_locks:
3586         kvfree(net->unx.table.locks);
3587 err_proc:
3588 #ifdef CONFIG_PROC_FS
3589         remove_proc_entry("unix", net->proc_net);
3590 err_sysctl:
3591 #endif
3592         unix_sysctl_unregister(net);
3593 out:
3594         return -ENOMEM;
3595 }
3596
3597 static void __net_exit unix_net_exit(struct net *net)
3598 {
3599         kvfree(net->unx.table.buckets);
3600         kvfree(net->unx.table.locks);
3601         unix_sysctl_unregister(net);
3602         remove_proc_entry("unix", net->proc_net);
3603 }
3604
3605 static struct pernet_operations unix_net_ops = {
3606         .init = unix_net_init,
3607         .exit = unix_net_exit,
3608 };
3609
3610 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3611 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3612                      struct unix_sock *unix_sk, uid_t uid)
3613
3614 #define INIT_BATCH_SZ 16
3615
3616 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3617 {
3618         struct bpf_unix_iter_state *iter = priv_data;
3619         int err;
3620
3621         err = bpf_iter_init_seq_net(priv_data, aux);
3622         if (err)
3623                 return err;
3624
3625         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3626         if (err) {
3627                 bpf_iter_fini_seq_net(priv_data);
3628                 return err;
3629         }
3630
3631         return 0;
3632 }
3633
3634 static void bpf_iter_fini_unix(void *priv_data)
3635 {
3636         struct bpf_unix_iter_state *iter = priv_data;
3637
3638         bpf_iter_fini_seq_net(priv_data);
3639         kvfree(iter->batch);
3640 }
3641
3642 static const struct bpf_iter_seq_info unix_seq_info = {
3643         .seq_ops                = &bpf_iter_unix_seq_ops,
3644         .init_seq_private       = bpf_iter_init_unix,
3645         .fini_seq_private       = bpf_iter_fini_unix,
3646         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3647 };
3648
3649 static const struct bpf_func_proto *
3650 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3651                              const struct bpf_prog *prog)
3652 {
3653         switch (func_id) {
3654         case BPF_FUNC_setsockopt:
3655                 return &bpf_sk_setsockopt_proto;
3656         case BPF_FUNC_getsockopt:
3657                 return &bpf_sk_getsockopt_proto;
3658         default:
3659                 return NULL;
3660         }
3661 }
3662
3663 static struct bpf_iter_reg unix_reg_info = {
3664         .target                 = "unix",
3665         .ctx_arg_info_size      = 1,
3666         .ctx_arg_info           = {
3667                 { offsetof(struct bpf_iter__unix, unix_sk),
3668                   PTR_TO_BTF_ID_OR_NULL },
3669         },
3670         .get_func_proto         = bpf_iter_unix_get_func_proto,
3671         .seq_info               = &unix_seq_info,
3672 };
3673
3674 static void __init bpf_iter_register(void)
3675 {
3676         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3677         if (bpf_iter_reg_target(&unix_reg_info))
3678                 pr_warn("Warning: could not register bpf iterator unix\n");
3679 }
3680 #endif
3681
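/*
 * Module init: set up the global BSD pathname hash, register the
 * datagram and stream proto slabs, the PF_UNIX socket family, the
 * per-netns operations, the sockmap/BPF protos and, when enabled,
 * the BPF iterator.
 */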
3682 static int __init af_unix_init(void)
3683 {
3684         int i, rc = -1;
3685
3686         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3687
3688         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3689                 spin_lock_init(&bsd_socket_locks[i]);
3690                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3691         }
3692
3693         rc = proto_register(&unix_dgram_proto, 1);
3694         if (rc != 0) {
3695                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3696                 goto out;
3697         }
3698
3699         rc = proto_register(&unix_stream_proto, 1);
3700         if (rc != 0) {
3701                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3702                 proto_unregister(&unix_dgram_proto);
3703                 goto out;
3704         }
3705
3706         sock_register(&unix_family_ops);
3707         register_pernet_subsys(&unix_net_ops);
3708         unix_bpf_build_proto();
3709
3710 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3711         bpf_iter_register();
3712 #endif
3713
3714 out:
3715         return rc;
3716 }
3717
3718 /* Later than subsys_initcall() because we depend on stuff initialised there */
3719 fs_initcall(af_unix_init);