net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <[email protected]>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
  28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  29  *                                      by above two patches.
  30  *           Andrea Arcangeli   :       If possible we block in connect(2)
  31  *                                      if the max backlog of the listen socket
  32  *                                      has been reached. This won't break
  33  *                                      old apps and it will avoid a huge number
  34  *                                      of socks hashed (this is for unix_gc()
  35  *                                      performance reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/filter.h>
  93 #include <linux/termios.h>
  94 #include <linux/sockios.h>
  95 #include <linux/net.h>
  96 #include <linux/in.h>
  97 #include <linux/fs.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <net/net_namespace.h>
 103 #include <net/sock.h>
 104 #include <net/tcp_states.h>
 105 #include <net/af_unix.h>
 106 #include <linux/proc_fs.h>
 107 #include <linux/seq_file.h>
 108 #include <net/scm.h>
 109 #include <linux/init.h>
 110 #include <linux/poll.h>
 111 #include <linux/rtnetlink.h>
 112 #include <linux/mount.h>
 113 #include <net/checksum.h>
 114 #include <linux/security.h>
 115 #include <linux/splice.h>
 116 #include <linux/freezer.h>
 117 #include <linux/file.h>
 118 #include <linux/btf_ids.h>
 119 #include <linux/bpf-cgroup.h>
 120
 121 #include "scm.h"
 122
 123 static atomic_long_t unix_nr_socks; /* socket count; decremented in unix_sock_destructor() */
 124 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; /* sk_bind_node chains, indexed by sk->sk_hash */
 125 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; /* bsd_socket_locks[i] is held around accesses to bsd_socket_buckets[i] */
 126
 127 /* SMP locking strategy:
 128  *    hash table is protected with spinlock.
 129  *    each socket state is protected by separate spinlock.
 130  */
 131
 132 static unsigned int unix_unbound_hash(struct sock *sk)
 133 {
 134         unsigned long hash = (unsigned long)sk;
 135
 136         hash ^= hash >> 16;
 137         hash ^= hash >> 8;
 138         hash ^= sk->sk_type;
 139
 140         return hash & UNIX_HASH_MOD;
 141 }
 142
 143 static unsigned int unix_bsd_hash(struct inode *i)
 144 {
 145         return i->i_ino & UNIX_HASH_MOD;
 146 }
 147
 148 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 149                                        int addr_len, int type)
 150 {
 151         __wsum csum = csum_partial(sunaddr, addr_len, 0);
 152         unsigned int hash;
 153
 154         hash = (__force unsigned int)csum_fold(csum);
 155         hash ^= hash >> 8;
 156         hash ^= type;
 157
 158         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 159 }
 160
 161 static void unix_table_double_lock(struct net *net,
 162                                    unsigned int hash1, unsigned int hash2)
 163 {
 164         if (hash1 == hash2) {
 165                 spin_lock(&net->unx.table.locks[hash1]);
 166                 return;
 167         }
 168
 169         if (hash1 > hash2)
 170                 swap(hash1, hash2);
 171
 172         spin_lock(&net->unx.table.locks[hash1]);
 173         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
 174 }
 175
 176 static void unix_table_double_unlock(struct net *net,
 177                                      unsigned int hash1, unsigned int hash2)
 178 {
 179         if (hash1 == hash2) {
 180                 spin_unlock(&net->unx.table.locks[hash1]);
 181                 return;
 182         }
 183
 184         spin_unlock(&net->unx.table.locks[hash1]);
 185         spin_unlock(&net->unx.table.locks[hash2]);
 186 }
 187
 188 #ifdef CONFIG_SECURITY_NETWORK
 189 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 190 {
 191         UNIXCB(skb).secid = scm->secid;
 192 }
 193
 194 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 195 {
 196         scm->secid = UNIXCB(skb).secid;
 197 }
 198
 199 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 200 {
 201         return (scm->secid == UNIXCB(skb).secid);
 202 }
 203 #else
 204 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 205 { }
 206
 207 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 208 { }
 209
 210 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 211 {
 212         return true;
 213 }
 214 #endif /* CONFIG_SECURITY_NETWORK */
 215
 216 #define unix_peer(sk) (unix_sk(sk)->peer)
 217
 218 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 219 {
 220         return unix_peer(osk) == sk;
 221 }
 222
 223 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 224 {
 225         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 226 }
 227
 228 static inline int unix_recvq_full(const struct sock *sk)
 229 {
 230         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 231 }
 232
 233 static inline int unix_recvq_full_lockless(const struct sock *sk)
 234 {
 235         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 236                 READ_ONCE(sk->sk_max_ack_backlog);
 237 }
 238
 239 struct sock *unix_peer_get(struct sock *s)
 240 {
 241         struct sock *peer;
 242
 243         unix_state_lock(s);
 244         peer = unix_peer(s);
 245         if (peer)
 246                 sock_hold(peer);
 247         unix_state_unlock(s);
 248         return peer;
 249 }
 250 EXPORT_SYMBOL_GPL(unix_peer_get);
 251
 252 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 253                                              int addr_len)
 254 {
 255         struct unix_address *addr;
 256
 257         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
 258         if (!addr)
 259                 return NULL;
 260
 261         refcount_set(&addr->refcnt, 1);
 262         addr->len = addr_len;
 263         memcpy(addr->name, sunaddr, addr_len);
 264
 265         return addr;
 266 }
 267
 268 static inline void unix_release_addr(struct unix_address *addr)
 269 {
 270         if (refcount_dec_and_test(&addr->refcnt))
 271                 kfree(addr);
 272 }
 273
 274 /*
 275  *      Check unix socket name:
 276  *              - should be not zero length.
 277  *              - if started by not zero, should be NULL terminated (FS object)
 278  *              - if started by zero, it is abstract name.
 279  */
 280
 281 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
 282 {
 283         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
 284             addr_len > sizeof(*sunaddr))
 285                 return -EINVAL;
 286
 287         if (sunaddr->sun_family != AF_UNIX)
 288                 return -EINVAL;
 289
 290         return 0;
 291 }
 292
 293 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
 294 {
 295         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
 296         short offset = offsetof(struct sockaddr_storage, __data);
 297
 298         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
 299
 300         /* This may look like an off by one error but it is a bit more
 301          * subtle.  108 is the longest valid AF_UNIX path for a binding.
 302          * sun_path[108] doesn't as such exist.  However in kernel space
 303          * we are guaranteed that it is a valid memory location in our
 304          * kernel address buffer because syscall functions always pass
 305          * a pointer of struct sockaddr_storage which has a bigger buffer
 306          * than 108.  Also, we must terminate sun_path for strlen() in
 307          * getname_kernel().
 308          */
 309         addr->__data[addr_len - offset] = 0;
 310
 311         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
 312          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
 313          * know the actual buffer.
 314          */
 315         return strlen(addr->__data) + offset + 1;
 316 }
 317
 318 static void __unix_remove_socket(struct sock *sk)
 319 {
 320         sk_del_node_init(sk);
 321 }
 322
 323 static void __unix_insert_socket(struct net *net, struct sock *sk)
 324 {
 325         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 326         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
 327 }
 328
 329 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
 330                                  struct unix_address *addr, unsigned int hash)
 331 {
 332         __unix_remove_socket(sk);
 333         smp_store_release(&unix_sk(sk)->addr, addr);
 334
 335         sk->sk_hash = hash;
 336         __unix_insert_socket(net, sk);
 337 }
 338
 339 static void unix_remove_socket(struct net *net, struct sock *sk)
 340 {
 341         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 342         __unix_remove_socket(sk);
 343         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 344 }
 345
 346 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 347 {
 348         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 349         __unix_insert_socket(net, sk);
 350         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 351 }
 352
 353 static void unix_insert_bsd_socket(struct sock *sk)
 354 {
 355         spin_lock(&bsd_socket_locks[sk->sk_hash]);
 356         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
 357         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 358 }
 359
 360 static void unix_remove_bsd_socket(struct sock *sk)
 361 {
 362         if (!hlist_unhashed(&sk->sk_bind_node)) {
 363                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
 364                 __sk_del_bind_node(sk);
 365                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 366
 367                 sk_node_init(&sk->sk_bind_node);
 368         }
 369 }
 370
 371 static struct sock *__unix_find_socket_byname(struct net *net,
 372                                               struct sockaddr_un *sunname,
 373                                               int len, unsigned int hash)
 374 {
 375         struct sock *s;
 376
 377         sk_for_each(s, &net->unx.table.buckets[hash]) {
 378                 struct unix_sock *u = unix_sk(s);
 379
 380                 if (u->addr->len == len &&
 381                     !memcmp(u->addr->name, sunname, len))
 382                         return s;
 383         }
 384         return NULL;
 385 }
 386
 387 static inline struct sock *unix_find_socket_byname(struct net *net,
 388                                                    struct sockaddr_un *sunname,
 389                                                    int len, unsigned int hash)
 390 {
 391         struct sock *s;
 392
 393         spin_lock(&net->unx.table.locks[hash]);
 394         s = __unix_find_socket_byname(net, sunname, len, hash);
 395         if (s)
 396                 sock_hold(s);
 397         spin_unlock(&net->unx.table.locks[hash]);
 398         return s;
 399 }
 400
 401 static struct sock *unix_find_socket_byinode(struct inode *i)
 402 {
 403         unsigned int hash = unix_bsd_hash(i);
 404         struct sock *s;
 405
 406         spin_lock(&bsd_socket_locks[hash]);
 407         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
 408                 struct dentry *dentry = unix_sk(s)->path.dentry;
 409
 410                 if (dentry && d_backing_inode(dentry) == i) {
 411                         sock_hold(s);
 412                         spin_unlock(&bsd_socket_locks[hash]);
 413                         return s;
 414                 }
 415         }
 416         spin_unlock(&bsd_socket_locks[hash]);
 417         return NULL;
 418 }
 419
 420 /* Support code for asymmetrically connected dgram sockets
 421  *
 422  * If a datagram socket is connected to a socket not itself connected
 423  * to the first socket (eg, /dev/log), clients may only enqueue more
 424  * messages if the present receive queue of the server socket is not
 425  * "too large". This means there's a second writeability condition
 426  * poll and sendmsg need to test. The dgram recv code will do a wake
 427  * up on the peer_wait wait queue of a socket upon reception of a
 428  * datagram which needs to be propagated to sleeping would-be writers
 429  * since these might not have sent anything so far. This can't be
 430  * accomplished via poll_wait because the lifetime of the server
 431  * socket might be less than that of its clients if these break their
 432  * association with it or if the server socket is closed while clients
 433  * are still connected to it and there's no way to inform "a polling
 434  * implementation" that it should let go of a certain wait queue
 435  *
 436  * In order to propagate a wake up, a wait_queue_entry_t of the client
 437  * socket is enqueued on the peer_wait queue of the server socket
 438  * whose wake function does a wake_up on the ordinary client socket
 439  * wait queue. This connection is established whenever a write (or
 440  * poll for write) hit the flow control condition and broken when the
 441  * association to the server socket is dissolved or after a wake up
 442  * was relayed.
 443  */
 444
 445 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 446                                       void *key)
 447 {
 448         struct unix_sock *u;
 449         wait_queue_head_t *u_sleep;
 450
 451         u = container_of(q, struct unix_sock, peer_wake);
 452
 453         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 454                             q);
 455         u->peer_wake.private = NULL;
 456
 457         /* relaying can only happen while the wq still exists */
 458         u_sleep = sk_sleep(&u->sk);
 459         if (u_sleep)
 460                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 461
 462         return 0;
 463 }
 464
 465 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 466 {
 467         struct unix_sock *u, *u_other;
 468         int rc;
 469
 470         u = unix_sk(sk);
 471         u_other = unix_sk(other);
 472         rc = 0;
 473         spin_lock(&u_other->peer_wait.lock);
 474
 475         if (!u->peer_wake.private) {
 476                 u->peer_wake.private = other;
 477                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 478
 479                 rc = 1;
 480         }
 481
 482         spin_unlock(&u_other->peer_wait.lock);
 483         return rc;
 484 }
 485
 486 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 487                                             struct sock *other)
 488 {
 489         struct unix_sock *u, *u_other;
 490
 491         u = unix_sk(sk);
 492         u_other = unix_sk(other);
 493         spin_lock(&u_other->peer_wait.lock);
 494
 495         if (u->peer_wake.private == other) {
 496                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 497                 u->peer_wake.private = NULL;
 498         }
 499
 500         spin_unlock(&u_other->peer_wait.lock);
 501 }
 502
 503 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 504                                                    struct sock *other)
 505 {
 506         unix_dgram_peer_wake_disconnect(sk, other);
 507         wake_up_interruptible_poll(sk_sleep(sk),
 508                                    EPOLLOUT |
 509                                    EPOLLWRNORM |
 510                                    EPOLLWRBAND);
 511 }
 512
 513 /* preconditions:
 514  *      - unix_peer(sk) == other
 515  *      - association is stable
 516  */
 517 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 518 {
 519         int connected;
 520
 521         connected = unix_dgram_peer_wake_connect(sk, other);
 522
 523         /* If other is SOCK_DEAD, we want to make sure we signal
 524          * POLLOUT, such that a subsequent write() can get a
 525          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 526          * to other and its full, we will hang waiting for POLLOUT.
 527          */
 528         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 529                 return 1;
 530
 531         if (connected)
 532                 unix_dgram_peer_wake_disconnect(sk, other);
 533
 534         return 0;
 535 }
 536
 537 static int unix_writable(const struct sock *sk)
 538 {
 539         return sk->sk_state != TCP_LISTEN &&
 540                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 541 }
 542
 543 static void unix_write_space(struct sock *sk)
 544 {
 545         struct socket_wq *wq;
 546
 547         rcu_read_lock();
 548         if (unix_writable(sk)) {
 549                 wq = rcu_dereference(sk->sk_wq);
 550                 if (skwq_has_sleeper(wq))
 551                         wake_up_interruptible_sync_poll(&wq->wait,
 552                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 553                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 554         }
 555         rcu_read_unlock();
 556 }
 557
 558 /* When dgram socket disconnects (or changes its peer), we clear its receive
 559  * queue of packets arrived from previous peer. First, it allows to do
 560  * flow control based only on wmem_alloc; second, sk connected to peer
 561  * may receive messages only from that peer. */
 562 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 563 {
 564         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 565                 skb_queue_purge(&sk->sk_receive_queue);
 566                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 567
 568                 /* If one link of bidirectional dgram pipe is disconnected,
 569                  * we signal error. Messages are lost. Do not make this,
 570                  * when peer was not connected to us.
 571                  */
 572                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 573                         WRITE_ONCE(other->sk_err, ECONNRESET);
 574                         sk_error_report(other);
 575                 }
 576         }
 577         other->sk_state = TCP_CLOSE;
 578 }
 579
 580 static void unix_sock_destructor(struct sock *sk)
 581 {
 582         struct unix_sock *u = unix_sk(sk);
 583
 584         skb_queue_purge(&sk->sk_receive_queue);
 585
 586         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 587         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 588         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
 589         if (!sock_flag(sk, SOCK_DEAD)) {
 590                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 591                 return;
 592         }
 593
 594         if (u->addr)
 595                 unix_release_addr(u->addr);
 596
 597         atomic_long_dec(&unix_nr_socks);
 598         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 599 #ifdef UNIX_REFCNT_DEBUG
 600         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 601                 atomic_long_read(&unix_nr_socks));
 602 #endif
 603 }
 604
 605 static void unix_release_sock(struct sock *sk, int embrion)
 606 {
 607         struct unix_sock *u = unix_sk(sk);
 608         struct sock *skpair;
 609         struct sk_buff *skb;
 610         struct path path;
 611         int state;
 612
 613         unix_remove_socket(sock_net(sk), sk);
 614         unix_remove_bsd_socket(sk);
 615
 616         /* Clear state */
 617         unix_state_lock(sk);
 618         sock_orphan(sk);
 619         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
 620         path         = u->path;
 621         u->path.dentry = NULL;
 622         u->path.mnt = NULL;
 623         state = sk->sk_state;
 624         sk->sk_state = TCP_CLOSE;
 625
 626         skpair = unix_peer(sk);
 627         unix_peer(sk) = NULL;
 628
 629         unix_state_unlock(sk);
 630
 631 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 632         if (u->oob_skb) {
 633                 kfree_skb(u->oob_skb);
 634                 u->oob_skb = NULL;
 635         }
 636 #endif
 637
 638         wake_up_interruptible_all(&u->peer_wait);
 639
 640         if (skpair != NULL) {
 641                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 642                         unix_state_lock(skpair);
 643                         /* No more writes */
 644                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
 645                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 646                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
 647                         unix_state_unlock(skpair);
 648                         skpair->sk_state_change(skpair);
 649                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 650                 }
 651
 652                 unix_dgram_peer_wake_disconnect(sk, skpair);
 653                 sock_put(skpair); /* It may now die */
 654         }
 655
 656         /* Try to flush out this socket. Throw out buffers at least */
 657
 658         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 659                 if (state == TCP_LISTEN)
 660                         unix_release_sock(skb->sk, 1);
 661                 /* passed fds are erased in the kfree_skb hook        */
 662                 UNIXCB(skb).consumed = skb->len;
 663                 kfree_skb(skb);
 664         }
 665
 666         if (path.dentry)
 667                 path_put(&path);
 668
 669         sock_put(sk);
 670
 671         /* ---- Socket is dead now and most probably destroyed ---- */
 672
 673         /*
 674          * Fixme: BSD difference: In BSD all sockets connected to us get
 675          *        ECONNRESET and we die on the spot. In Linux we behave
 676          *        like files and pipes do and wait for the last
 677          *        dereference.
 678          *
 679          * Can't we simply set sock->err?
 680          *
 681          *        What the above comment does talk about? --ANK(980817)
 682          */
 683
 684         if (READ_ONCE(unix_tot_inflight))
 685                 unix_gc();              /* Garbage collect fds */
 686 }
 687
 688 static void init_peercred(struct sock *sk)
 689 {
 690         const struct cred *old_cred;
 691         struct pid *old_pid;
 692
 693         spin_lock(&sk->sk_peer_lock);
 694         old_pid = sk->sk_peer_pid;
 695         old_cred = sk->sk_peer_cred;
 696         sk->sk_peer_pid  = get_pid(task_tgid(current));
 697         sk->sk_peer_cred = get_current_cred();
 698         spin_unlock(&sk->sk_peer_lock);
 699
 700         put_pid(old_pid);
 701         put_cred(old_cred);
 702 }
 703
 704 static void copy_peercred(struct sock *sk, struct sock *peersk)
 705 {
 706         const struct cred *old_cred;
 707         struct pid *old_pid;
 708
 709         if (sk < peersk) {
 710                 spin_lock(&sk->sk_peer_lock);
 711                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 712         } else {
 713                 spin_lock(&peersk->sk_peer_lock);
 714                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 715         }
 716         old_pid = sk->sk_peer_pid;
 717         old_cred = sk->sk_peer_cred;
 718         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 719         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 720
 721         spin_unlock(&sk->sk_peer_lock);
 722         spin_unlock(&peersk->sk_peer_lock);
 723
 724         put_pid(old_pid);
 725         put_cred(old_cred);
 726 }
 727
 728 static int unix_listen(struct socket *sock, int backlog)
 729 {
 730         int err;
 731         struct sock *sk = sock->sk;
 732         struct unix_sock *u = unix_sk(sk);
 733
 734         err = -EOPNOTSUPP;
 735         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 736                 goto out;       /* Only stream/seqpacket sockets accept */
 737         err = -EINVAL;
 738         if (!u->addr)
 739                 goto out;       /* No listens on an unbound socket */
 740         unix_state_lock(sk);
 741         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 742                 goto out_unlock;
 743         if (backlog > sk->sk_max_ack_backlog)
 744                 wake_up_interruptible_all(&u->peer_wait);
 745         sk->sk_max_ack_backlog  = backlog;
 746         sk->sk_state            = TCP_LISTEN;
 747         /* set credentials so connect can copy them */
 748         init_peercred(sk);
 749         err = 0;
 750
 751 out_unlock:
 752         unix_state_unlock(sk);
 753 out:
 754         return err;
 755 }
 756
 757 static int unix_release(struct socket *);
 758 static int unix_bind(struct socket *, struct sockaddr *, int);
 759 static int unix_stream_connect(struct socket *, struct sockaddr *,
 760                                int addr_len, int flags);
 761 static int unix_socketpair(struct socket *, struct socket *);
 762 static int unix_accept(struct socket *, struct socket *, int, bool);
 763 static int unix_getname(struct socket *, struct sockaddr *, int);
 764 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 765 static __poll_t unix_dgram_poll(struct file *, struct socket *,
 766                                     poll_table *);
 767 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 768 #ifdef CONFIG_COMPAT
 769 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 770 #endif
 771 static int unix_shutdown(struct socket *, int);
 772 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 773 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 774 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 775                                        struct pipe_inode_info *, size_t size,
 776                                        unsigned int flags);
 777 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 778 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 779 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 780 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 781 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 782                               int, int);
 783 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 784 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 785                                   int);
 786
 787 static int unix_set_peek_off(struct sock *sk, int val)
 788 {
 789         struct unix_sock *u = unix_sk(sk);
 790
 791         if (mutex_lock_interruptible(&u->iolock))
 792                 return -EINTR;
 793
 794         WRITE_ONCE(sk->sk_peek_off, val);
 795         mutex_unlock(&u->iolock);
 796
 797         return 0;
 798 }
 799
 800 #ifdef CONFIG_PROC_FS
 801 static int unix_count_nr_fds(struct sock *sk)
 802 {
 803         struct sk_buff *skb;
 804         struct unix_sock *u;
 805         int nr_fds = 0;
 806
 807         spin_lock(&sk->sk_receive_queue.lock);
 808         skb = skb_peek(&sk->sk_receive_queue);
 809         while (skb) {
 810                 u = unix_sk(skb->sk);
 811                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
 812                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
 813         }
 814         spin_unlock(&sk->sk_receive_queue.lock);
 815
 816         return nr_fds;
 817 }
 818
 819 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 820 {
 821         struct sock *sk = sock->sk;
 822         unsigned char s_state;
 823         struct unix_sock *u;
 824         int nr_fds = 0;
 825
 826         if (sk) {
 827                 s_state = READ_ONCE(sk->sk_state);
 828                 u = unix_sk(sk);
 829
 830                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
 831                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
 832                  * SOCK_DGRAM is ordinary. So, no lock is needed.
 833                  */
 834                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
 835                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
 836                 else if (s_state == TCP_LISTEN)
 837                         nr_fds = unix_count_nr_fds(sk);
 838
 839                 seq_printf(m, "scm_fds: %u\n", nr_fds);
 840         }
 841 }
 842 #else
 843 #define unix_show_fdinfo NULL
 844 #endif
 845
 846 static const struct proto_ops unix_stream_ops = {
 847         .family =       PF_UNIX,
 848         .owner =        THIS_MODULE,
 849         .release =      unix_release,
 850         .bind =         unix_bind,
 851         .connect =      unix_stream_connect,
 852         .socketpair =   unix_socketpair,
 853         .accept =       unix_accept,
 854         .getname =      unix_getname,
 855         .poll =         unix_poll,
 856         .ioctl =        unix_ioctl,
 857 #ifdef CONFIG_COMPAT
 858         .compat_ioctl = unix_compat_ioctl,
 859 #endif
 860         .listen =       unix_listen,
 861         .shutdown =     unix_shutdown,
 862         .sendmsg =      unix_stream_sendmsg,
 863         .recvmsg =      unix_stream_recvmsg,
 864         .read_skb =     unix_stream_read_skb,
 865         .mmap =         sock_no_mmap,
 866         .splice_read =  unix_stream_splice_read,
 867         .set_peek_off = unix_set_peek_off,
 868         .show_fdinfo =  unix_show_fdinfo,
 869 };
 870
 871 static const struct proto_ops unix_dgram_ops = {
 872         .family =       PF_UNIX,
 873         .owner =        THIS_MODULE,
 874         .release =      unix_release,
 875         .bind =         unix_bind,
 876         .connect =      unix_dgram_connect,
 877         .socketpair =   unix_socketpair,
 878         .accept =       sock_no_accept,
 879         .getname =      unix_getname,
 880         .poll =         unix_dgram_poll,
 881         .ioctl =        unix_ioctl,
 882 #ifdef CONFIG_COMPAT
 883         .compat_ioctl = unix_compat_ioctl,
 884 #endif
 885         .listen =       sock_no_listen,
 886         .shutdown =     unix_shutdown,
 887         .sendmsg =      unix_dgram_sendmsg,
 888         .read_skb =     unix_read_skb,
 889         .recvmsg =      unix_dgram_recvmsg,
 890         .mmap =         sock_no_mmap,
 891         .set_peek_off = unix_set_peek_off,
 892         .show_fdinfo =  unix_show_fdinfo,
 893 };
 894
 895 static const struct proto_ops unix_seqpacket_ops = {
 896         .family =       PF_UNIX,
 897         .owner =        THIS_MODULE,
 898         .release =      unix_release,
 899         .bind =         unix_bind,
 900         .connect =      unix_stream_connect,
 901         .socketpair =   unix_socketpair,
 902         .accept =       unix_accept,
 903         .getname =      unix_getname,
 904         .poll =         unix_dgram_poll,
 905         .ioctl =        unix_ioctl,
 906 #ifdef CONFIG_COMPAT
 907         .compat_ioctl = unix_compat_ioctl,
 908 #endif
 909         .listen =       unix_listen,
 910         .shutdown =     unix_shutdown,
 911         .sendmsg =      unix_seqpacket_sendmsg,
 912         .recvmsg =      unix_seqpacket_recvmsg,
 913         .mmap =         sock_no_mmap,
 914         .set_peek_off = unix_set_peek_off,
 915         .show_fdinfo =  unix_show_fdinfo,
 916 };
 917
 918 static void unix_close(struct sock *sk, long timeout)
 919 {
 920         /* Nothing to do here, unix socket does not need a ->close().
 921          * This is merely for sockmap.
 922          */
 923 }
 924
 925 static void unix_unhash(struct sock *sk)
 926 {
 927         /* Nothing to do here, unix socket does not need a ->unhash().
 928          * This is merely for sockmap.
 929          */
 930 }
 931
 932 static bool unix_bpf_bypass_getsockopt(int level, int optname)
 933 {
 934         if (level == SOL_SOCKET) {
 935                 switch (optname) {
 936                 case SO_PEERPIDFD:
 937                         return true;
 938                 default:
 939                         return false;
 940                 }
 941         }
 942
 943         return false;
 944 }
 945
 946 struct proto unix_dgram_proto = {
 947         .name                   = "UNIX",
 948         .owner                  = THIS_MODULE,
 949         .obj_size               = sizeof(struct unix_sock),
 950         .close                  = unix_close,
 951         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 952 #ifdef CONFIG_BPF_SYSCALL
 953         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
 954 #endif
 955 };
 956
 957 struct proto unix_stream_proto = {
 958         .name                   = "UNIX-STREAM",
 959         .owner                  = THIS_MODULE,
 960         .obj_size               = sizeof(struct unix_sock),
 961         .close                  = unix_close,
 962         .unhash                 = unix_unhash,
 963         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 964 #ifdef CONFIG_BPF_SYSCALL
 965         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
 966 #endif
 967 };
 968
 969 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 970 {
 971         struct unix_sock *u;
 972         struct sock *sk;
 973         int err;
 974
 975         atomic_long_inc(&unix_nr_socks);
 976         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
 977                 err = -ENFILE;
 978                 goto err;
 979         }
 980
 981         if (type == SOCK_STREAM)
 982                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
 983         else /*dgram and  seqpacket */
 984                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
 985
 986         if (!sk) {
 987                 err = -ENOMEM;
 988                 goto err;
 989         }
 990
 991         sock_init_data(sock, sk);
 992
 993         sk->sk_hash             = unix_unbound_hash(sk);
 994         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 995         sk->sk_write_space      = unix_write_space;
 996         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 997         sk->sk_destruct         = unix_sock_destructor;
 998         u         = unix_sk(sk);
 999         u->path.dentry = NULL;
1000         u->path.mnt = NULL;
1001         spin_lock_init(&u->lock);
1002         atomic_long_set(&u->inflight, 0);
1003         INIT_LIST_HEAD(&u->link);
1004         mutex_init(&u->iolock); /* single task reading lock */
1005         mutex_init(&u->bindlock); /* single task binding lock */
1006         init_waitqueue_head(&u->peer_wait);
1007         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1008         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1009         unix_insert_unbound_socket(net, sk);
1010
1011         sock_prot_inuse_add(net, sk->sk_prot, 1);
1012
1013         return sk;
1014
1015 err:
1016         atomic_long_dec(&unix_nr_socks);
1017         return ERR_PTR(err);
1018 }
1019
1020 static int unix_create(struct net *net, struct socket *sock, int protocol,
1021                        int kern)
1022 {
1023         struct sock *sk;
1024
1025         if (protocol && protocol != PF_UNIX)
1026                 return -EPROTONOSUPPORT;
1027
1028         sock->state = SS_UNCONNECTED;
1029
1030         switch (sock->type) {
1031         case SOCK_STREAM:
1032                 sock->ops = &unix_stream_ops;
1033                 break;
1034                 /*
1035                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1036                  *      nothing uses it.
1037                  */
1038         case SOCK_RAW:
1039                 sock->type = SOCK_DGRAM;
1040                 fallthrough;
1041         case SOCK_DGRAM:
1042                 sock->ops = &unix_dgram_ops;
1043                 break;
1044         case SOCK_SEQPACKET:
1045                 sock->ops = &unix_seqpacket_ops;
1046                 break;
1047         default:
1048                 return -ESOCKTNOSUPPORT;
1049         }
1050
1051         sk = unix_create1(net, sock, kern, sock->type);
1052         if (IS_ERR(sk))
1053                 return PTR_ERR(sk);
1054
1055         return 0;
1056 }
1057
1058 static int unix_release(struct socket *sock)
1059 {
1060         struct sock *sk = sock->sk;
1061
1062         if (!sk)
1063                 return 0;
1064
1065         sk->sk_prot->close(sk, 0);
1066         unix_release_sock(sk, 0);
1067         sock->sk = NULL;
1068
1069         return 0;
1070 }
1071
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * Returns the sock found by unix_find_socket_byinode() (released with
 * sock_put() on the failure path here), or an ERR_PTR():
 *   - the kern_path() / path_permission(MAY_WRITE) error,
 *   - -ECONNREFUSED when the inode is not a socket or nothing is bound,
 *   - -EPROTOTYPE when the bound socket's sk_type differs from @type.
 * On success the path's atime is touched.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct sock *found;
	struct inode *inode;
	struct path path;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto drop_path;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode)) {
		err = -ECONNREFUSED;
		goto drop_path;
	}

	found = unix_find_socket_byinode(inode);
	if (!found) {
		err = -ECONNREFUSED;
		goto drop_path;
	}

	if (found->sk_type != type) {
		err = -EPROTOTYPE;
		sock_put(found);
		goto drop_path;
	}

	touch_atime(&path);
	path_put(&path);

	return found;

drop_path:
	path_put(&path);
	return ERR_PTR(err);
}
1115
/*
 * Look up a socket by name through unix_find_socket_byname(), using the
 * hash computed by unix_abstract_hash().
 *
 * Returns the sock, or ERR_PTR(-ECONNREFUSED) when no socket matches.  If
 * the socket found has a path dentry, that path's atime is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);

	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	if (unix_sk(sk)->path.dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}
1134
1135 static struct sock *unix_find_other(struct net *net,
1136                                     struct sockaddr_un *sunaddr,
1137                                     int addr_len, int type)
1138 {
1139         struct sock *sk;
1140
1141         if (sunaddr->sun_path[0])
1142                 sk = unix_find_bsd(sunaddr, addr_len, type);
1143         else
1144                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1145
1146         return sk;
1147 }
1148
1149 static int unix_autobind(struct sock *sk)
1150 {
1151         unsigned int new_hash, old_hash = sk->sk_hash;
1152         struct unix_sock *u = unix_sk(sk);
1153         struct net *net = sock_net(sk);
1154         struct unix_address *addr;
1155         u32 lastnum, ordernum;
1156         int err;
1157
1158         err = mutex_lock_interruptible(&u->bindlock);
1159         if (err)
1160                 return err;
1161
1162         if (u->addr)
1163                 goto out;
1164
1165         err = -ENOMEM;
1166         addr = kzalloc(sizeof(*addr) +
1167                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1168         if (!addr)
1169                 goto out;
1170
1171         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1172         addr->name->sun_family = AF_UNIX;
1173         refcount_set(&addr->refcnt, 1);
1174
1175         ordernum = get_random_u32();
1176         lastnum = ordernum & 0xFFFFF;
1177 retry:
1178         ordernum = (ordernum + 1) & 0xFFFFF;
1179         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1180
1181         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1182         unix_table_double_lock(net, old_hash, new_hash);
1183
1184         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1185                 unix_table_double_unlock(net, old_hash, new_hash);
1186
1187                 /* __unix_find_socket_byname() may take long time if many names
1188                  * are already in use.
1189                  */
1190                 cond_resched();
1191
1192                 if (ordernum == lastnum) {
1193                         /* Give up if all names seems to be in use. */
1194                         err = -ENOSPC;
1195                         unix_release_addr(addr);
1196                         goto out;
1197                 }
1198
1199                 goto retry;
1200         }
1201
1202         __unix_set_addr_hash(net, sk, addr, new_hash);
1203         unix_table_double_unlock(net, old_hash, new_hash);
1204         err = 0;
1205
1206 out:    mutex_unlock(&u->bindlock);
1207         return err;
1208 }
1209
1210 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1211                          int addr_len)
1212 {
1213         umode_t mode = S_IFSOCK |
1214                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1215         unsigned int new_hash, old_hash = sk->sk_hash;
1216         struct unix_sock *u = unix_sk(sk);
1217         struct net *net = sock_net(sk);
1218         struct mnt_idmap *idmap;
1219         struct unix_address *addr;
1220         struct dentry *dentry;
1221         struct path parent;
1222         int err;
1223
1224         addr_len = unix_mkname_bsd(sunaddr, addr_len);
1225         addr = unix_create_addr(sunaddr, addr_len);
1226         if (!addr)
1227                 return -ENOMEM;
1228
1229         /*
1230          * Get the parent directory, calculate the hash for last
1231          * component.
1232          */
1233         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1234         if (IS_ERR(dentry)) {
1235                 err = PTR_ERR(dentry);
1236                 goto out;
1237         }
1238
1239         /*
1240          * All right, let's create it.
1241          */
1242         idmap = mnt_idmap(parent.mnt);
1243         err = security_path_mknod(&parent, dentry, mode, 0);
1244         if (!err)
1245                 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1246         if (err)
1247                 goto out_path;
1248         err = mutex_lock_interruptible(&u->bindlock);
1249         if (err)
1250                 goto out_unlink;
1251         if (u->addr)
1252                 goto out_unlock;
1253
1254         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1255         unix_table_double_lock(net, old_hash, new_hash);
1256         u->path.mnt = mntget(parent.mnt);
1257         u->path.dentry = dget(dentry);
1258         __unix_set_addr_hash(net, sk, addr, new_hash);
1259         unix_table_double_unlock(net, old_hash, new_hash);
1260         unix_insert_bsd_socket(sk);
1261         mutex_unlock(&u->bindlock);
1262         done_path_create(&parent, dentry);
1263         return 0;
1264
1265 out_unlock:
1266         mutex_unlock(&u->bindlock);
1267         err = -EINVAL;
1268 out_unlink:
1269         /* failed after successful mknod?  unlink what we'd created... */
1270         vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1271 out_path:
1272         done_path_create(&parent, dentry);
1273 out:
1274         unix_release_addr(addr);
1275         return err == -EEXIST ? -EADDRINUSE : err;
1276 }
1277
1278 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1279                               int addr_len)
1280 {
1281         unsigned int new_hash, old_hash = sk->sk_hash;
1282         struct unix_sock *u = unix_sk(sk);
1283         struct net *net = sock_net(sk);
1284         struct unix_address *addr;
1285         int err;
1286
1287         addr = unix_create_addr(sunaddr, addr_len);
1288         if (!addr)
1289                 return -ENOMEM;
1290
1291         err = mutex_lock_interruptible(&u->bindlock);
1292         if (err)
1293                 goto out;
1294
1295         if (u->addr) {
1296                 err = -EINVAL;
1297                 goto out_mutex;
1298         }
1299
1300         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1301         unix_table_double_lock(net, old_hash, new_hash);
1302
1303         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1304                 goto out_spin;
1305
1306         __unix_set_addr_hash(net, sk, addr, new_hash);
1307         unix_table_double_unlock(net, old_hash, new_hash);
1308         mutex_unlock(&u->bindlock);
1309         return 0;
1310
1311 out_spin:
1312         unix_table_double_unlock(net, old_hash, new_hash);
1313         err = -EADDRINUSE;
1314 out_mutex:
1315         mutex_unlock(&u->bindlock);
1316 out:
1317         unix_release_addr(addr);
1318         return err;
1319 }
1320
1321 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1322 {
1323         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1324         struct sock *sk = sock->sk;
1325         int err;
1326
1327         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1328             sunaddr->sun_family == AF_UNIX)
1329                 return unix_autobind(sk);
1330
1331         err = unix_validate_addr(sunaddr, addr_len);
1332         if (err)
1333                 return err;
1334
1335         if (sunaddr->sun_path[0])
1336                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1337         else
1338                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1339
1340         return err;
1341 }
1342
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * When @sk2 is NULL or the same socket as @sk1, only @sk1 is locked.
 * Otherwise the socket at the lower address is locked first and the other
 * one is taken with the nested variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (!sk2 || unlikely(sk1 == sk2)) {
		unix_state_lock(sk1);
		return;
	}

	first = sk1 < sk2 ? sk1 : sk2;
	second = sk1 < sk2 ? sk2 : sk1;

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1357
/*
 * Release what unix_state_double_lock() took: always @sk1, plus @sk2 when
 * it is non-NULL and a different socket.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (sk2 && likely(sk1 != sk2))
		unix_state_unlock(sk2);
}
1367
1368 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1369                               int alen, int flags)
1370 {
1371         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1372         struct sock *sk = sock->sk;
1373         struct sock *other;
1374         int err;
1375
1376         err = -EINVAL;
1377         if (alen < offsetofend(struct sockaddr, sa_family))
1378                 goto out;
1379
1380         if (addr->sa_family != AF_UNSPEC) {
1381                 err = unix_validate_addr(sunaddr, alen);
1382                 if (err)
1383                         goto out;
1384
1385                 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1386                 if (err)
1387                         goto out;
1388
1389                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1390                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1391                     !unix_sk(sk)->addr) {
1392                         err = unix_autobind(sk);
1393                         if (err)
1394                                 goto out;
1395                 }
1396
1397 restart:
1398                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1399                 if (IS_ERR(other)) {
1400                         err = PTR_ERR(other);
1401                         goto out;
1402                 }
1403
1404                 unix_state_double_lock(sk, other);
1405
1406                 /* Apparently VFS overslept socket death. Retry. */
1407                 if (sock_flag(other, SOCK_DEAD)) {
1408                         unix_state_double_unlock(sk, other);
1409                         sock_put(other);
1410                         goto restart;
1411                 }
1412
1413                 err = -EPERM;
1414                 if (!unix_may_send(sk, other))
1415                         goto out_unlock;
1416
1417                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1418                 if (err)
1419                         goto out_unlock;
1420
1421                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1422         } else {
1423                 /*
1424                  *      1003.1g breaking connected state with AF_UNSPEC
1425                  */
1426                 other = NULL;
1427                 unix_state_double_lock(sk, other);
1428         }
1429
1430         /*
1431          * If it was connected, reconnect.
1432          */
1433         if (unix_peer(sk)) {
1434                 struct sock *old_peer = unix_peer(sk);
1435
1436                 unix_peer(sk) = other;
1437                 if (!other)
1438                         sk->sk_state = TCP_CLOSE;
1439                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1440
1441                 unix_state_double_unlock(sk, other);
1442
1443                 if (other != old_peer)
1444                         unix_dgram_disconnected(sk, old_peer);
1445                 sock_put(old_peer);
1446         } else {
1447                 unix_peer(sk) = other;
1448                 unix_state_double_unlock(sk, other);
1449         }
1450
1451         return 0;
1452
1453 out_unlock:
1454         unix_state_double_unlock(sk, other);
1455         sock_put(other);
1456 out:
1457         return err;
1458 }
1459
1460 static long unix_wait_for_peer(struct sock *other, long timeo)
1461         __releases(&unix_sk(other)->lock)
1462 {
1463         struct unix_sock *u = unix_sk(other);
1464         int sched;
1465         DEFINE_WAIT(wait);
1466
1467         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1468
1469         sched = !sock_flag(other, SOCK_DEAD) &&
1470                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1471                 unix_recvq_full_lockless(other);
1472
1473         unix_state_unlock(other);
1474
1475         if (sched)
1476                 timeo = schedule_timeout(timeo);
1477
1478         finish_wait(&u->peer_wait, &wait);
1479         return timeo;
1480 }
1481
/*
 * ->connect() handler for SOCK_STREAM and SOCK_SEQPACKET sockets.
 *
 * A new sock (@newsk) that will become the server-side endpoint and a
 * one-byte skb carrying it are allocated up front.  The listener is then
 * looked up; if its queue is full the caller waits (unless the send
 * timeout is zero).  Finally @sk and @newsk are cross-linked as peers and
 * the skb is queued on the listener for accept() to pick up.
 *
 * Returns 0 on success, otherwise one of:
 *   - the error from address validation, the BPF cgroup hook, autobind,
 *     unix_create1(), peer lookup or the security hook,
 *   - -ENOMEM if the skb cannot be allocated,
 *   - -ECONNREFUSED if the target is not listening or has shut down
 *     receiving,
 *   - -EAGAIN if the listener's queue is full and the timeout is zero,
 *   - sock_intr_errno() if a signal arrived while waiting,
 *   - -EISCONN / -EINVAL if @sk is already connected or in another
 *     unexpected state.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	/* Only other's state lock is held here, not its receive-queue
	 * lock, so use the lockless helper as unix_wait_for_peer() does.
	 */
	if (unix_recvq_full_lockless(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops other's state lock. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	/* sk's own state lock is not held yet: marked lockless read. */
	st = READ_ONCE(sk->sk_state);

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* State changed while we were not holding the lock: start over. */
	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	/* Pairs with lockless sk_state readers such as the READ_ONCE()
	 * above and the one in unix_show_fdinfo().
	 */
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1679
1680 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1681 {
1682         struct sock *ska = socka->sk, *skb = sockb->sk;
1683
1684         /* Join our sockets back to back */
1685         sock_hold(ska);
1686         sock_hold(skb);
1687         unix_peer(ska) = skb;
1688         unix_peer(skb) = ska;
1689         init_peercred(ska);
1690         init_peercred(skb);
1691
1692         ska->sk_state = TCP_ESTABLISHED;
1693         skb->sk_state = TCP_ESTABLISHED;
1694         socka->state  = SS_CONNECTED;
1695         sockb->state  = SS_CONNECTED;
1696         return 0;
1697 }
1698
1699 static void unix_sock_inherit_flags(const struct socket *old,
1700                                     struct socket *new)
1701 {
1702         if (test_bit(SOCK_PASSCRED, &old->flags))
1703                 set_bit(SOCK_PASSCRED, &new->flags);
1704         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1705                 set_bit(SOCK_PASSPIDFD, &new->flags);
1706         if (test_bit(SOCK_PASSSEC, &old->flags))
1707                 set_bit(SOCK_PASSSEC, &new->flags);
1708 }
1709
1710 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1711                        bool kern)
1712 {
1713         struct sock *sk = sock->sk;
1714         struct sock *tsk;
1715         struct sk_buff *skb;
1716         int err;
1717
1718         err = -EOPNOTSUPP;
1719         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1720                 goto out;
1721
1722         err = -EINVAL;
1723         if (sk->sk_state != TCP_LISTEN)
1724                 goto out;
1725
1726         /* If socket state is TCP_LISTEN it cannot change (for now...),
1727          * so that no locks are necessary.
1728          */
1729
1730         skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1731                                 &err);
1732         if (!skb) {
1733                 /* This means receive shutdown. */
1734                 if (err == 0)
1735                         err = -EINVAL;
1736                 goto out;
1737         }
1738
1739         tsk = skb->sk;
1740         skb_free_datagram(sk, skb);
1741         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1742
1743         /* attach accepted sock to socket */
1744         unix_state_lock(tsk);
1745         newsock->state = SS_CONNECTED;
1746         unix_sock_inherit_flags(sock, newsock);
1747         sock_graft(tsk, newsock);
1748         unix_state_unlock(tsk);
1749         return 0;
1750
1751 out:
1752         return err;
1753 }
1754
1755
1756 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1757 {
1758         struct sock *sk = sock->sk;
1759         struct unix_address *addr;
1760         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1761         int err = 0;
1762
1763         if (peer) {
1764                 sk = unix_peer_get(sk);
1765
1766                 err = -ENOTCONN;
1767                 if (!sk)
1768                         goto out;
1769                 err = 0;
1770         } else {
1771                 sock_hold(sk);
1772         }
1773
1774         addr = smp_load_acquire(&unix_sk(sk)->addr);
1775         if (!addr) {
1776                 sunaddr->sun_family = AF_UNIX;
1777                 sunaddr->sun_path[0] = 0;
1778                 err = offsetof(struct sockaddr_un, sun_path);
1779         } else {
1780                 err = addr->len;
1781                 memcpy(sunaddr, addr->name, addr->len);
1782
1783                 if (peer)
1784                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1785                                                CGROUP_UNIX_GETPEERNAME);
1786                 else
1787                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1788                                                CGROUP_UNIX_GETSOCKNAME);
1789         }
1790         sock_put(sk);
1791 out:
1792         return err;
1793 }
1794
1795 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1796 {
1797         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1798
1799         /*
1800          * Garbage collection of unix sockets starts by selecting a set of
1801          * candidate sockets which have reference only from being in flight
1802          * (total_refs == inflight_refs).  This condition is checked once during
1803          * the candidate collection phase, and candidates are marked as such, so
1804          * that non-candidates can later be ignored.  While inflight_refs is
1805          * protected by unix_gc_lock, total_refs (file count) is not, hence this
1806          * is an instantaneous decision.
1807          *
1808          * Once a candidate, however, the socket must not be reinstalled into a
1809          * file descriptor while the garbage collection is in progress.
1810          *
1811          * If the above conditions are met, then the directed graph of
1812          * candidates (*) does not change while unix_gc_lock is held.
1813          *
1814          * Any operations that changes the file count through file descriptors
1815          * (dup, close, sendmsg) does not change the graph since candidates are
1816          * not installed in fds.
1817          *
1818          * Dequeing a candidate via recvmsg would install it into an fd, but
1819          * that takes unix_gc_lock to decrement the inflight count, so it's
1820          * serialized with garbage collection.
1821          *
1822          * MSG_PEEK is special in that it does not change the inflight count,
1823          * yet does install the socket into an fd.  The following lock/unlock
1824          * pair is to ensure serialization with garbage collection.  It must be
1825          * done between incrementing the file count and installing the file into
1826          * an fd.
1827          *
1828          * If garbage collection starts after the barrier provided by the
1829          * lock/unlock, then it will see the elevated refcount and not mark this
1830          * as a candidate.  If a garbage collection is already in progress
1831          * before the file count was incremented, then the lock/unlock pair will
1832          * ensure that garbage collection is finished before progressing to
1833          * installing the fd.
1834          *
1835          * (*) A -> B where B is on the queue of A or B is on the queue of C
1836          * which is on the queue of listening socket A.
1837          */
1838         spin_lock(&unix_gc_lock);
1839         spin_unlock(&unix_gc_lock);
1840 }
1841
1842 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1843 {
1844         int err = 0;
1845
1846         UNIXCB(skb).pid  = get_pid(scm->pid);
1847         UNIXCB(skb).uid = scm->creds.uid;
1848         UNIXCB(skb).gid = scm->creds.gid;
1849         UNIXCB(skb).fp = NULL;
1850         unix_get_secdata(scm, skb);
1851         if (scm->fp && send_fds)
1852                 err = unix_attach_fds(scm, skb);
1853
1854         skb->destructor = unix_destruct_scm;
1855         return err;
1856 }
1857
1858 static bool unix_passcred_enabled(const struct socket *sock,
1859                                   const struct sock *other)
1860 {
1861         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1862                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1863                !other->sk_socket ||
1864                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1865                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1866 }
1867
1868 /*
1869  * Some apps rely on write() giving SCM_CREDENTIALS
1870  * We include credentials if source or destination socket
1871  * asserted SOCK_PASSCRED.
1872  */
1873 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1874                             const struct sock *other)
1875 {
1876         if (UNIXCB(skb).pid)
1877                 return;
1878         if (unix_passcred_enabled(sock, other)) {
1879                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1880                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1881         }
1882 }
1883
1884 static bool unix_skb_scm_eq(struct sk_buff *skb,
1885                             struct scm_cookie *scm)
1886 {
1887         return UNIXCB(skb).pid == scm->pid &&
1888                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1889                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1890                unix_secdata_eq(scm, skb);
1891 }
1892
1893 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1894 {
1895         struct scm_fp_list *fp = UNIXCB(skb).fp;
1896         struct unix_sock *u = unix_sk(sk);
1897
1898         if (unlikely(fp && fp->count))
1899                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1900 }
1901
1902 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1903 {
1904         struct scm_fp_list *fp = UNIXCB(skb).fp;
1905         struct unix_sock *u = unix_sk(sk);
1906
1907         if (unlikely(fp && fp->count))
1908                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1909 }
1910
1911 /*
1912  *      Send AF_UNIX data.
      *
      *      Builds one skb holding all @len bytes of @msg and queues it on the
      *      receiver's receive queue.  The receiver is looked up from the
      *      address in @msg when msg_namelen is non-zero, otherwise it is the
      *      connected peer.  unix_seqpacket_sendmsg() also ends up here, after
      *      clearing msg_namelen.
      *
      *      Returns @len on success - and also when the receiver's socket
      *      filter rejects the skb, which is then dropped silently - or a
      *      negative errno.
      *
      *      The error labels fall through in this order:
      *        out_unlock - release other's state lock, and sk's if sk_locked
      *        out_free   - free the skb
      *        out        - drop the reference on other (if any), destroy scm
1913  */
1914
1915 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1916                               size_t len)
1917 {
1918         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1919         struct sock *sk = sock->sk, *other = NULL;
1920         struct unix_sock *u = unix_sk(sk);
1921         struct scm_cookie scm;
1922         struct sk_buff *skb;
1923         int data_len = 0;
1924         int sk_locked;
1925         long timeo;
1926         int err;
1927
1928         wait_for_unix_gc();
1929         err = scm_send(sock, msg, &scm, false);
1930         if (err < 0)
1931                 return err;
1932
1933         err = -EOPNOTSUPP;
1934         if (msg->msg_flags&MSG_OOB)
1935                 goto out;
1936
             /* Explicit destination address, or else the connected peer. */
1937         if (msg->msg_namelen) {
1938                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1939                 if (err)
1940                         goto out;
1941
1942                 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1943                                                             msg->msg_name,
1944                                                             &msg->msg_namelen,
1945                                                             NULL);
1946                 if (err)
1947                         goto out;
1948         } else {
1949                 sunaddr = NULL;
1950                 err = -ENOTCONN;
1951                 other = unix_peer_get(sk);
1952                 if (!other)
1953                         goto out;
1954         }
1955
             /* A sender that asked for credential passing but has no address
              * yet gets one from unix_autobind().
              */
1956         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1957              test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1958                 err = unix_autobind(sk);
1959                 if (err)
1960                         goto out;
1961         }
1962
1963         err = -EMSGSIZE;
1964         if (len > sk->sk_sndbuf - 32)
1965                 goto out;
1966
             /* The excess over SKB_MAX_ALLOC, capped at MAX_SKB_FRAGS pages and
              * rounded up to a page multiple, goes into page fragments; the
              * remaining len - data_len bytes stay in the linear area.
              */
1967         if (len > SKB_MAX_ALLOC) {
1968                 data_len = min_t(size_t,
1969                                  len - SKB_MAX_ALLOC,
1970                                  MAX_SKB_FRAGS * PAGE_SIZE);
1971                 data_len = PAGE_ALIGN(data_len);
1972
1973                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1974         }
1975
1976         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1977                                    msg->msg_flags & MSG_DONTWAIT, &err,
1978                                    PAGE_ALLOC_COSTLY_ORDER);
1979         if (skb == NULL)
1980                 goto out;
1981
1982         err = unix_scm_to_skb(&scm, skb, true);
1983         if (err < 0)
1984                 goto out_free;
1985
1986         skb_put(skb, len - data_len);
1987         skb->data_len = data_len;
1988         skb->len = len;
1989         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1990         if (err)
1991                 goto out_free;
1992
1993         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1994
             /* Reached with other == NULL for a send to an explicit address and
              * again after a dead receiver was dropped below; reached with
              * other still set after waiting for room in its queue.
              */
1995 restart:
1996         if (!other) {
1997                 err = -ECONNRESET;
1998                 if (sunaddr == NULL)
1999                         goto out_free;
2000
2001                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2002                                         sk->sk_type);
2003                 if (IS_ERR(other)) {
2004                         err = PTR_ERR(other);
2005                         other = NULL;
2006                         goto out_free;
2007                 }
2008         }
2009
2010         if (sk_filter(other, skb) < 0) {
2011                 /* Toss the packet but do not return any error to the sender */
2012                 err = len;
2013                 goto out_free;
2014         }
2015
             /* sk_locked is set once sk's state lock is held in addition to
              * other's; out_unlock and the delivery path release it.
              */
2016         sk_locked = 0;
2017         unix_state_lock(other);
2018 restart_locked:
2019         err = -EPERM;
2020         if (!unix_may_send(sk, other))
2021                 goto out_unlock;
2022
2023         if (unlikely(sock_flag(other, SOCK_DEAD))) {
2024                 /*
2025                  *      Check with 1003.1g - what should
2026                  *      datagram error
2027                  */
2028                 unix_state_unlock(other);
2029                 sock_put(other);
2030
                     /* sk's peer pointer is inspected and cleared under sk's
                      * state lock.
                      */
2031                 if (!sk_locked)
2032                         unix_state_lock(sk);
2033
2034                 err = 0;
2035                 if (sk->sk_type == SOCK_SEQPACKET) {
2036                         /* We are here only when racing with unix_release_sock()
2037                          * is clearing @other. Never change state to TCP_CLOSE
2038                          * unlike SOCK_DGRAM wants.
2039                          */
2040                         unix_state_unlock(sk);
2041                         err = -EPIPE;
2042                 } else if (unix_peer(sk) == other) {
2043                         unix_peer(sk) = NULL;
2044                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2045
2046                         sk->sk_state = TCP_CLOSE;
2047                         unix_state_unlock(sk);
2048
2049                         unix_dgram_disconnected(sk, other);
                             /* NOTE(review): second put on other - presumably
                              * the reference owned by the peer pointer that
                              * was just cleared; confirm.
                              */
2050                         sock_put(other);
2051                         err = -ECONNREFUSED;
2052                 } else {
2053                         unix_state_unlock(sk);
2054                 }
2055
                     /* err == 0 only if the dead sock was not our peer: forget
                      * it and look the destination up again.
                      */
2056                 other = NULL;
2057                 if (err)
2058                         goto out_free;
2059                 goto restart;
2060         }
2061
2062         err = -EPIPE;
2063         if (other->sk_shutdown & RCV_SHUTDOWN)
2064                 goto out_unlock;
2065
2066         if (sk->sk_type != SOCK_SEQPACKET) {
2067                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2068                 if (err)
2069                         goto out_unlock;
2070         }
2071
2072         /* other == sk && unix_peer(other) != sk if
2073          * - unix_peer(sk) == NULL, destination address bound to sk
2074          * - unix_peer(sk) == sk by time of get but disconnected before lock
2075          */
2076         if (other != sk &&
2077             unlikely(unix_peer(other) != sk &&
2078             unix_recvq_full_lockless(other))) {
                     /* Blocking send: wait for the receiver, then start over.
                      * NOTE(review): neither exit below unlocks other, so
                      * unix_wait_for_peer() is expected to return with
                      * other's state lock released - confirm.
                      */
2079                 if (timeo) {
2080                         timeo = unix_wait_for_peer(other, timeo);
2081
2082                         err = sock_intr_errno(timeo);
2083                         if (signal_pending(current))
2084                                 goto out_free;
2085
2086                         goto restart;
2087                 }
2088
                     /* Non-blocking send: both state locks are needed from here
                      * on, so other's lock is dropped and both are taken with
                      * unix_state_double_lock().
                      */
2089                 if (!sk_locked) {
2090                         unix_state_unlock(other);
2091                         unix_state_double_lock(sk, other);
2092                 }
2093
2094                 if (unix_peer(sk) != other ||
2095                     unix_dgram_peer_wake_me(sk, other)) {
2096                         err = -EAGAIN;
2097                         sk_locked = 1;
2098                         goto out_unlock;
2099                 }
2100
                     /* other's lock was released above, so every check made
                      * since restart_locked is repeated with both locks held.
                      */
2101                 if (!sk_locked) {
2102                         sk_locked = 1;
2103                         goto restart_locked;
2104                 }
2105         }
2106
2107         if (unlikely(sk_locked))
2108                 unix_state_unlock(sk);
2109
             /* Deliver: the skb is queued while other's state lock is held. */
2110         if (sock_flag(other, SOCK_RCVTSTAMP))
2111                 __net_timestamp(skb);
2112         maybe_add_creds(skb, sock, other);
2113         scm_stat_add(other, skb);
2114         skb_queue_tail(&other->sk_receive_queue, skb);
2115         unix_state_unlock(other);
2116         other->sk_data_ready(other);
2117         sock_put(other);
2118         scm_destroy(&scm);
2119         return len;
2120
2121 out_unlock:
2122         if (sk_locked)
2123                 unix_state_unlock(sk);
2124         unix_state_unlock(other);
2125 out_free:
2126         kfree_skb(skb);
2127 out:
2128         if (other)
2129                 sock_put(other);
2130         scm_destroy(&scm);
2131         return err;
2132 }
2133
2134 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2135  * bytes, and a minimum of a full page.
      *
      * unix_stream_sendmsg() adds this to SKB_MAX_HEAD(0) to cap the size of
      * each skb it builds, and passes get_order() of it as the maximum page
      * order to sock_alloc_send_pskb().
2136  */
2137 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2138
2139 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * queue_oob - send the out-of-band byte of a MSG_OOB stream send.
 *
 * Allocates a one-byte skb, fills it with the next byte of @msg, records it
 * as @other's oob_skb (dropping the pointer's reference on any previous one)
 * and appends it to @other's receive queue, then signals SIGURG and data
 * readiness.  Files in @scm are attached only when @fds_sent is false.
 *
 * Returns 0 on success, -EPIPE when @other is dead or shut down for receive,
 * or the error from allocating, attaching scm data to, or filling the skb.
 */
2140 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2141                      struct scm_cookie *scm, bool fds_sent)
2142 {
2143         struct unix_sock *ousk = unix_sk(other);
2144         struct sk_buff *skb;
2145         int err = 0;
2146
2147         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2148
2149         if (!skb)
2150                 return err;
2151
2152         err = unix_scm_to_skb(scm, skb, !fds_sent);
2153         if (err < 0) {
2154                 kfree_skb(skb);
2155                 return err;
2156         }
2157         skb_put(skb, 1);
2158         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2159
2160         if (err) {
2161                 kfree_skb(skb);
2162                 return err;
2163         }
2164
2165         unix_state_lock(other);
2166
2167         if (sock_flag(other, SOCK_DEAD) ||
2168             (other->sk_shutdown & RCV_SHUTDOWN)) {
2169                 unix_state_unlock(other);
2170                 kfree_skb(skb);
2171                 return -EPIPE;
2172         }
2173
2174         maybe_add_creds(skb, sock, other);
             /* The skb is referenced from two places below: the oob_skb
              * pointer and the receive queue.  This takes the second
              * reference.
              */
2175         skb_get(skb);
2176
             /* A newer OOB byte replaces the old one as the OOB mark; only the
              * pointer's reference on the old skb is dropped here.
              */
2177         if (ousk->oob_skb)
2178                 consume_skb(ousk->oob_skb);
2179
2180         WRITE_ONCE(ousk->oob_skb, skb);
2181
2182         scm_stat_add(other, skb);
2183         skb_queue_tail(&other->sk_receive_queue, skb);
2184         sk_send_sigurg(other);
2185         unix_state_unlock(other);
2186         other->sk_data_ready(other);
2187
             /* err is 0 here: the copy above succeeded. */
2188         return err;
2189 }
2190 #endif
2191
/*
 * unix_stream_sendmsg - send @len bytes on a connected stream socket.
 *
 * The payload is cut into skbs that are queued one at a time on the peer's
 * receive queue.  With MSG_OOB (and CONFIG_AF_UNIX_OOB) the last byte is held
 * back and sent through queue_oob() after the loop.  Passed files travel in
 * the first skb only.
 *
 * Returns the number of bytes sent.  If nothing was sent, returns a negative
 * errno instead: -EOPNOTSUPP for unsupported or empty MSG_OOB sends,
 * -EISCONN/-EOPNOTSUPP when a destination address is supplied, -ENOTCONN
 * without a peer, -EPIPE when this side is shut down for sending or the peer
 * is dead or shut down for receive, or an allocation/copy error.  On the
 * -EPIPE paths SIGPIPE is raised unless data was already sent or MSG_NOSIGNAL
 * is set.
 */
2192 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2193                                size_t len)
2194 {
2195         struct sock *sk = sock->sk;
2196         struct sock *other = NULL;
2197         int err, size;
2198         struct sk_buff *skb;
2199         int sent = 0;
2200         struct scm_cookie scm;
2201         bool fds_sent = false;
2202         int data_len;
2203
2204         wait_for_unix_gc();
2205         err = scm_send(sock, msg, &scm, false);
2206         if (err < 0)
2207                 return err;
2208
2209         err = -EOPNOTSUPP;
2210         if (msg->msg_flags & MSG_OOB) {
                     /* With OOB support and at least one byte, hold the last
                      * byte back for queue_oob(); in every other case the
                      * goto below fails the call with -EOPNOTSUPP.
                      */
2211 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2212                 if (len)
2213                         len--;
2214                 else
2215 #endif
2216                         goto out_err;
2217         }
2218
2219         if (msg->msg_namelen) {
2220                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2221                 goto out_err;
2222         } else {
2223                 err = -ENOTCONN;
                     /* Plain unix_peer(): this function never calls sock_put()
                      * on other, so no reference is taken here.
                      */
2224                 other = unix_peer(sk);
2225                 if (!other)
2226                         goto out_err;
2227         }
2228
2229         if (sk->sk_shutdown & SEND_SHUTDOWN)
2230                 goto pipe_err;
2231
2232         while (sent < len) {
2233                 size = len - sent;
2234
2235                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
                             /* Empty skb; the data is attached further down by
                              * skb_splice_from_iter().
                              */
2236                         skb = sock_alloc_send_pskb(sk, 0, 0,
2237                                                    msg->msg_flags & MSG_DONTWAIT,
2238                                                    &err, 0);
2239                 } else {
2240                         /* Keep two messages in the pipe so it schedules better */
2241                         size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2242
2243                         /* allow fallback to order-0 allocations */
2244                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2245
                             /* data_len is the part placed in page fragments:
                              * whatever exceeds SKB_MAX_HEAD(0), rounded up to
                              * a page multiple but never more than size.
                              */
2246                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2247
2248                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2249
2250                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2251                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2252                                                    get_order(UNIX_SKB_FRAGS_SZ));
2253                 }
2254                 if (!skb)
2255                         goto out_err;
2256
2257                 /* Only send the fds in the first buffer */
2258                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2259                 if (err < 0) {
2260                         kfree_skb(skb);
2261                         goto out_err;
2262                 }
2263                 fds_sent = true;
2264
2265                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2266                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2267                                                    sk->sk_allocation);
2268                         if (err < 0) {
2269                                 kfree_skb(skb);
2270                                 goto out_err;
2271                         }
                             /* The splice may take fewer bytes than asked for;
                              * continue with what it actually attached.
                              * NOTE(review): the refcount_add() presumably
                              * charges those bytes to the send buffer, since
                              * the skb was allocated with size 0 - confirm.
                              */
2272                         size = err;
2273                         refcount_add(size, &sk->sk_wmem_alloc);
2274                 } else {
2275                         skb_put(skb, size - data_len);
2276                         skb->data_len = data_len;
2277                         skb->len = size;
2278                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2279                         if (err) {
2280                                 kfree_skb(skb);
2281                                 goto out_err;
2282                         }
2283                 }
2284
2285                 unix_state_lock(other);
2286
2287                 if (sock_flag(other, SOCK_DEAD) ||
2288                     (other->sk_shutdown & RCV_SHUTDOWN))
2289                         goto pipe_err_free;
2290
2291                 maybe_add_creds(skb, sock, other);
2292                 scm_stat_add(other, skb);
2293                 skb_queue_tail(&other->sk_receive_queue, skb);
2294                 unix_state_unlock(other);
2295                 other->sk_data_ready(other);
2296                 sent += size;
2297         }
2298
2299 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2300         if (msg->msg_flags & MSG_OOB) {
2301                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2302                 if (err)
2303                         goto out_err;
2304                 sent++;
2305         }
2306 #endif
2307
2308         scm_destroy(&scm);
2309
2310         return sent;
2311
2312 pipe_err_free:
2313         unix_state_unlock(other);
2314         kfree_skb(skb);
2315 pipe_err:
2316         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2317                 send_sig(SIGPIPE, current, 0);
2318         err = -EPIPE;
2319 out_err:
2320         scm_destroy(&scm);
             /* A partial send reports the byte count and hides the error. */
2321         return sent ? : err;
2322 }
2323
2324 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2325                                   size_t len)
2326 {
2327         int err;
2328         struct sock *sk = sock->sk;
2329
2330         err = sock_error(sk);
2331         if (err)
2332                 return err;
2333
2334         if (sk->sk_state != TCP_ESTABLISHED)
2335                 return -ENOTCONN;
2336
2337         if (msg->msg_namelen)
2338                 msg->msg_namelen = 0;
2339
2340         return unix_dgram_sendmsg(sock, msg, len);
2341 }
2342
2343 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2344                                   size_t size, int flags)
2345 {
2346         struct sock *sk = sock->sk;
2347
2348         if (sk->sk_state != TCP_ESTABLISHED)
2349                 return -ENOTCONN;
2350
2351         return unix_dgram_recvmsg(sock, msg, size, flags);
2352 }
2353
2354 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2355 {
2356         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2357
2358         if (addr) {
2359                 msg->msg_namelen = addr->len;
2360                 memcpy(msg->msg_name, addr->name, addr->len);
2361         }
2362 }
2363
/*
 * __unix_dgram_recvmsg - receive one datagram queued on @sk.
 *
 * Up to @size bytes are copied to @msg; MSG_TRUNC is set in msg->msg_flags
 * when the datagram held more.  The sender's address is stored if
 * msg->msg_name is set, and the skb's credentials, security data and passed
 * files are handed over through an scm cookie.  With MSG_PEEK the peek
 * offset is advanced by the amount copied and passed files are duplicated
 * instead of detached.
 *
 * Returns the number of bytes copied - or the remaining length of the
 * datagram when MSG_TRUNC is set in @flags - or a negative errno
 * (-EOPNOTSUPP for MSG_OOB).  A SOCK_SEQPACKET socket that is shut down for
 * receive returns 0 where it would otherwise return -EAGAIN.
 */
2364 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2365                          int flags)
2366 {
2367         struct scm_cookie scm;
2368         struct socket *sock = sk->sk_socket;
2369         struct unix_sock *u = unix_sk(sk);
2370         struct sk_buff *skb, *last;
2371         long timeo;
2372         int skip;
2373         int err;
2374
2375         err = -EOPNOTSUPP;
2376         if (flags&MSG_OOB)
2377                 goto out;
2378
2379         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2380
             /* Try to take an skb under iolock.  On success the loop is left
              * with iolock still held (it is released at out_free); on failure
              * iolock is dropped before waiting for more packets.
              */
2381         do {
2382                 mutex_lock(&u->iolock);
2383
2384                 skip = sk_peek_offset(sk, flags);
2385                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2386                                               &skip, &err, &last);
2387                 if (skb) {
2388                         if (!(flags & MSG_PEEK))
2389                                 scm_stat_del(sk, skb);
2390                         break;
2391                 }
2392
2393                 mutex_unlock(&u->iolock);
2394
2395                 if (err != -EAGAIN)
2396                         break;
2397         } while (timeo &&
2398                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2399                                               &err, &timeo, last));
2400
2401         if (!skb) { /* implies iolock unlocked */
2402                 unix_state_lock(sk);
2403                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2404                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2405                     (sk->sk_shutdown & RCV_SHUTDOWN))
2406                         err = 0;
2407                 unix_state_unlock(sk);
2408                 goto out;
2409         }
2410
             /* Wake tasks sleeping on peer_wait with a "writable" event -
              * presumably senders blocked on a full receive queue.
              */
2411         if (wq_has_sleeper(&u->peer_wait))
2412                 wake_up_interruptible_sync_poll(&u->peer_wait,
2413                                                 EPOLLOUT | EPOLLWRNORM |
2414                                                 EPOLLWRBAND);
2415
2416         if (msg->msg_name) {
                     /* skb->sk is the sock whose bound address is reported as
                      * the source.
                      */
2417                 unix_copy_addr(msg, skb->sk);
2418
2419                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2420                                                       msg->msg_name,
2421                                                       &msg->msg_namelen);
2422         }
2423
             /* skip is the peek offset into the skb: only skb->len - skip
              * bytes remain to be read.
              */
2424         if (size > skb->len - skip)
2425                 size = skb->len - skip;
2426         else if (size < skb->len - skip)
2427                 msg->msg_flags |= MSG_TRUNC;
2428
2429         err = skb_copy_datagram_msg(skb, skip, msg, size);
2430         if (err)
2431                 goto out_free;
2432
2433         if (sock_flag(sk, SOCK_RCVTSTAMP))
2434                 __sock_recv_timestamp(msg, sk, skb);
2435
2436         memset(&scm, 0, sizeof(scm));
2437
2438         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2439         unix_set_secdata(&scm, skb);
2440
2441         if (!(flags & MSG_PEEK)) {
2442                 if (UNIXCB(skb).fp)
2443                         unix_detach_fds(&scm, skb);
2444
                     /* The skb is consumed: move the peek offset back by its
                      * full length.
                      */
2445                 sk_peek_offset_bwd(sk, skb->len);
2446         } else {
2447                 /* It is questionable: on PEEK we could:
2448                    - do not return fds - good, but too simple 8)
2449                    - return fds, and do not return them on read (old strategy,
2450                      apparently wrong)
2451                    - clone fds (I chose it for now, it is the most universal
2452                      solution)
2453
2454                    POSIX 1003.1g does not actually define this clearly
2455                    at all. POSIX 1003.1g doesn't define a lot of things
2456                    clearly however!
2457
2458                 */
2459
2460                 sk_peek_offset_fwd(sk, size);
2461
2462                 if (UNIXCB(skb).fp)
2463                         unix_peek_fds(&scm, skb);
2464         }
             /* MSG_TRUNC passed in by the caller asks for the real length
              * rather than the copied length.
              */
2465         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2466
2467         scm_recv_unix(sock, msg, &scm, flags);
2468
2469 out_free:
2470         skb_free_datagram(sk, skb);
2471         mutex_unlock(&u->iolock);
2472 out:
2473         return err;
2474 }
2475
2476 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2477                               int flags)
2478 {
2479         struct sock *sk = sock->sk;
2480
2481 #ifdef CONFIG_BPF_SYSCALL
2482         const struct proto *prot = READ_ONCE(sk->sk_prot);
2483
2484         if (prot != &unix_dgram_proto)
2485                 return prot->recvmsg(sk, msg, size, flags, NULL);
2486 #endif
2487         return __unix_dgram_recvmsg(sk, msg, size, flags);
2488 }
2489
2490 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2491 {
2492         struct unix_sock *u = unix_sk(sk);
2493         struct sk_buff *skb;
2494         int err;
2495
2496         mutex_lock(&u->iolock);
2497         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2498         mutex_unlock(&u->iolock);
2499         if (!skb)
2500                 return err;
2501
2502         return recv_actor(sk, skb);
2503 }
2504
2505 /*
2506  *      Sleep until more data has arrived, but check for races.
      *
      *      @last and @last_len describe the tail of the receive queue as the
      *      caller last saw it: the wait ends as soon as the tail is a
      *      different skb or its length differs.  It also ends on a pending
      *      socket error, receive shutdown, a pending signal, an exhausted
      *      timeout, or when the sock is found dead after waking up.
      *      @freezable adds TASK_FREEZABLE to the sleep state.
      *
      *      Returns the timeout that is left.
2507  */
2508 static long unix_stream_data_wait(struct sock *sk, long timeo,
2509                                   struct sk_buff *last, unsigned int last_len,
2510                                   bool freezable)
2511 {
             /* freezable is a bool, so the product is 0 or TASK_FREEZABLE. */
2512         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2513         struct sk_buff *tail;
2514         DEFINE_WAIT(wait);
2515
2516         unix_state_lock(sk);
2517
2518         for (;;) {
                     /* Get on the wait queue before testing the conditions,
                      * both under the state lock; the lock is dropped only
                      * around the actual sleep.
                      */
2519                 prepare_to_wait(sk_sleep(sk), &wait, state);
2520
2521                 tail = skb_peek_tail(&sk->sk_receive_queue);
2522                 if (tail != last ||
2523                     (tail && tail->len != last_len) ||
2524                     sk->sk_err ||
2525                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2526                     signal_pending(current) ||
2527                     !timeo)
2528                         break;
2529
2530                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2531                 unix_state_unlock(sk);
2532                 timeo = schedule_timeout(timeo);
2533                 unix_state_lock(sk);
2534
                     /* A dead sock leaves the loop without clearing
                      * SOCKWQ_ASYNC_WAITDATA.
                      */
2535                 if (sock_flag(sk, SOCK_DEAD))
2536                         break;
2537
2538                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2539         }
2540
2541         finish_wait(sk_sleep(sk), &wait);
2542         unix_state_unlock(sk);
2543         return timeo;
2544 }
2545
2546 static unsigned int unix_skb_len(const struct sk_buff *skb)
2547 {
2548         return skb->len - UNIXCB(skb).consumed;
2549 }
2550
/*
 * Parameters of one stream read.  unix_stream_recv_urg() takes this
 * structure and passes it on to the recv_actor callback; the other users are
 * outside this part of the file.
 */
2551 struct unix_stream_read_state {
             /* Callback that moves data out of an skb.
              * unix_stream_recv_urg() calls it as (skb, 0, 1, state) and
              * treats a negative return as failure.  NOTE(review): the two
              * int arguments look like an offset into the skb and a byte
              * count - confirm against the actors.
              */
2552         int (*recv_actor)(struct sk_buff *, int, int,
2553                           struct unix_stream_read_state *);
             /* Socket being read from. */
2554         struct socket *socket;
             /* Message header of the read; MSG_OOB is set in its msg_flags
              * when an out-of-band byte is delivered.
              */
2555         struct msghdr *msg;
             /* NOTE(review): presumably the destination pipe for splice
              * reads - not used in the code visible here.
              */
2556         struct pipe_inode_info *pipe;
             /* NOTE(review): presumably the number of bytes requested - not
              * used in the code visible here.
              */
2557         size_t size;
             /* Receive flags; tested for MSG_PEEK. */
2558         int flags;
             /* NOTE(review): presumably the flags of a splice read - not used
              * in the code visible here.
              */
2559         unsigned int splice_flags;
2560 };
2561
2562 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * unix_stream_recv_urg - deliver the pending out-of-band byte to the reader.
 *
 * Fails with -EINVAL when SOCK_URGINLINE is set or no OOB skb is pending.
 * Otherwise one byte of the OOB skb is passed to state->recv_actor.  Unless
 * MSG_PEEK is set, the sock's oob_skb pointer is cleared and the byte is
 * marked as consumed.
 *
 * Returns 1 with MSG_OOB set in the message flags on success, or -EFAULT
 * when the actor reports an error.
 */
2563 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2564 {
2565         struct socket *sock = state->socket;
2566         struct sock *sk = sock->sk;
2567         struct unix_sock *u = unix_sk(sk);
2568         int chunk = 1;
2569         struct sk_buff *oob_skb;
2570
2571         mutex_lock(&u->iolock);
2572         unix_state_lock(sk);
2573
2574         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2575                 unix_state_unlock(sk);
2576                 mutex_unlock(&u->iolock);
2577                 return -EINVAL;
2578         }
2579
2580         oob_skb = u->oob_skb;
2581
             /* Either way this function ends up owning exactly one reference,
              * dropped by consume_skb() below: a consuming read takes over the
              * one held by the oob_skb pointer, a peek takes an extra one.
              */
2582         if (!(state->flags & MSG_PEEK))
2583                 WRITE_ONCE(u->oob_skb, NULL);
2584         else
2585                 skb_get(oob_skb);
2586         unix_state_unlock(sk);
2587
2588         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2589
             /* The byte counts as consumed on a non-peek read even when the
              * actor failed.
              */
2590         if (!(state->flags & MSG_PEEK))
2591                 UNIXCB(oob_skb).consumed += 1;
2592
2593         consume_skb(oob_skb);
2594
2595         mutex_unlock(&u->iolock);
2596
2597         if (chunk < 0)
2598                 return -EFAULT;
2599
2600         state->msg->msg_flags |= MSG_OOB;
2601         return 1;
2602 }
2603
2604 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2605                                   int flags, int copied)
2606 {
2607         struct unix_sock *u = unix_sk(sk);
2608
2609         if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2610                 skb_unlink(skb, &sk->sk_receive_queue);
2611                 consume_skb(skb);
2612                 skb = NULL;
2613         } else {
2614                 if (skb == u->oob_skb) {
2615                         if (copied) {
2616                                 skb = NULL;
2617                         } else if (sock_flag(sk, SOCK_URGINLINE)) {
2618                                 if (!(flags & MSG_PEEK)) {
2619                                         WRITE_ONCE(u->oob_skb, NULL);
2620                                         consume_skb(skb);
2621                                 }
2622                         } else if (!(flags & MSG_PEEK)) {
2623                                 skb_unlink(skb, &sk->sk_receive_queue);
2624                                 consume_skb(skb);
2625                                 skb = skb_peek(&sk->sk_receive_queue);
2626                         }
2627                 }
2628         }
2629         return skb;
2630 }
2631 #endif
2632
2633 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2634 {
2635         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2636                 return -ENOTCONN;
2637
2638         return unix_read_skb(sk, recv_actor);
2639 }
2640
2641 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2642                                     bool freezable)
2643 {
2644         struct scm_cookie scm;
2645         struct socket *sock = state->socket;
2646         struct sock *sk = sock->sk;
2647         struct unix_sock *u = unix_sk(sk);
2648         int copied = 0;
2649         int flags = state->flags;
2650         int noblock = flags & MSG_DONTWAIT;
2651         bool check_creds = false;
2652         int target;
2653         int err = 0;
2654         long timeo;
2655         int skip;
2656         size_t size = state->size;
2657         unsigned int last_len;
2658
2659         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2660                 err = -EINVAL;
2661                 goto out;
2662         }
2663
2664         if (unlikely(flags & MSG_OOB)) {
2665                 err = -EOPNOTSUPP;
2666 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2667                 err = unix_stream_recv_urg(state);
2668 #endif
2669                 goto out;
2670         }
2671
2672         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2673         timeo = sock_rcvtimeo(sk, noblock);
2674
2675         memset(&scm, 0, sizeof(scm));
2676
2677         /* Lock the socket to prevent queue disordering
2678          * while sleeps in memcpy_tomsg
2679          */
2680         mutex_lock(&u->iolock);
2681
2682         skip = max(sk_peek_offset(sk, flags), 0);
2683
2684         do {
2685                 int chunk;
2686                 bool drop_skb;
2687                 struct sk_buff *skb, *last;
2688
2689 redo:
2690                 unix_state_lock(sk);
2691                 if (sock_flag(sk, SOCK_DEAD)) {
2692                         err = -ECONNRESET;
2693                         goto unlock;
2694                 }
2695                 last = skb = skb_peek(&sk->sk_receive_queue);
2696                 last_len = last ? last->len : 0;
2697
2698 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2699                 if (skb) {
2700                         skb = manage_oob(skb, sk, flags, copied);
2701                         if (!skb) {
2702                                 unix_state_unlock(sk);
2703                                 if (copied)
2704                                         break;
2705                                 goto redo;
2706                         }
2707                 }
2708 #endif
2709 again:
2710                 if (skb == NULL) {
2711                         if (copied >= target)
2712                                 goto unlock;
2713
2714                         /*
2715                          *      POSIX 1003.1g mandates this order.
2716                          */
2717
2718                         err = sock_error(sk);
2719                         if (err)
2720                                 goto unlock;
2721                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2722                                 goto unlock;
2723
2724                         unix_state_unlock(sk);
2725                         if (!timeo) {
2726                                 err = -EAGAIN;
2727                                 break;
2728                         }
2729
2730                         mutex_unlock(&u->iolock);
2731
2732                         timeo = unix_stream_data_wait(sk, timeo, last,
2733                                                       last_len, freezable);
2734
2735                         if (signal_pending(current)) {
2736                                 err = sock_intr_errno(timeo);
2737                                 scm_destroy(&scm);
2738                                 goto out;
2739                         }
2740
2741                         mutex_lock(&u->iolock);
2742                         goto redo;
2743 unlock:
2744                         unix_state_unlock(sk);
2745                         break;
2746                 }
2747
2748                 while (skip >= unix_skb_len(skb)) {
2749                         skip -= unix_skb_len(skb);
2750                         last = skb;
2751                         last_len = skb->len;
2752                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2753                         if (!skb)
2754                                 goto again;
2755                 }
2756
2757                 unix_state_unlock(sk);
2758
2759                 if (check_creds) {
2760                         /* Never glue messages from different writers */
2761                         if (!unix_skb_scm_eq(skb, &scm))
2762                                 break;
2763                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2764                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2765                         /* Copy credentials */
2766                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2767                         unix_set_secdata(&scm, skb);
2768                         check_creds = true;
2769                 }
2770
2771                 /* Copy address just once */
2772                 if (state->msg && state->msg->msg_name) {
2773                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2774                                          state->msg->msg_name);
2775                         unix_copy_addr(state->msg, skb->sk);
2776
2777                         BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2778                                                               state->msg->msg_name,
2779                                                               &state->msg->msg_namelen);
2780
2781                         sunaddr = NULL;
2782                 }
2783
2784                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2785                 skb_get(skb);
2786                 chunk = state->recv_actor(skb, skip, chunk, state);
2787                 drop_skb = !unix_skb_len(skb);
2788                 /* skb is only safe to use if !drop_skb */
2789                 consume_skb(skb);
2790                 if (chunk < 0) {
2791                         if (copied == 0)
2792                                 copied = -EFAULT;
2793                         break;
2794                 }
2795                 copied += chunk;
2796                 size -= chunk;
2797
2798                 if (drop_skb) {
2799                         /* the skb was touched by a concurrent reader;
2800                          * we should not expect anything from this skb
2801                          * anymore and assume it invalid - we can be
2802                          * sure it was dropped from the socket queue
2803                          *
2804                          * let's report a short read
2805                          */
2806                         err = 0;
2807                         break;
2808                 }
2809
2810                 /* Mark read part of skb as used */
2811                 if (!(flags & MSG_PEEK)) {
2812                         UNIXCB(skb).consumed += chunk;
2813
2814                         sk_peek_offset_bwd(sk, chunk);
2815
2816                         if (UNIXCB(skb).fp) {
2817                                 scm_stat_del(sk, skb);
2818                                 unix_detach_fds(&scm, skb);
2819                         }
2820
2821                         if (unix_skb_len(skb))
2822                                 break;
2823
2824                         skb_unlink(skb, &sk->sk_receive_queue);
2825                         consume_skb(skb);
2826
2827                         if (scm.fp)
2828                                 break;
2829                 } else {
2830                         /* It is questionable, see note in unix_dgram_recvmsg.
2831                          */
2832                         if (UNIXCB(skb).fp)
2833                                 unix_peek_fds(&scm, skb);
2834
2835                         sk_peek_offset_fwd(sk, chunk);
2836
2837                         if (UNIXCB(skb).fp)
2838                                 break;
2839
2840                         skip = 0;
2841                         last = skb;
2842                         last_len = skb->len;
2843                         unix_state_lock(sk);
2844                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2845                         if (skb)
2846                                 goto again;
2847                         unix_state_unlock(sk);
2848                         break;
2849                 }
2850         } while (size);
2851
2852         mutex_unlock(&u->iolock);
2853         if (state->msg)
2854                 scm_recv_unix(sock, state->msg, &scm, flags);
2855         else
2856                 scm_destroy(&scm);
2857 out:
2858         return copied ? : err;
2859 }
2860
2861 static int unix_stream_read_actor(struct sk_buff *skb,
2862                                   int skip, int chunk,
2863                                   struct unix_stream_read_state *state)
2864 {
2865         int ret;
2866
2867         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2868                                     state->msg, chunk);
2869         return ret ?: chunk;
2870 }
2871
2872 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2873                           size_t size, int flags)
2874 {
2875         struct unix_stream_read_state state = {
2876                 .recv_actor = unix_stream_read_actor,
2877                 .socket = sk->sk_socket,
2878                 .msg = msg,
2879                 .size = size,
2880                 .flags = flags
2881         };
2882
2883         return unix_stream_read_generic(&state, true);
2884 }
2885
2886 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2887                                size_t size, int flags)
2888 {
2889         struct unix_stream_read_state state = {
2890                 .recv_actor = unix_stream_read_actor,
2891                 .socket = sock,
2892                 .msg = msg,
2893                 .size = size,
2894                 .flags = flags
2895         };
2896
2897 #ifdef CONFIG_BPF_SYSCALL
2898         struct sock *sk = sock->sk;
2899         const struct proto *prot = READ_ONCE(sk->sk_prot);
2900
2901         if (prot != &unix_stream_proto)
2902                 return prot->recvmsg(sk, msg, size, flags, NULL);
2903 #endif
2904         return unix_stream_read_generic(&state, true);
2905 }
2906
2907 static int unix_stream_splice_actor(struct sk_buff *skb,
2908                                     int skip, int chunk,
2909                                     struct unix_stream_read_state *state)
2910 {
2911         return skb_splice_bits(skb, state->socket->sk,
2912                                UNIXCB(skb).consumed + skip,
2913                                state->pipe, chunk, state->splice_flags);
2914 }
2915
2916 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2917                                        struct pipe_inode_info *pipe,
2918                                        size_t size, unsigned int flags)
2919 {
2920         struct unix_stream_read_state state = {
2921                 .recv_actor = unix_stream_splice_actor,
2922                 .socket = sock,
2923                 .pipe = pipe,
2924                 .size = size,
2925                 .splice_flags = flags,
2926         };
2927
2928         if (unlikely(*ppos))
2929                 return -ESPIPE;
2930
2931         if (sock->file->f_flags & O_NONBLOCK ||
2932             flags & SPLICE_F_NONBLOCK)
2933                 state.flags = MSG_DONTWAIT;
2934
2935         return unix_stream_read_generic(&state, false);
2936 }
2937
2938 static int unix_shutdown(struct socket *sock, int mode)
2939 {
2940         struct sock *sk = sock->sk;
2941         struct sock *other;
2942
2943         if (mode < SHUT_RD || mode > SHUT_RDWR)
2944                 return -EINVAL;
2945         /* This maps:
2946          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2947          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2948          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2949          */
2950         ++mode;
2951
2952         unix_state_lock(sk);
2953         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2954         other = unix_peer(sk);
2955         if (other)
2956                 sock_hold(other);
2957         unix_state_unlock(sk);
2958         sk->sk_state_change(sk);
2959
2960         if (other &&
2961                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2962
2963                 int peer_mode = 0;
2964                 const struct proto *prot = READ_ONCE(other->sk_prot);
2965
2966                 if (prot->unhash)
2967                         prot->unhash(other);
2968                 if (mode&RCV_SHUTDOWN)
2969                         peer_mode |= SEND_SHUTDOWN;
2970                 if (mode&SEND_SHUTDOWN)
2971                         peer_mode |= RCV_SHUTDOWN;
2972                 unix_state_lock(other);
2973                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2974                 unix_state_unlock(other);
2975                 other->sk_state_change(other);
2976                 if (peer_mode == SHUTDOWN_MASK)
2977                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2978                 else if (peer_mode & RCV_SHUTDOWN)
2979                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2980         }
2981         if (other)
2982                 sock_put(other);
2983
2984         return 0;
2985 }
2986
2987 long unix_inq_len(struct sock *sk)
2988 {
2989         struct sk_buff *skb;
2990         long amount = 0;
2991
2992         if (sk->sk_state == TCP_LISTEN)
2993                 return -EINVAL;
2994
2995         spin_lock(&sk->sk_receive_queue.lock);
2996         if (sk->sk_type == SOCK_STREAM ||
2997             sk->sk_type == SOCK_SEQPACKET) {
2998                 skb_queue_walk(&sk->sk_receive_queue, skb)
2999                         amount += unix_skb_len(skb);
3000         } else {
3001                 skb = skb_peek(&sk->sk_receive_queue);
3002                 if (skb)
3003                         amount = skb->len;
3004         }
3005         spin_unlock(&sk->sk_receive_queue.lock);
3006
3007         return amount;
3008 }
3009 EXPORT_SYMBOL_GPL(unix_inq_len);
3010
/* Report the value of sk_wmem_alloc_get() for @sk. */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3016
3017 static int unix_open_file(struct sock *sk)
3018 {
3019         struct path path;
3020         struct file *f;
3021         int fd;
3022
3023         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3024                 return -EPERM;
3025
3026         if (!smp_load_acquire(&unix_sk(sk)->addr))
3027                 return -ENOENT;
3028
3029         path = unix_sk(sk)->path;
3030         if (!path.dentry)
3031                 return -ENOENT;
3032
3033         path_get(&path);
3034
3035         fd = get_unused_fd_flags(O_CLOEXEC);
3036         if (fd < 0)
3037                 goto out;
3038
3039         f = dentry_open(&path, O_PATH, current_cred());
3040         if (IS_ERR(f)) {
3041                 put_unused_fd(fd);
3042                 fd = PTR_ERR(f);
3043                 goto out;
3044         }
3045
3046         fd_install(fd, f);
3047 out:
3048         path_put(&path);
3049
3050         return fd;
3051 }
3052
3053 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3054 {
3055         struct sock *sk = sock->sk;
3056         long amount = 0;
3057         int err;
3058
3059         switch (cmd) {
3060         case SIOCOUTQ:
3061                 amount = unix_outq_len(sk);
3062                 err = put_user(amount, (int __user *)arg);
3063                 break;
3064         case SIOCINQ:
3065                 amount = unix_inq_len(sk);
3066                 if (amount < 0)
3067                         err = amount;
3068                 else
3069                         err = put_user(amount, (int __user *)arg);
3070                 break;
3071         case SIOCUNIXFILE:
3072                 err = unix_open_file(sk);
3073                 break;
3074 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3075         case SIOCATMARK:
3076                 {
3077                         struct sk_buff *skb;
3078                         int answ = 0;
3079
3080                         skb = skb_peek(&sk->sk_receive_queue);
3081                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3082                                 answ = 1;
3083                         err = put_user(answ, (int __user *)arg);
3084                 }
3085                 break;
3086 #endif
3087         default:
3088                 err = -ENOIOCTLCMD;
3089                 break;
3090         }
3091         return err;
3092 }
3093
3094 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert @arg with compat_ptr() and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3099 #endif
3100
3101 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3102 {
3103         struct sock *sk = sock->sk;
3104         __poll_t mask;
3105         u8 shutdown;
3106
3107         sock_poll_wait(file, sock, wait);
3108         mask = 0;
3109         shutdown = READ_ONCE(sk->sk_shutdown);
3110
3111         /* exceptional events? */
3112         if (READ_ONCE(sk->sk_err))
3113                 mask |= EPOLLERR;
3114         if (shutdown == SHUTDOWN_MASK)
3115                 mask |= EPOLLHUP;
3116         if (shutdown & RCV_SHUTDOWN)
3117                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3118
3119         /* readable? */
3120         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3121                 mask |= EPOLLIN | EPOLLRDNORM;
3122         if (sk_is_readable(sk))
3123                 mask |= EPOLLIN | EPOLLRDNORM;
3124 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3125         if (READ_ONCE(unix_sk(sk)->oob_skb))
3126                 mask |= EPOLLPRI;
3127 #endif
3128
3129         /* Connection-based need to check for termination and startup */
3130         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3131             sk->sk_state == TCP_CLOSE)
3132                 mask |= EPOLLHUP;
3133
3134         /*
3135          * we set writable also when the other side has shut down the
3136          * connection. This prevents stuck sockets.
3137          */
3138         if (unix_writable(sk))
3139                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3140
3141         return mask;
3142 }
3143
3144 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3145                                     poll_table *wait)
3146 {
3147         struct sock *sk = sock->sk, *other;
3148         unsigned int writable;
3149         __poll_t mask;
3150         u8 shutdown;
3151
3152         sock_poll_wait(file, sock, wait);
3153         mask = 0;
3154         shutdown = READ_ONCE(sk->sk_shutdown);
3155
3156         /* exceptional events? */
3157         if (READ_ONCE(sk->sk_err) ||
3158             !skb_queue_empty_lockless(&sk->sk_error_queue))
3159                 mask |= EPOLLERR |
3160                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3161
3162         if (shutdown & RCV_SHUTDOWN)
3163                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3164         if (shutdown == SHUTDOWN_MASK)
3165                 mask |= EPOLLHUP;
3166
3167         /* readable? */
3168         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3169                 mask |= EPOLLIN | EPOLLRDNORM;
3170         if (sk_is_readable(sk))
3171                 mask |= EPOLLIN | EPOLLRDNORM;
3172
3173         /* Connection-based need to check for termination and startup */
3174         if (sk->sk_type == SOCK_SEQPACKET) {
3175                 if (sk->sk_state == TCP_CLOSE)
3176                         mask |= EPOLLHUP;
3177                 /* connection hasn't started yet? */
3178                 if (sk->sk_state == TCP_SYN_SENT)
3179                         return mask;
3180         }
3181
3182         /* No write status requested, avoid expensive OUT tests. */
3183         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3184                 return mask;
3185
3186         writable = unix_writable(sk);
3187         if (writable) {
3188                 unix_state_lock(sk);
3189
3190                 other = unix_peer(sk);
3191                 if (other && unix_peer(other) != sk &&
3192                     unix_recvq_full_lockless(other) &&
3193                     unix_dgram_peer_wake_me(sk, other))
3194                         writable = 0;
3195
3196                 unix_state_unlock(sk);
3197         }
3198
3199         if (writable)
3200                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3201         else
3202                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3203
3204         return mask;
3205 }
3206
3207 #ifdef CONFIG_PROC_FS
3208
/*
 * A seq_file position packs two values into one word: the hash bucket in
 * the high bits and an offset within that bucket in the low BUCKET_SPACE
 * bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket part of position x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Extract the in-bucket offset part of position x. */
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
/* Build a position from bucket b and offset o. */
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
3214
3215 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3216 {
3217         unsigned long offset = get_offset(*pos);
3218         unsigned long bucket = get_bucket(*pos);
3219         unsigned long count = 0;
3220         struct sock *sk;
3221
3222         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3223              sk; sk = sk_next(sk)) {
3224                 if (++count == offset)
3225                         break;
3226         }
3227
3228         return sk;
3229 }
3230
3231 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3232 {
3233         unsigned long bucket = get_bucket(*pos);
3234         struct net *net = seq_file_net(seq);
3235         struct sock *sk;
3236
3237         while (bucket < UNIX_HASH_SIZE) {
3238                 spin_lock(&net->unx.table.locks[bucket]);
3239
3240                 sk = unix_from_bucket(seq, pos);
3241                 if (sk)
3242                         return sk;
3243
3244                 spin_unlock(&net->unx.table.locks[bucket]);
3245
3246                 *pos = set_bucket_offset(++bucket, 1);
3247         }
3248
3249         return NULL;
3250 }
3251
3252 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3253                                   loff_t *pos)
3254 {
3255         unsigned long bucket = get_bucket(*pos);
3256
3257         sk = sk_next(sk);
3258         if (sk)
3259                 return sk;
3260
3261
3262         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3263
3264         *pos = set_bucket_offset(++bucket, 1);
3265
3266         return unix_get_first(seq, pos);
3267 }
3268
3269 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3270 {
3271         if (!*pos)
3272                 return SEQ_START_TOKEN;
3273
3274         return unix_get_first(seq, pos);
3275 }
3276
3277 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3278 {
3279         ++*pos;
3280
3281         if (v == SEQ_START_TOKEN)
3282                 return unix_get_first(seq, pos);
3283
3284         return unix_get_next(seq, v, pos);
3285 }
3286
3287 static void unix_seq_stop(struct seq_file *seq, void *v)
3288 {
3289         struct sock *sk = v;
3290
3291         if (sk)
3292                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3293 }
3294
3295 static int unix_seq_show(struct seq_file *seq, void *v)
3296 {
3297
3298         if (v == SEQ_START_TOKEN)
3299                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3300                          "Inode Path\n");
3301         else {
3302                 struct sock *s = v;
3303                 struct unix_sock *u = unix_sk(s);
3304                 unix_state_lock(s);
3305
3306                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3307                         s,
3308                         refcount_read(&s->sk_refcnt),
3309                         0,
3310                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3311                         s->sk_type,
3312                         s->sk_socket ?
3313                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3314                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3315                         sock_i_ino(s));
3316
3317                 if (u->addr) {  // under a hash table lock here
3318                         int i, len;
3319                         seq_putc(seq, ' ');
3320
3321                         i = 0;
3322                         len = u->addr->len -
3323                                 offsetof(struct sockaddr_un, sun_path);
3324                         if (u->addr->name->sun_path[0]) {
3325                                 len--;
3326                         } else {
3327                                 seq_putc(seq, '@');
3328                                 i++;
3329                         }
3330                         for ( ; i < len; i++)
3331                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3332                                          '@');
3333                 }
3334                 unix_state_unlock(s);
3335                 seq_putc(seq, '\n');
3336         }
3337
3338         return 0;
3339 }
3340
3341 static const struct seq_operations unix_seq_ops = {
3342         .start  = unix_seq_start,
3343         .next   = unix_seq_next,
3344         .stop   = unix_seq_stop,
3345         .show   = unix_seq_show,
3346 };
3347
3348 #ifdef CONFIG_BPF_SYSCALL
3349 struct bpf_unix_iter_state {
3350         struct seq_net_private p;
3351         unsigned int cur_sk;
3352         unsigned int end_sk;
3353         unsigned int max_sk;
3354         struct sock **batch;
3355         bool st_bucket_done;
3356 };
3357
3358 struct bpf_iter__unix {
3359         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3360         __bpf_md_ptr(struct unix_sock *, unix_sk);
3361         uid_t uid __aligned(8);
3362 };
3363
3364 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3365                               struct unix_sock *unix_sk, uid_t uid)
3366 {
3367         struct bpf_iter__unix ctx;
3368
3369         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3370         ctx.meta = meta;
3371         ctx.unix_sk = unix_sk;
3372         ctx.uid = uid;
3373         return bpf_iter_run_prog(prog, &ctx);
3374 }
3375
3376 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3377
3378 {
3379         struct bpf_unix_iter_state *iter = seq->private;
3380         unsigned int expected = 1;
3381         struct sock *sk;
3382
3383         sock_hold(start_sk);
3384         iter->batch[iter->end_sk++] = start_sk;
3385
3386         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3387                 if (iter->end_sk < iter->max_sk) {
3388                         sock_hold(sk);
3389                         iter->batch[iter->end_sk++] = sk;
3390                 }
3391
3392                 expected++;
3393         }
3394
3395         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3396
3397         return expected;
3398 }
3399
3400 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3401 {
3402         while (iter->cur_sk < iter->end_sk)
3403                 sock_put(iter->batch[iter->cur_sk++]);
3404 }
3405
3406 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3407                                        unsigned int new_batch_sz)
3408 {
3409         struct sock **new_batch;
3410
3411         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3412                              GFP_USER | __GFP_NOWARN);
3413         if (!new_batch)
3414                 return -ENOMEM;
3415
3416         bpf_iter_unix_put_batch(iter);
3417         kvfree(iter->batch);
3418         iter->batch = new_batch;
3419         iter->max_sk = new_batch_sz;
3420
3421         return 0;
3422 }
3423
3424 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3425                                         loff_t *pos)
3426 {
3427         struct bpf_unix_iter_state *iter = seq->private;
3428         unsigned int expected;
3429         bool resized = false;
3430         struct sock *sk;
3431
3432         if (iter->st_bucket_done)
3433                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3434
3435 again:
3436         /* Get a new batch */
3437         iter->cur_sk = 0;
3438         iter->end_sk = 0;
3439
3440         sk = unix_get_first(seq, pos);
3441         if (!sk)
3442                 return NULL; /* Done */
3443
3444         expected = bpf_iter_unix_hold_batch(seq, sk);
3445
3446         if (iter->end_sk == expected) {
3447                 iter->st_bucket_done = true;
3448                 return sk;
3449         }
3450
3451         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3452                 resized = true;
3453                 goto again;
3454         }
3455
3456         return sk;
3457 }
3458
3459 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3460 {
3461         if (!*pos)
3462                 return SEQ_START_TOKEN;
3463
3464         /* bpf iter does not support lseek, so it always
3465          * continue from where it was stop()-ped.
3466          */
3467         return bpf_iter_unix_batch(seq, pos);
3468 }
3469
3470 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3471 {
3472         struct bpf_unix_iter_state *iter = seq->private;
3473         struct sock *sk;
3474
3475         /* Whenever seq_next() is called, the iter->cur_sk is
3476          * done with seq_show(), so advance to the next sk in
3477          * the batch.
3478          */
3479         if (iter->cur_sk < iter->end_sk)
3480                 sock_put(iter->batch[iter->cur_sk++]);
3481
3482         ++*pos;
3483
3484         if (iter->cur_sk < iter->end_sk)
3485                 sk = iter->batch[iter->cur_sk];
3486         else
3487                 sk = bpf_iter_unix_batch(seq, pos);
3488
3489         return sk;
3490 }
3491
3492 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3493 {
3494         struct bpf_iter_meta meta;
3495         struct bpf_prog *prog;
3496         struct sock *sk = v;
3497         uid_t uid;
3498         bool slow;
3499         int ret;
3500
3501         if (v == SEQ_START_TOKEN)
3502                 return 0;
3503
3504         slow = lock_sock_fast(sk);
3505
3506         if (unlikely(sk_unhashed(sk))) {
3507                 ret = SEQ_SKIP;
3508                 goto unlock;
3509         }
3510
3511         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3512         meta.seq = seq;
3513         prog = bpf_iter_get_info(&meta, false);
3514         ret = unix_prog_seq_show(prog, &meta, v, uid);
3515 unlock:
3516         unlock_sock_fast(sk, slow);
3517         return ret;
3518 }
3519
3520 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3521 {
3522         struct bpf_unix_iter_state *iter = seq->private;
3523         struct bpf_iter_meta meta;
3524         struct bpf_prog *prog;
3525
3526         if (!v) {
3527                 meta.seq = seq;
3528                 prog = bpf_iter_get_info(&meta, true);
3529                 if (prog)
3530                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3531         }
3532
3533         if (iter->cur_sk < iter->end_sk)
3534                 bpf_iter_unix_put_batch(iter);
3535 }
3536
3537 static const struct seq_operations bpf_iter_unix_seq_ops = {
3538         .start  = bpf_iter_unix_seq_start,
3539         .next   = bpf_iter_unix_seq_next,
3540         .stop   = bpf_iter_unix_seq_stop,
3541         .show   = bpf_iter_unix_seq_show,
3542 };
3543 #endif
3544 #endif
3545
3546 static const struct net_proto_family unix_family_ops = {
3547         .family = PF_UNIX,
3548         .create = unix_create,
3549         .owner  = THIS_MODULE,
3550 };
3551
3552
3553 static int __net_init unix_net_init(struct net *net)
3554 {
3555         int i;
3556
3557         net->unx.sysctl_max_dgram_qlen = 10;
3558         if (unix_sysctl_register(net))
3559                 goto out;
3560
3561 #ifdef CONFIG_PROC_FS
3562         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3563                              sizeof(struct seq_net_private)))
3564                 goto err_sysctl;
3565 #endif
3566
3567         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3568                                               sizeof(spinlock_t), GFP_KERNEL);
3569         if (!net->unx.table.locks)
3570                 goto err_proc;
3571
3572         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3573                                                 sizeof(struct hlist_head),
3574                                                 GFP_KERNEL);
3575         if (!net->unx.table.buckets)
3576                 goto free_locks;
3577
3578         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3579                 spin_lock_init(&net->unx.table.locks[i]);
3580                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3581         }
3582
3583         return 0;
3584
3585 free_locks:
3586         kvfree(net->unx.table.locks);
3587 err_proc:
3588 #ifdef CONFIG_PROC_FS
3589         remove_proc_entry("unix", net->proc_net);
3590 err_sysctl:
3591 #endif
3592         unix_sysctl_unregister(net);
3593 out:
3594         return -ENOMEM;
3595 }
3596
3597 static void __net_exit unix_net_exit(struct net *net)
3598 {
3599         kvfree(net->unx.table.buckets);
3600         kvfree(net->unx.table.locks);
3601         unix_sysctl_unregister(net);
3602         remove_proc_entry("unix", net->proc_net);
3603 }
3604
3605 static struct pernet_operations unix_net_ops = {
3606         .init = unix_net_init,
3607         .exit = unix_net_exit,
3608 };
3609
3610 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3611 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3612                      struct unix_sock *unix_sk, uid_t uid)
3613
3614 #define INIT_BATCH_SZ 16
3615
3616 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3617 {
3618         struct bpf_unix_iter_state *iter = priv_data;
3619         int err;
3620
3621         err = bpf_iter_init_seq_net(priv_data, aux);
3622         if (err)
3623                 return err;
3624
3625         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3626         if (err) {
3627                 bpf_iter_fini_seq_net(priv_data);
3628                 return err;
3629         }
3630
3631         return 0;
3632 }
3633
3634 static void bpf_iter_fini_unix(void *priv_data)
3635 {
3636         struct bpf_unix_iter_state *iter = priv_data;
3637
3638         bpf_iter_fini_seq_net(priv_data);
3639         kvfree(iter->batch);
3640 }
3641
3642 static const struct bpf_iter_seq_info unix_seq_info = {
3643         .seq_ops                = &bpf_iter_unix_seq_ops,
3644         .init_seq_private       = bpf_iter_init_unix,
3645         .fini_seq_private       = bpf_iter_fini_unix,
3646         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3647 };
3648
3649 static const struct bpf_func_proto *
3650 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3651                              const struct bpf_prog *prog)
3652 {
3653         switch (func_id) {
3654         case BPF_FUNC_setsockopt:
3655                 return &bpf_sk_setsockopt_proto;
3656         case BPF_FUNC_getsockopt:
3657                 return &bpf_sk_getsockopt_proto;
3658         default:
3659                 return NULL;
3660         }
3661 }
3662
3663 static struct bpf_iter_reg unix_reg_info = {
3664         .target                 = "unix",
3665         .ctx_arg_info_size      = 1,
3666         .ctx_arg_info           = {
3667                 { offsetof(struct bpf_iter__unix, unix_sk),
3668                   PTR_TO_BTF_ID_OR_NULL },
3669         },
3670         .get_func_proto         = bpf_iter_unix_get_func_proto,
3671         .seq_info               = &unix_seq_info,
3672 };
3673
3674 static void __init bpf_iter_register(void)
3675 {
3676         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3677         if (bpf_iter_reg_target(&unix_reg_info))
3678                 pr_warn("Warning: could not register bpf iterator unix\n");
3679 }
3680 #endif
3681
3682 static int __init af_unix_init(void)
3683 {
3684         int i, rc = -1;
3685
3686         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3687
3688         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3689                 spin_lock_init(&bsd_socket_locks[i]);
3690                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3691         }
3692
3693         rc = proto_register(&unix_dgram_proto, 1);
3694         if (rc != 0) {
3695                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3696                 goto out;
3697         }
3698
3699         rc = proto_register(&unix_stream_proto, 1);
3700         if (rc != 0) {
3701                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3702                 proto_unregister(&unix_dgram_proto);
3703                 goto out;
3704         }
3705
3706         sock_register(&unix_family_ops);
3707         register_pernet_subsys(&unix_net_ops);
3708         unix_bpf_build_proto();
3709
3710 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3711         bpf_iter_register();
3712 #endif
3713
3714 out:
3715         return rc;
3716 }
3717
3718 /* Later than subsys_initcall() because we depend on stuff initialised there */
3719 fs_initcall(af_unix_init);