[linux.git] / net / unix / af_unix.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:        Implementation of BSD Unix domain sockets.
4  *
5  * Authors:     Alan Cox, <[email protected]>
6  *
7  * Fixes:
8  *              Linus Torvalds  :       Assorted bug cures.
9  *              Niibe Yutaka    :       async I/O support.
10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
11  *              Alan Cox        :       Limit size of allocated blocks.
12  *              Alan Cox        :       Fixed the stupid socketpair bug.
13  *              Alan Cox        :       BSD compatibility fine tuning.
14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
15  *              Alan Cox        :       Sorted out a proper draft version of
16  *                                      file descriptor passing hacked up from
17  *                                      Mike Shaver's work.
18  *              Marty Leisner   :       Fixes to fd passing
19  *              Nick Nevin      :       recvmsg bugfix.
20  *              Alan Cox        :       Started proper garbage collector
21  *              Heiko Eißfeldt  :       Missing verify_area check
22  *              Alan Cox        :       Started POSIXisms
23  *              Andreas Schwab  :       Replace inode by dentry for proper
24  *                                      reference counting
25  *              Kirk Petersen   :       Made this a module
26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
27  *                                      Lots of bug fixes.
28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
29  *                                      by the above two patches.
30  *           Andrea Arcangeli   :       If possible we block in connect(2)
31  *                                      if the max backlog of the listen socket
32  *                                      has been reached. This won't break
33  *                                      old apps and it will avoid a huge number
34  *                                      of socks being hashed (this is for unix_gc()
35  *                                      performance reasons).
36  *                                      Security fix that limits the max
37  *                                      number of socks to 2*max_files and
38  *                                      the number of skbs queueable in the
39  *                                      dgram receiver.
40  *              Artur Skawina   :       Hash function optimizations
41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
42  *            Malcolm Beattie   :       Set peercred for socketpair
43  *           Michal Ostrowski   :       Module initialization cleanup.
44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
45  *                                      the core infrastructure is doing that
46  *                                      for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *      [TO FIX]
51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
52  *              other the moment one end closes.
53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *      [NOT TO FIX]
56  *      accept() returns a path name even if the connecting socket has closed
57  *              in the meantime (BSD loses the path and gives up).
58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *      BSD af_unix apparently has connect forgetting to block properly.
62  *              (need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *      Bug fixes and improvements.
66  *              - client shutdown killed server socket.
67  *              - removed all useless cli/sti pairs.
68  *
69  *      Semantic changes/extensions.
70  *              - generic control message passing.
71  *              - SCM_CREDENTIALS control message.
72  *              - "Abstract" (not FS based) socket bindings.
73  *                Abstract names are sequences of bytes (not zero-terminated)
74  *                that start with a 0 byte, so that this name space does not intersect
75  *                with BSD names.
76  */
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124
125 /* SMP locking strategy:
126  *    each hash table bucket is protected by its own spinlock.
127  *    each socket's state is protected by a separate spinlock.
128  */
129
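/* Unbound sockets and pathname (BSD) sockets hash into [0, UNIX_HASH_MOD];
 * abstract sockets hash above UNIX_HASH_MOD (see unix_abstract_hash()).
 * Pathname sockets are additionally kept in bsd_socket_buckets, keyed by
 * inode, so they can be looked up by the inode they are bound to.
 */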
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132         unsigned long hash = (unsigned long)sk;
133
134         hash ^= hash >> 16;
135         hash ^= hash >> 8;
136         hash ^= sk->sk_type;
137
138         return hash & UNIX_HASH_MOD;
139 }
140
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143         return i->i_ino & UNIX_HASH_MOD;
144 }
145
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147                                        int addr_len, int type)
148 {
149         __wsum csum = csum_partial(sunaddr, addr_len, 0);
150         unsigned int hash;
151
152         hash = (__force unsigned int)csum_fold(csum);
153         hash ^= hash >> 8;
154         hash ^= type;
155
156         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
158
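/* Lock two hash buckets in ascending index order so that concurrent
 * double-locks cannot deadlock against each other.
 */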
159 static void unix_table_double_lock(struct net *net,
160                                    unsigned int hash1, unsigned int hash2)
161 {
162         if (hash1 == hash2) {
163                 spin_lock(&net->unx.table.locks[hash1]);
164                 return;
165         }
166
167         if (hash1 > hash2)
168                 swap(hash1, hash2);
169
170         spin_lock(&net->unx.table.locks[hash1]);
171         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173
174 static void unix_table_double_unlock(struct net *net,
175                                      unsigned int hash1, unsigned int hash2)
176 {
177         if (hash1 == hash2) {
178                 spin_unlock(&net->unx.table.locks[hash1]);
179                 return;
180         }
181
182         spin_unlock(&net->unx.table.locks[hash1]);
183         spin_unlock(&net->unx.table.locks[hash2]);
184 }
185
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189         UNIXCB(skb).secid = scm->secid;
190 }
191
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194         scm->secid = UNIXCB(skb).secid;
195 }
196
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199         return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210         return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213
214 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
215 {
216         return unix_peer(osk) == sk;
217 }
218
219 static inline int unix_may_send(struct sock *sk, struct sock *osk)
220 {
221         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
222 }
223
224 static inline int unix_recvq_full_lockless(const struct sock *sk)
225 {
226         return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
227 }
228
229 struct sock *unix_peer_get(struct sock *s)
230 {
231         struct sock *peer;
232
233         unix_state_lock(s);
234         peer = unix_peer(s);
235         if (peer)
236                 sock_hold(peer);
237         unix_state_unlock(s);
238         return peer;
239 }
240 EXPORT_SYMBOL_GPL(unix_peer_get);
241
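/* Allocate a refcounted unix_address holding a copy of an already
 * validated sockaddr of addr_len bytes; returns NULL on allocation failure.
 */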
242 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
243                                              int addr_len)
244 {
245         struct unix_address *addr;
246
247         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
248         if (!addr)
249                 return NULL;
250
251         refcount_set(&addr->refcnt, 1);
252         addr->len = addr_len;
253         memcpy(addr->name, sunaddr, addr_len);
254
255         return addr;
256 }
257
258 static inline void unix_release_addr(struct unix_address *addr)
259 {
260         if (refcount_dec_and_test(&addr->refcnt))
261                 kfree(addr);
262 }
263
264 /*
265  *      Check unix socket name:
266  *              - it should not be zero length.
267  *              - if it does not start with a zero byte, it should be NUL-terminated (FS object)
268  *              - if it starts with a zero byte, it is an abstract name.
269  */
270
271 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
272 {
273         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
274             addr_len > sizeof(*sunaddr))
275                 return -EINVAL;
276
277         if (sunaddr->sun_family != AF_UNIX)
278                 return -EINVAL;
279
280         return 0;
281 }
282
283 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
284 {
285         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
286         short offset = offsetof(struct sockaddr_storage, __data);
287
288         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
289
290         /* This may look like an off by one error but it is a bit more
291          * subtle.  108 is the longest valid AF_UNIX path for a binding.
292          * sun_path[108] doesn't as such exist.  However in kernel space
293          * we are guaranteed that it is a valid memory location in our
294          * kernel address buffer because syscall functions always pass
295          * a pointer of struct sockaddr_storage which has a bigger buffer
296          * than 108.  Also, we must terminate sun_path for strlen() in
297          * getname_kernel().
298          */
299         addr->__data[addr_len - offset] = 0;
300
301         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
302          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
303          * know the actual buffer.
304          */
305         return strlen(addr->__data) + offset + 1;
306 }
307
308 static void __unix_remove_socket(struct sock *sk)
309 {
310         sk_del_node_init(sk);
311 }
312
313 static void __unix_insert_socket(struct net *net, struct sock *sk)
314 {
315         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
316         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
317 }
318
319 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
320                                  struct unix_address *addr, unsigned int hash)
321 {
322         __unix_remove_socket(sk);
323         smp_store_release(&unix_sk(sk)->addr, addr);
324
325         sk->sk_hash = hash;
326         __unix_insert_socket(net, sk);
327 }
328
329 static void unix_remove_socket(struct net *net, struct sock *sk)
330 {
331         spin_lock(&net->unx.table.locks[sk->sk_hash]);
332         __unix_remove_socket(sk);
333         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
334 }
335
336 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
337 {
338         spin_lock(&net->unx.table.locks[sk->sk_hash]);
339         __unix_insert_socket(net, sk);
340         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
341 }
342
343 static void unix_insert_bsd_socket(struct sock *sk)
344 {
345         spin_lock(&bsd_socket_locks[sk->sk_hash]);
346         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
347         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
348 }
349
350 static void unix_remove_bsd_socket(struct sock *sk)
351 {
352         if (!hlist_unhashed(&sk->sk_bind_node)) {
353                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
354                 __sk_del_bind_node(sk);
355                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
356
357                 sk_node_init(&sk->sk_bind_node);
358         }
359 }
360
361 static struct sock *__unix_find_socket_byname(struct net *net,
362                                               struct sockaddr_un *sunname,
363                                               int len, unsigned int hash)
364 {
365         struct sock *s;
366
367         sk_for_each(s, &net->unx.table.buckets[hash]) {
368                 struct unix_sock *u = unix_sk(s);
369
370                 if (u->addr->len == len &&
371                     !memcmp(u->addr->name, sunname, len))
372                         return s;
373         }
374         return NULL;
375 }
376
377 static inline struct sock *unix_find_socket_byname(struct net *net,
378                                                    struct sockaddr_un *sunname,
379                                                    int len, unsigned int hash)
380 {
381         struct sock *s;
382
383         spin_lock(&net->unx.table.locks[hash]);
384         s = __unix_find_socket_byname(net, sunname, len, hash);
385         if (s)
386                 sock_hold(s);
387         spin_unlock(&net->unx.table.locks[hash]);
388         return s;
389 }
390
391 static struct sock *unix_find_socket_byinode(struct inode *i)
392 {
393         unsigned int hash = unix_bsd_hash(i);
394         struct sock *s;
395
396         spin_lock(&bsd_socket_locks[hash]);
397         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
398                 struct dentry *dentry = unix_sk(s)->path.dentry;
399
400                 if (dentry && d_backing_inode(dentry) == i) {
401                         sock_hold(s);
402                         spin_unlock(&bsd_socket_locks[hash]);
403                         return s;
404                 }
405         }
406         spin_unlock(&bsd_socket_locks[hash]);
407         return NULL;
408 }
409
410 /* Support code for asymmetrically connected dgram sockets
411  *
412  * If a datagram socket is connected to a socket not itself connected
413  * to the first socket (e.g., /dev/log), clients may only enqueue more
414  * messages if the present receive queue of the server socket is not
415  * "too large". This means there's a second writeability condition
416  * poll and sendmsg need to test. The dgram recv code will do a wake
417  * up on the peer_wait wait queue of a socket upon reception of a
418  * datagram which needs to be propagated to sleeping would-be writers
419  * since these might not have sent anything so far. This can't be
420  * accomplished via poll_wait because the lifetime of the server
421  * socket might be less than that of its clients if these break their
422  * association with it or if the server socket is closed while clients
423  * are still connected to it and there's no way to inform "a polling
424  * implementation" that it should let go of a certain wait queue
425  *
426  * In order to propagate a wake up, a wait_queue_entry_t of the client
427  * socket is enqueued on the peer_wait queue of the server socket
428  * whose wake function does a wake_up on the ordinary client socket
429  * wait queue. This connection is established whenever a write (or
430  * poll for write) hit the flow control condition and broken when the
431  * poll for write) hits the flow control condition and is broken when the
432  * was relayed.
433  */
434
435 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
436                                       void *key)
437 {
438         struct unix_sock *u;
439         wait_queue_head_t *u_sleep;
440
441         u = container_of(q, struct unix_sock, peer_wake);
442
443         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
444                             q);
445         u->peer_wake.private = NULL;
446
447         /* relaying can only happen while the wq still exists */
448         u_sleep = sk_sleep(&u->sk);
449         if (u_sleep)
450                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
451
452         return 0;
453 }
454
455 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
456 {
457         struct unix_sock *u, *u_other;
458         int rc;
459
460         u = unix_sk(sk);
461         u_other = unix_sk(other);
462         rc = 0;
463         spin_lock(&u_other->peer_wait.lock);
464
465         if (!u->peer_wake.private) {
466                 u->peer_wake.private = other;
467                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
468
469                 rc = 1;
470         }
471
472         spin_unlock(&u_other->peer_wait.lock);
473         return rc;
474 }
475
476 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
477                                             struct sock *other)
478 {
479         struct unix_sock *u, *u_other;
480
481         u = unix_sk(sk);
482         u_other = unix_sk(other);
483         spin_lock(&u_other->peer_wait.lock);
484
485         if (u->peer_wake.private == other) {
486                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
487                 u->peer_wake.private = NULL;
488         }
489
490         spin_unlock(&u_other->peer_wait.lock);
491 }
492
493 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
494                                                    struct sock *other)
495 {
496         unix_dgram_peer_wake_disconnect(sk, other);
497         wake_up_interruptible_poll(sk_sleep(sk),
498                                    EPOLLOUT |
499                                    EPOLLWRNORM |
500                                    EPOLLWRBAND);
501 }
502
503 /* preconditions:
504  *      - unix_peer(sk) == other
505  *      - association is stable
506  */
507 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
508 {
509         int connected;
510
511         connected = unix_dgram_peer_wake_connect(sk, other);
512
513         /* If other is SOCK_DEAD, we want to make sure we signal
514          * POLLOUT, such that a subsequent write() can get a
515          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
516          * to other and it's full, we will hang waiting for POLLOUT.
517          */
518         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
519                 return 1;
520
521         if (connected)
522                 unix_dgram_peer_wake_disconnect(sk, other);
523
524         return 0;
525 }
526
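/* A socket is writable when it is not listening and the memory charged
 * for queued writes is at most a quarter of sk_sndbuf.
 */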
527 static int unix_writable(const struct sock *sk, unsigned char state)
528 {
529         return state != TCP_LISTEN &&
530                 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
531 }
532
533 static void unix_write_space(struct sock *sk)
534 {
535         struct socket_wq *wq;
536
537         rcu_read_lock();
538         if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
539                 wq = rcu_dereference(sk->sk_wq);
540                 if (skwq_has_sleeper(wq))
541                         wake_up_interruptible_sync_poll(&wq->wait,
542                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
543                 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
544         }
545         rcu_read_unlock();
546 }
547
548 /* When a dgram socket disconnects (or changes its peer), we clear its receive
549  * queue of packets that arrived from the previous peer. First, this allows
550  * flow control based only on wmem_alloc; second, an sk connected to a peer
551  * may receive messages only from that peer. */
552 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
553 {
554         if (!skb_queue_empty(&sk->sk_receive_queue)) {
555                 skb_queue_purge(&sk->sk_receive_queue);
556                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
557
558                 /* If one link of a bidirectional dgram pipe is disconnected,
559                  * we signal an error. Messages are lost. Do not do this
560                  * when the peer was not connected to us.
561                  */
562                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
563                         WRITE_ONCE(other->sk_err, ECONNRESET);
564                         sk_error_report(other);
565                 }
566         }
567 }
568
569 static void unix_sock_destructor(struct sock *sk)
570 {
571         struct unix_sock *u = unix_sk(sk);
572
573         skb_queue_purge(&sk->sk_receive_queue);
574
575         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
576         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
577         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
578         if (!sock_flag(sk, SOCK_DEAD)) {
579                 pr_info("Attempt to release alive unix socket: %p\n", sk);
580                 return;
581         }
582
583         if (u->addr)
584                 unix_release_addr(u->addr);
585
586         atomic_long_dec(&unix_nr_socks);
587         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
588 #ifdef UNIX_REFCNT_DEBUG
589         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
590                 atomic_long_read(&unix_nr_socks));
591 #endif
592 }
593
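/* Common teardown for close() and for embryo sockets found on a dying
 * listener's queue (embrion != 0): unhash the socket, mark it dead and
 * closed, shut down and possibly signal the peer, purge the receive queue
 * (recursively releasing embryos if we were listening), drop any bound
 * filesystem path, and finally run the fd garbage collector if file
 * descriptors are still in flight.
 */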
594 static void unix_release_sock(struct sock *sk, int embrion)
595 {
596         struct unix_sock *u = unix_sk(sk);
597         struct sock *skpair;
598         struct sk_buff *skb;
599         struct path path;
600         int state;
601
602         unix_remove_socket(sock_net(sk), sk);
603         unix_remove_bsd_socket(sk);
604
605         /* Clear state */
606         unix_state_lock(sk);
607         sock_orphan(sk);
608         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
609         path         = u->path;
610         u->path.dentry = NULL;
611         u->path.mnt = NULL;
612         state = sk->sk_state;
613         WRITE_ONCE(sk->sk_state, TCP_CLOSE);
614
615         skpair = unix_peer(sk);
616         unix_peer(sk) = NULL;
617
618         unix_state_unlock(sk);
619
620 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
621         if (u->oob_skb) {
622                 kfree_skb(u->oob_skb);
623                 u->oob_skb = NULL;
624         }
625 #endif
626
627         wake_up_interruptible_all(&u->peer_wait);
628
629         if (skpair != NULL) {
630                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
631                         unix_state_lock(skpair);
632                         /* No more writes */
633                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
634                         if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
635                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
636                         unix_state_unlock(skpair);
637                         skpair->sk_state_change(skpair);
638                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
639                 }
640
641                 unix_dgram_peer_wake_disconnect(sk, skpair);
642                 sock_put(skpair); /* It may now die */
643         }
644
645         /* Try to flush out this socket. Throw out buffers at least */
646
647         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
648                 if (state == TCP_LISTEN)
649                         unix_release_sock(skb->sk, 1);
650                 /* passed fds are erased in the kfree_skb hook        */
651                 UNIXCB(skb).consumed = skb->len;
652                 kfree_skb(skb);
653         }
654
655         if (path.dentry)
656                 path_put(&path);
657
658         sock_put(sk);
659
660         /* ---- Socket is dead now and most probably destroyed ---- */
661
662         /*
663          * Fixme: BSD difference: In BSD all sockets connected to us get
664          *        ECONNRESET and we die on the spot. In Linux we behave
665          *        like files and pipes do and wait for the last
666          *        dereference.
667          *
668          * Can't we simply set sock->err?
669          *
670          *        What the above comment does talk about? --ANK(980817)
671          */
672
673         if (READ_ONCE(unix_tot_inflight))
674                 unix_gc();              /* Garbage collect fds */
675 }
676
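/* Record the current task as the socket's peer credentials, as later
 * reported by SO_PEERCRED and SO_PEERPIDFD.
 */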
677 static void init_peercred(struct sock *sk)
678 {
679         const struct cred *old_cred;
680         struct pid *old_pid;
681
682         spin_lock(&sk->sk_peer_lock);
683         old_pid = sk->sk_peer_pid;
684         old_cred = sk->sk_peer_cred;
685         sk->sk_peer_pid  = get_pid(task_tgid(current));
686         sk->sk_peer_cred = get_current_cred();
687         spin_unlock(&sk->sk_peer_lock);
688
689         put_pid(old_pid);
690         put_cred(old_cred);
691 }
692
693 static void copy_peercred(struct sock *sk, struct sock *peersk)
694 {
695         const struct cred *old_cred;
696         struct pid *old_pid;
697
698         if (sk < peersk) {
699                 spin_lock(&sk->sk_peer_lock);
700                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
701         } else {
702                 spin_lock(&peersk->sk_peer_lock);
703                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
704         }
705         old_pid = sk->sk_peer_pid;
706         old_cred = sk->sk_peer_cred;
707         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
708         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
709
710         spin_unlock(&sk->sk_peer_lock);
711         spin_unlock(&peersk->sk_peer_lock);
712
713         put_pid(old_pid);
714         put_cred(old_cred);
715 }
716
717 static int unix_listen(struct socket *sock, int backlog)
718 {
719         int err;
720         struct sock *sk = sock->sk;
721         struct unix_sock *u = unix_sk(sk);
722
723         err = -EOPNOTSUPP;
724         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
725                 goto out;       /* Only stream/seqpacket sockets accept */
726         err = -EINVAL;
727         if (!READ_ONCE(u->addr))
728                 goto out;       /* No listens on an unbound socket */
729         unix_state_lock(sk);
730         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
731                 goto out_unlock;
732         if (backlog > sk->sk_max_ack_backlog)
733                 wake_up_interruptible_all(&u->peer_wait);
734         sk->sk_max_ack_backlog  = backlog;
735         WRITE_ONCE(sk->sk_state, TCP_LISTEN);
736
737         /* set credentials so connect can copy them */
738         init_peercred(sk);
739         err = 0;
740
741 out_unlock:
742         unix_state_unlock(sk);
743 out:
744         return err;
745 }
746
747 static int unix_release(struct socket *);
748 static int unix_bind(struct socket *, struct sockaddr *, int);
749 static int unix_stream_connect(struct socket *, struct sockaddr *,
750                                int addr_len, int flags);
751 static int unix_socketpair(struct socket *, struct socket *);
752 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
753 static int unix_getname(struct socket *, struct sockaddr *, int);
754 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
755 static __poll_t unix_dgram_poll(struct file *, struct socket *,
756                                     poll_table *);
757 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
758 #ifdef CONFIG_COMPAT
759 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
760 #endif
761 static int unix_shutdown(struct socket *, int);
762 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
763 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
764 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
765                                        struct pipe_inode_info *, size_t size,
766                                        unsigned int flags);
767 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
768 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
769 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
770 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
771 static int unix_dgram_connect(struct socket *, struct sockaddr *,
772                               int, int);
773 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
774 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
775                                   int);
776
777 #ifdef CONFIG_PROC_FS
778 static int unix_count_nr_fds(struct sock *sk)
779 {
780         struct sk_buff *skb;
781         struct unix_sock *u;
782         int nr_fds = 0;
783
784         spin_lock(&sk->sk_receive_queue.lock);
785         skb = skb_peek(&sk->sk_receive_queue);
786         while (skb) {
787                 u = unix_sk(skb->sk);
788                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
789                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
790         }
791         spin_unlock(&sk->sk_receive_queue.lock);
792
793         return nr_fds;
794 }
795
796 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
797 {
798         struct sock *sk = sock->sk;
799         unsigned char s_state;
800         struct unix_sock *u;
801         int nr_fds = 0;
802
803         if (sk) {
804                 s_state = READ_ONCE(sk->sk_state);
805                 u = unix_sk(sk);
806
807                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
808                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
809                  * SOCK_DGRAM is ordinary. So, no lock is needed.
810                  */
811                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
812                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
813                 else if (s_state == TCP_LISTEN)
814                         nr_fds = unix_count_nr_fds(sk);
815
816                 seq_printf(m, "scm_fds: %u\n", nr_fds);
817         }
818 }
819 #else
820 #define unix_show_fdinfo NULL
821 #endif
822
823 static const struct proto_ops unix_stream_ops = {
824         .family =       PF_UNIX,
825         .owner =        THIS_MODULE,
826         .release =      unix_release,
827         .bind =         unix_bind,
828         .connect =      unix_stream_connect,
829         .socketpair =   unix_socketpair,
830         .accept =       unix_accept,
831         .getname =      unix_getname,
832         .poll =         unix_poll,
833         .ioctl =        unix_ioctl,
834 #ifdef CONFIG_COMPAT
835         .compat_ioctl = unix_compat_ioctl,
836 #endif
837         .listen =       unix_listen,
838         .shutdown =     unix_shutdown,
839         .sendmsg =      unix_stream_sendmsg,
840         .recvmsg =      unix_stream_recvmsg,
841         .read_skb =     unix_stream_read_skb,
842         .mmap =         sock_no_mmap,
843         .splice_read =  unix_stream_splice_read,
844         .set_peek_off = sk_set_peek_off,
845         .show_fdinfo =  unix_show_fdinfo,
846 };
847
848 static const struct proto_ops unix_dgram_ops = {
849         .family =       PF_UNIX,
850         .owner =        THIS_MODULE,
851         .release =      unix_release,
852         .bind =         unix_bind,
853         .connect =      unix_dgram_connect,
854         .socketpair =   unix_socketpair,
855         .accept =       sock_no_accept,
856         .getname =      unix_getname,
857         .poll =         unix_dgram_poll,
858         .ioctl =        unix_ioctl,
859 #ifdef CONFIG_COMPAT
860         .compat_ioctl = unix_compat_ioctl,
861 #endif
862         .listen =       sock_no_listen,
863         .shutdown =     unix_shutdown,
864         .sendmsg =      unix_dgram_sendmsg,
865         .read_skb =     unix_read_skb,
866         .recvmsg =      unix_dgram_recvmsg,
867         .mmap =         sock_no_mmap,
868         .set_peek_off = sk_set_peek_off,
869         .show_fdinfo =  unix_show_fdinfo,
870 };
871
872 static const struct proto_ops unix_seqpacket_ops = {
873         .family =       PF_UNIX,
874         .owner =        THIS_MODULE,
875         .release =      unix_release,
876         .bind =         unix_bind,
877         .connect =      unix_stream_connect,
878         .socketpair =   unix_socketpair,
879         .accept =       unix_accept,
880         .getname =      unix_getname,
881         .poll =         unix_dgram_poll,
882         .ioctl =        unix_ioctl,
883 #ifdef CONFIG_COMPAT
884         .compat_ioctl = unix_compat_ioctl,
885 #endif
886         .listen =       unix_listen,
887         .shutdown =     unix_shutdown,
888         .sendmsg =      unix_seqpacket_sendmsg,
889         .recvmsg =      unix_seqpacket_recvmsg,
890         .mmap =         sock_no_mmap,
891         .set_peek_off = sk_set_peek_off,
892         .show_fdinfo =  unix_show_fdinfo,
893 };
894
895 static void unix_close(struct sock *sk, long timeout)
896 {
897         /* Nothing to do here, unix socket does not need a ->close().
898          * This is merely for sockmap.
899          */
900 }
901
902 static void unix_unhash(struct sock *sk)
903 {
904         /* Nothing to do here, unix socket does not need a ->unhash().
905          * This is merely for sockmap.
906          */
907 }
908
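/* SO_PEERPIDFD installs a new file descriptor in the caller, so it must
 * always be handled by the kernel and never overridden by a BPF
 * getsockopt program.
 */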
909 static bool unix_bpf_bypass_getsockopt(int level, int optname)
910 {
911         if (level == SOL_SOCKET) {
912                 switch (optname) {
913                 case SO_PEERPIDFD:
914                         return true;
915                 default:
916                         return false;
917                 }
918         }
919
920         return false;
921 }
922
923 struct proto unix_dgram_proto = {
924         .name                   = "UNIX",
925         .owner                  = THIS_MODULE,
926         .obj_size               = sizeof(struct unix_sock),
927         .close                  = unix_close,
928         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
929 #ifdef CONFIG_BPF_SYSCALL
930         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
931 #endif
932 };
933
934 struct proto unix_stream_proto = {
935         .name                   = "UNIX-STREAM",
936         .owner                  = THIS_MODULE,
937         .obj_size               = sizeof(struct unix_sock),
938         .close                  = unix_close,
939         .unhash                 = unix_unhash,
940         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
941 #ifdef CONFIG_BPF_SYSCALL
942         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
943 #endif
944 };
945
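/* Allocate and initialise a new unix socket.  SOCK_STREAM uses
 * unix_stream_proto; SOCK_DGRAM and SOCK_SEQPACKET use unix_dgram_proto.
 * The total number of unix sockets is capped at twice the system-wide
 * file limit (-ENFILE).
 */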
946 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
947 {
948         struct unix_sock *u;
949         struct sock *sk;
950         int err;
951
952         atomic_long_inc(&unix_nr_socks);
953         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
954                 err = -ENFILE;
955                 goto err;
956         }
957
958         if (type == SOCK_STREAM)
959                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
960         else /* dgram and seqpacket */
961                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
962
963         if (!sk) {
964                 err = -ENOMEM;
965                 goto err;
966         }
967
968         sock_init_data(sock, sk);
969
970         sk->sk_hash             = unix_unbound_hash(sk);
971         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
972         sk->sk_write_space      = unix_write_space;
973         sk->sk_max_ack_backlog  = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
974         sk->sk_destruct         = unix_sock_destructor;
975         u = unix_sk(sk);
976         u->listener = NULL;
977         u->vertex = NULL;
978         u->path.dentry = NULL;
979         u->path.mnt = NULL;
980         spin_lock_init(&u->lock);
981         mutex_init(&u->iolock); /* single task reading lock */
982         mutex_init(&u->bindlock); /* single task binding lock */
983         init_waitqueue_head(&u->peer_wait);
984         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
985         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
986         unix_insert_unbound_socket(net, sk);
987
988         sock_prot_inuse_add(net, sk->sk_prot, 1);
989
990         return sk;
991
992 err:
993         atomic_long_dec(&unix_nr_socks);
994         return ERR_PTR(err);
995 }
996
997 static int unix_create(struct net *net, struct socket *sock, int protocol,
998                        int kern)
999 {
1000         struct sock *sk;
1001
1002         if (protocol && protocol != PF_UNIX)
1003                 return -EPROTONOSUPPORT;
1004
1005         sock->state = SS_UNCONNECTED;
1006
1007         switch (sock->type) {
1008         case SOCK_STREAM:
1009                 sock->ops = &unix_stream_ops;
1010                 break;
1011                 /*
1012                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1013                  *      nothing uses it.
1014                  */
1015         case SOCK_RAW:
1016                 sock->type = SOCK_DGRAM;
1017                 fallthrough;
1018         case SOCK_DGRAM:
1019                 sock->ops = &unix_dgram_ops;
1020                 break;
1021         case SOCK_SEQPACKET:
1022                 sock->ops = &unix_seqpacket_ops;
1023                 break;
1024         default:
1025                 return -ESOCKTNOSUPPORT;
1026         }
1027
1028         sk = unix_create1(net, sock, kern, sock->type);
1029         if (IS_ERR(sk))
1030                 return PTR_ERR(sk);
1031
1032         return 0;
1033 }
1034
1035 static int unix_release(struct socket *sock)
1036 {
1037         struct sock *sk = sock->sk;
1038
1039         if (!sk)
1040                 return 0;
1041
1042         sk->sk_prot->close(sk, 0);
1043         unix_release_sock(sk, 0);
1044         sock->sk = NULL;
1045
1046         return 0;
1047 }
1048
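/* Resolve a pathname address: look the path up in the filesystem,
 * require write permission and S_ISSOCK on the inode, then find the
 * socket bound to that inode and check that its type matches.
 */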
1049 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1050                                   int type)
1051 {
1052         struct inode *inode;
1053         struct path path;
1054         struct sock *sk;
1055         int err;
1056
1057         unix_mkname_bsd(sunaddr, addr_len);
1058         err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1059         if (err)
1060                 goto fail;
1061
1062         err = path_permission(&path, MAY_WRITE);
1063         if (err)
1064                 goto path_put;
1065
1066         err = -ECONNREFUSED;
1067         inode = d_backing_inode(path.dentry);
1068         if (!S_ISSOCK(inode->i_mode))
1069                 goto path_put;
1070
1071         sk = unix_find_socket_byinode(inode);
1072         if (!sk)
1073                 goto path_put;
1074
1075         err = -EPROTOTYPE;
1076         if (sk->sk_type == type)
1077                 touch_atime(&path);
1078         else
1079                 goto sock_put;
1080
1081         path_put(&path);
1082
1083         return sk;
1084
1085 sock_put:
1086         sock_put(sk);
1087 path_put:
1088         path_put(&path);
1089 fail:
1090         return ERR_PTR(err);
1091 }
1092
1093 static struct sock *unix_find_abstract(struct net *net,
1094                                        struct sockaddr_un *sunaddr,
1095                                        int addr_len, int type)
1096 {
1097         unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1098         struct dentry *dentry;
1099         struct sock *sk;
1100
1101         sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1102         if (!sk)
1103                 return ERR_PTR(-ECONNREFUSED);
1104
1105         dentry = unix_sk(sk)->path.dentry;
1106         if (dentry)
1107                 touch_atime(&unix_sk(sk)->path);
1108
1109         return sk;
1110 }
1111
1112 static struct sock *unix_find_other(struct net *net,
1113                                     struct sockaddr_un *sunaddr,
1114                                     int addr_len, int type)
1115 {
1116         struct sock *sk;
1117
1118         if (sunaddr->sun_path[0])
1119                 sk = unix_find_bsd(sunaddr, addr_len, type);
1120         else
1121                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1122
1123         return sk;
1124 }
1125
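/* Bind an unbound socket to an autogenerated abstract address of the
 * form "\0XXXXX" (five hex digits), retrying successive numbers until an
 * unused name is found.  Fails with -ENOSPC once the whole 2^20 name
 * space has been tried.
 */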
1126 static int unix_autobind(struct sock *sk)
1127 {
1128         struct unix_sock *u = unix_sk(sk);
1129         unsigned int new_hash, old_hash;
1130         struct net *net = sock_net(sk);
1131         struct unix_address *addr;
1132         u32 lastnum, ordernum;
1133         int err;
1134
1135         err = mutex_lock_interruptible(&u->bindlock);
1136         if (err)
1137                 return err;
1138
1139         if (u->addr)
1140                 goto out;
1141
1142         err = -ENOMEM;
1143         addr = kzalloc(sizeof(*addr) +
1144                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1145         if (!addr)
1146                 goto out;
1147
1148         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1149         addr->name->sun_family = AF_UNIX;
1150         refcount_set(&addr->refcnt, 1);
1151
1152         old_hash = sk->sk_hash;
1153         ordernum = get_random_u32();
1154         lastnum = ordernum & 0xFFFFF;
1155 retry:
1156         ordernum = (ordernum + 1) & 0xFFFFF;
1157         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1158
1159         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1160         unix_table_double_lock(net, old_hash, new_hash);
1161
1162         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1163                 unix_table_double_unlock(net, old_hash, new_hash);
1164
1165                 /* __unix_find_socket_byname() may take a long time if many names
1166                  * are already in use.
1167                  */
1168                 cond_resched();
1169
1170                 if (ordernum == lastnum) {
1171                         /* Give up if all names seem to be in use. */
1172                         err = -ENOSPC;
1173                         unix_release_addr(addr);
1174                         goto out;
1175                 }
1176
1177                 goto retry;
1178         }
1179
1180         __unix_set_addr_hash(net, sk, addr, new_hash);
1181         unix_table_double_unlock(net, old_hash, new_hash);
1182         err = 0;
1183
1184 out:    mutex_unlock(&u->bindlock);
1185         return err;
1186 }
1187
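/* Bind to a filesystem path: create the socket inode with vfs_mknod()
 * under the parent directory and rehash the socket by that inode.  An
 * existing path makes mknod fail with -EEXIST, which is reported to the
 * caller as -EADDRINUSE.
 */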
1188 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1189                          int addr_len)
1190 {
1191         umode_t mode = S_IFSOCK |
1192                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1193         struct unix_sock *u = unix_sk(sk);
1194         unsigned int new_hash, old_hash;
1195         struct net *net = sock_net(sk);
1196         struct mnt_idmap *idmap;
1197         struct unix_address *addr;
1198         struct dentry *dentry;
1199         struct path parent;
1200         int err;
1201
1202         addr_len = unix_mkname_bsd(sunaddr, addr_len);
1203         addr = unix_create_addr(sunaddr, addr_len);
1204         if (!addr)
1205                 return -ENOMEM;
1206
1207         /*
1208          * Get the parent directory, calculate the hash for last
1209          * component.
1210          */
1211         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1212         if (IS_ERR(dentry)) {
1213                 err = PTR_ERR(dentry);
1214                 goto out;
1215         }
1216
1217         /*
1218          * All right, let's create it.
1219          */
1220         idmap = mnt_idmap(parent.mnt);
1221         err = security_path_mknod(&parent, dentry, mode, 0);
1222         if (!err)
1223                 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1224         if (err)
1225                 goto out_path;
1226         err = mutex_lock_interruptible(&u->bindlock);
1227         if (err)
1228                 goto out_unlink;
1229         if (u->addr)
1230                 goto out_unlock;
1231
1232         old_hash = sk->sk_hash;
1233         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1234         unix_table_double_lock(net, old_hash, new_hash);
1235         u->path.mnt = mntget(parent.mnt);
1236         u->path.dentry = dget(dentry);
1237         __unix_set_addr_hash(net, sk, addr, new_hash);
1238         unix_table_double_unlock(net, old_hash, new_hash);
1239         unix_insert_bsd_socket(sk);
1240         mutex_unlock(&u->bindlock);
1241         done_path_create(&parent, dentry);
1242         return 0;
1243
1244 out_unlock:
1245         mutex_unlock(&u->bindlock);
1246         err = -EINVAL;
1247 out_unlink:
1248         /* failed after successful mknod?  unlink what we'd created... */
1249         vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1250 out_path:
1251         done_path_create(&parent, dentry);
1252 out:
1253         unix_release_addr(addr);
1254         return err == -EEXIST ? -EADDRINUSE : err;
1255 }
1256
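/* Bind to an abstract name: with the old and new hash buckets locked,
 * fail with -EADDRINUSE if another socket already owns the name,
 * otherwise move the socket into the new bucket.
 */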
1257 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1258                               int addr_len)
1259 {
1260         struct unix_sock *u = unix_sk(sk);
1261         unsigned int new_hash, old_hash;
1262         struct net *net = sock_net(sk);
1263         struct unix_address *addr;
1264         int err;
1265
1266         addr = unix_create_addr(sunaddr, addr_len);
1267         if (!addr)
1268                 return -ENOMEM;
1269
1270         err = mutex_lock_interruptible(&u->bindlock);
1271         if (err)
1272                 goto out;
1273
1274         if (u->addr) {
1275                 err = -EINVAL;
1276                 goto out_mutex;
1277         }
1278
1279         old_hash = sk->sk_hash;
1280         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1281         unix_table_double_lock(net, old_hash, new_hash);
1282
1283         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1284                 goto out_spin;
1285
1286         __unix_set_addr_hash(net, sk, addr, new_hash);
1287         unix_table_double_unlock(net, old_hash, new_hash);
1288         mutex_unlock(&u->bindlock);
1289         return 0;
1290
1291 out_spin:
1292         unix_table_double_unlock(net, old_hash, new_hash);
1293         err = -EADDRINUSE;
1294 out_mutex:
1295         mutex_unlock(&u->bindlock);
1296 out:
1297         unix_release_addr(addr);
1298         return err;
1299 }
1300
1301 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1302 {
1303         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1304         struct sock *sk = sock->sk;
1305         int err;
1306
1307         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1308             sunaddr->sun_family == AF_UNIX)
1309                 return unix_autobind(sk);
1310
1311         err = unix_validate_addr(sunaddr, addr_len);
1312         if (err)
1313                 return err;
1314
1315         if (sunaddr->sun_path[0])
1316                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1317         else
1318                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1319
1320         return err;
1321 }
1322
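/* Take two sockets' state locks in pointer order to avoid ABBA
 * deadlocks.  sk2 may be NULL, in which case only sk1 is locked.
 */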
1323 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1324 {
1325         if (unlikely(sk1 == sk2) || !sk2) {
1326                 unix_state_lock(sk1);
1327                 return;
1328         }
1329         if (sk1 > sk2)
1330                 swap(sk1, sk2);
1331
1332         unix_state_lock(sk1);
1333         unix_state_lock_nested(sk2, U_LOCK_SECOND);
1334 }
1335
1336 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1337 {
1338         if (unlikely(sk1 == sk2) || !sk2) {
1339                 unix_state_unlock(sk1);
1340                 return;
1341         }
1342         unix_state_unlock(sk1);
1343         unix_state_unlock(sk2);
1344 }
1345
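/* connect() for SOCK_DGRAM sockets: associate sk with the target address
 * (or dissolve an existing association when AF_UNSPEC is passed) and
 * notify a previous peer if we are reconnecting.
 */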
1346 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1347                               int alen, int flags)
1348 {
1349         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1350         struct sock *sk = sock->sk;
1351         struct sock *other;
1352         int err;
1353
1354         err = -EINVAL;
1355         if (alen < offsetofend(struct sockaddr, sa_family))
1356                 goto out;
1357
1358         if (addr->sa_family != AF_UNSPEC) {
1359                 err = unix_validate_addr(sunaddr, alen);
1360                 if (err)
1361                         goto out;
1362
1363                 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1364                 if (err)
1365                         goto out;
1366
1367                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1368                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1369                     !READ_ONCE(unix_sk(sk)->addr)) {
1370                         err = unix_autobind(sk);
1371                         if (err)
1372                                 goto out;
1373                 }
1374
1375 restart:
1376                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1377                 if (IS_ERR(other)) {
1378                         err = PTR_ERR(other);
1379                         goto out;
1380                 }
1381
1382                 unix_state_double_lock(sk, other);
1383
1384                 /* Apparently VFS overslept socket death. Retry. */
1385                 if (sock_flag(other, SOCK_DEAD)) {
1386                         unix_state_double_unlock(sk, other);
1387                         sock_put(other);
1388                         goto restart;
1389                 }
1390
1391                 err = -EPERM;
1392                 if (!unix_may_send(sk, other))
1393                         goto out_unlock;
1394
1395                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1396                 if (err)
1397                         goto out_unlock;
1398
1399                 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1400                 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1401         } else {
1402                 /*
1403                  *      1003.1g breaking connected state with AF_UNSPEC
1404                  */
1405                 other = NULL;
1406                 unix_state_double_lock(sk, other);
1407         }
1408
1409         /*
1410          * If it was connected, reconnect.
1411          */
1412         if (unix_peer(sk)) {
1413                 struct sock *old_peer = unix_peer(sk);
1414
1415                 unix_peer(sk) = other;
1416                 if (!other)
1417                         WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1418                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1419
1420                 unix_state_double_unlock(sk, other);
1421
1422                 if (other != old_peer) {
1423                         unix_dgram_disconnected(sk, old_peer);
1424
1425                         unix_state_lock(old_peer);
1426                         if (!unix_peer(old_peer))
1427                                 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1428                         unix_state_unlock(old_peer);
1429                 }
1430
1431                 sock_put(old_peer);
1432         } else {
1433                 unix_peer(sk) = other;
1434                 unix_state_double_unlock(sk, other);
1435         }
1436
1437         return 0;
1438
1439 out_unlock:
1440         unix_state_double_unlock(sk, other);
1441         sock_put(other);
1442 out:
1443         return err;
1444 }
1445
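/* Called with other's state lock held; the lock is released here.  Sleep
 * on other's peer_wait queue (unless the queue already has room or other
 * is dead or shut down) until woken or the timeout expires; returns the
 * remaining timeout.
 */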
1446 static long unix_wait_for_peer(struct sock *other, long timeo)
1447         __releases(&unix_sk(other)->lock)
1448 {
1449         struct unix_sock *u = unix_sk(other);
1450         int sched;
1451         DEFINE_WAIT(wait);
1452
1453         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1454
1455         sched = !sock_flag(other, SOCK_DEAD) &&
1456                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1457                 unix_recvq_full_lockless(other);
1458
1459         unix_state_unlock(other);
1460
1461         if (sched)
1462                 timeo = schedule_timeout(timeo);
1463
1464         finish_wait(&u->peer_wait, &wait);
1465         return timeo;
1466 }
1467
1468 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1469                                int addr_len, int flags)
1470 {
1471         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1472         struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1473         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1474         struct net *net = sock_net(sk);
1475         struct sk_buff *skb = NULL;
1476         long timeo;
1477         int err;
1478
1479         err = unix_validate_addr(sunaddr, addr_len);
1480         if (err)
1481                 goto out;
1482
1483         err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1484         if (err)
1485                 goto out;
1486
1487         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1488              test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1489             !READ_ONCE(u->addr)) {
1490                 err = unix_autobind(sk);
1491                 if (err)
1492                         goto out;
1493         }
1494
1495         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1496
1497         /* First of all, allocate resources.
1498            If we do it after the state is locked,
1499            we will have to recheck everything again in any case.
1500          */
1501
1502         /* create new sock for complete connection */
1503         newsk = unix_create1(net, NULL, 0, sock->type);
1504         if (IS_ERR(newsk)) {
1505                 err = PTR_ERR(newsk);
1506                 newsk = NULL;
1507                 goto out;
1508         }
1509
1510         err = -ENOMEM;
1511
1512         /* Allocate skb for sending to listening sock */
1513         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1514         if (skb == NULL)
1515                 goto out;
1516
1517 restart:
1518         /*  Find listening sock. */
1519         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1520         if (IS_ERR(other)) {
1521                 err = PTR_ERR(other);
1522                 other = NULL;
1523                 goto out;
1524         }
1525
1526         /* Latch state of peer */
1527         unix_state_lock(other);
1528
1529         /* Apparently VFS overslept socket death. Retry. */
1530         if (sock_flag(other, SOCK_DEAD)) {
1531                 unix_state_unlock(other);
1532                 sock_put(other);
1533                 goto restart;
1534         }
1535
1536         err = -ECONNREFUSED;
1537         if (other->sk_state != TCP_LISTEN)
1538                 goto out_unlock;
1539         if (other->sk_shutdown & RCV_SHUTDOWN)
1540                 goto out_unlock;
1541
1542         if (unix_recvq_full_lockless(other)) {
1543                 err = -EAGAIN;
1544                 if (!timeo)
1545                         goto out_unlock;
1546
1547                 timeo = unix_wait_for_peer(other, timeo);
1548
1549                 err = sock_intr_errno(timeo);
1550                 if (signal_pending(current))
1551                         goto out;
1552                 sock_put(other);
1553                 goto restart;
1554         }
1555
1556         /* Latch our state.
1557
1558            This is a tricky place. We need to grab our state lock and cannot
1559            drop the lock on the peer. That is dangerous because a deadlock is
1560            possible. The connect-to-self case and simultaneous
1561            connect attempts are eliminated by checking the socket
1562            state: other is TCP_LISTEN, and if sk were TCP_LISTEN we
1563            would have checked that before attempting to grab the lock.
1564
1565            And we still have to recheck the state after the socket is locked.
1566          */
1567         switch (READ_ONCE(sk->sk_state)) {
1568         case TCP_CLOSE:
1569                 /* This is ok... continue with connect */
1570                 break;
1571         case TCP_ESTABLISHED:
1572                 /* Socket is already connected */
1573                 err = -EISCONN;
1574                 goto out_unlock;
1575         default:
1576                 err = -EINVAL;
1577                 goto out_unlock;
1578         }
1579
1580         unix_state_lock_nested(sk, U_LOCK_SECOND);
1581
1582         if (sk->sk_state != TCP_CLOSE) {
1583                 unix_state_unlock(sk);
1584                 unix_state_unlock(other);
1585                 sock_put(other);
1586                 goto restart;
1587         }
1588
1589         err = security_unix_stream_connect(sk, other, newsk);
1590         if (err) {
1591                 unix_state_unlock(sk);
1592                 goto out_unlock;
1593         }
1594
1595         /* The way is open! Quickly set all the necessary fields... */
1596
1597         sock_hold(sk);
1598         unix_peer(newsk)        = sk;
1599         newsk->sk_state         = TCP_ESTABLISHED;
1600         newsk->sk_type          = sk->sk_type;
1601         init_peercred(newsk);
1602         newu = unix_sk(newsk);
1603         newu->listener = other;
1604         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1605         otheru = unix_sk(other);
1606
1607         /* copy address information from listening to new sock
1608          *
1609          * The contents of *(otheru->addr) and otheru->path
1610          * are seen fully set up here, since we have found
1611          * otheru in hash under its lock.  Insertion into the
1612          * hash chain we'd found it in had been done in an
1613          * earlier critical area protected by the chain's lock,
1614          * the same one where we'd set *(otheru->addr) contents,
1615          * as well as otheru->path and otheru->addr itself.
1616          *
1617          * Using smp_store_release() here to set newu->addr
1618          * is enough to make those stores, as well as stores
1619          * to newu->path visible to anyone who gets newu->addr
1620          * by smp_load_acquire().  IOW, the same guarantees
1621          * as for unix_sock instances bound in unix_bind() or
1622          * in unix_autobind().
1623          */
1624         if (otheru->path.dentry) {
1625                 path_get(&otheru->path);
1626                 newu->path = otheru->path;
1627         }
1628         refcount_inc(&otheru->addr->refcnt);
1629         smp_store_release(&newu->addr, otheru->addr);
1630
1631         /* Set credentials */
1632         copy_peercred(sk, other);
1633
1634         sock->state     = SS_CONNECTED;
1635         WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1636         sock_hold(newsk);
1637
1638         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1639         unix_peer(sk)   = newsk;
1640
1641         unix_state_unlock(sk);
1642
1643         /* queue the request skb and notify the listening sock */
1644         spin_lock(&other->sk_receive_queue.lock);
1645         __skb_queue_tail(&other->sk_receive_queue, skb);
1646         spin_unlock(&other->sk_receive_queue.lock);
1647         unix_state_unlock(other);
1648         other->sk_data_ready(other);
1649         sock_put(other);
1650         return 0;
1651
1652 out_unlock:
1653         if (other)
1654                 unix_state_unlock(other);
1655
1656 out:
1657         kfree_skb(skb);
1658         if (newsk)
1659                 unix_release_sock(newsk, 0);
1660         if (other)
1661                 sock_put(other);
1662         return err;
1663 }
1664
1665 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1666 {
1667         struct sock *ska = socka->sk, *skb = sockb->sk;
1668
1669         /* Join our sockets back to back */
1670         sock_hold(ska);
1671         sock_hold(skb);
1672         unix_peer(ska) = skb;
1673         unix_peer(skb) = ska;
1674         init_peercred(ska);
1675         init_peercred(skb);
1676
1677         ska->sk_state = TCP_ESTABLISHED;
1678         skb->sk_state = TCP_ESTABLISHED;
1679         socka->state  = SS_CONNECTED;
1680         sockb->state  = SS_CONNECTED;
1681         return 0;
1682 }
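/* For illustration (a minimal userspace sketch, not part of this file): the
 * socketpair() path serviced by unix_socketpair() above. Includes and error
 * handling are omitted for brevity.
 *
 *	int fds[2];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
 *	write(fds[0], "ping", 4);	// now readable from fds[1]
 *	close(fds[0]);
 *	close(fds[1]);
 *
 * Both ends come back already in TCP_ESTABLISHED/SS_CONNECTED, which is why
 * socketpair() needs no connect()/accept() round trip.
 */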
1683
1684 static void unix_sock_inherit_flags(const struct socket *old,
1685                                     struct socket *new)
1686 {
1687         if (test_bit(SOCK_PASSCRED, &old->flags))
1688                 set_bit(SOCK_PASSCRED, &new->flags);
1689         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1690                 set_bit(SOCK_PASSPIDFD, &new->flags);
1691         if (test_bit(SOCK_PASSSEC, &old->flags))
1692                 set_bit(SOCK_PASSSEC, &new->flags);
1693 }
1694
1695 static int unix_accept(struct socket *sock, struct socket *newsock,
1696                        struct proto_accept_arg *arg)
1697 {
1698         struct sock *sk = sock->sk;
1699         struct sk_buff *skb;
1700         struct sock *tsk;
1701
1702         arg->err = -EOPNOTSUPP;
1703         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1704                 goto out;
1705
1706         arg->err = -EINVAL;
1707         if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1708                 goto out;
1709
1710         /* If socket state is TCP_LISTEN it cannot change (for now...),
1711          * so no locks are necessary.
1712          */
1713
1714         skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1715                                 &arg->err);
1716         if (!skb) {
1717                 /* This means receive shutdown. */
1718                 if (arg->err == 0)
1719                         arg->err = -EINVAL;
1720                 goto out;
1721         }
1722
1723         tsk = skb->sk;
1724         skb_free_datagram(sk, skb);
1725         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1726
1727         /* attach accepted sock to socket */
1728         unix_state_lock(tsk);
1729         unix_update_edges(unix_sk(tsk));
1730         newsock->state = SS_CONNECTED;
1731         unix_sock_inherit_flags(sock, newsock);
1732         sock_graft(tsk, newsock);
1733         unix_state_unlock(tsk);
1734         return 0;
1735
1736 out:
1737         return arg->err;
1738 }
1739
1740
1741 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1742 {
1743         struct sock *sk = sock->sk;
1744         struct unix_address *addr;
1745         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1746         int err = 0;
1747
1748         if (peer) {
1749                 sk = unix_peer_get(sk);
1750
1751                 err = -ENOTCONN;
1752                 if (!sk)
1753                         goto out;
1754                 err = 0;
1755         } else {
1756                 sock_hold(sk);
1757         }
1758
1759         addr = smp_load_acquire(&unix_sk(sk)->addr);
1760         if (!addr) {
1761                 sunaddr->sun_family = AF_UNIX;
1762                 sunaddr->sun_path[0] = 0;
1763                 err = offsetof(struct sockaddr_un, sun_path);
1764         } else {
1765                 err = addr->len;
1766                 memcpy(sunaddr, addr->name, addr->len);
1767
1768                 if (peer)
1769                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1770                                                CGROUP_UNIX_GETPEERNAME);
1771                 else
1772                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1773                                                CGROUP_UNIX_GETSOCKNAME);
1774         }
1775         sock_put(sk);
1776 out:
1777         return err;
1778 }
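/* For illustration (userspace sketch, not part of this file): the unbound
 * branch above as seen from user space. getsockname() on a socket with no
 * address reports only sun_family, with a returned length of
 * offsetof(struct sockaddr_un, sun_path). Includes and error handling omitted.
 *
 *	struct sockaddr_un sun;
 *	socklen_t len = sizeof(sun);
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	getsockname(fd, (struct sockaddr *)&sun, &len);
 *	// len == offsetof(struct sockaddr_un, sun_path); sun_path is empty
 */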
1779
1780 /* The "user->unix_inflight" variable is protected by the garbage
1781  * collection lock, and we just read it locklessly here. If you go
1782  * over the limit, there might be a tiny race in actually noticing
1783  * it across threads. Tough.
1784  */
1785 static inline bool too_many_unix_fds(struct task_struct *p)
1786 {
1787         struct user_struct *user = current_user();
1788
1789         if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1790                 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1791         return false;
1792 }
1793
1794 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1795 {
1796         if (too_many_unix_fds(current))
1797                 return -ETOOMANYREFS;
1798
1799         UNIXCB(skb).fp = scm->fp;
1800         scm->fp = NULL;
1801
1802         if (unix_prepare_fpl(UNIXCB(skb).fp))
1803                 return -ENOMEM;
1804
1805         return 0;
1806 }
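/* For illustration (userspace sketch, not part of this file): the sender side
 * that unix_attach_fds() services - passing a descriptor as SCM_RIGHTS
 * ancillary data. sock_fd and fd_to_pass are placeholder names; includes and
 * error handling are omitted.
 *
 *	char dummy = 'x';
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	union { struct cmsghdr align; char buf[CMSG_SPACE(sizeof(int))]; } u;
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *	};
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *	sendmsg(sock_fd, &msg, 0);
 *
 * If the sending user already has too many descriptors in flight
 * (too_many_unix_fds() above), this fails with -ETOOMANYREFS.
 */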
1807
1808 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1809 {
1810         scm->fp = UNIXCB(skb).fp;
1811         UNIXCB(skb).fp = NULL;
1812
1813         unix_destroy_fpl(scm->fp);
1814 }
1815
1816 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1817 {
1818         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1819 }
1820
1821 static void unix_destruct_scm(struct sk_buff *skb)
1822 {
1823         struct scm_cookie scm;
1824
1825         memset(&scm, 0, sizeof(scm));
1826         scm.pid  = UNIXCB(skb).pid;
1827         if (UNIXCB(skb).fp)
1828                 unix_detach_fds(&scm, skb);
1829
1830         /* Alas, it calls VFS */
1831         /* So fscking what? fput() has been SMP-safe since last summer */
1832         scm_destroy(&scm);
1833         sock_wfree(skb);
1834 }
1835
1836 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1837 {
1838         int err = 0;
1839
1840         UNIXCB(skb).pid  = get_pid(scm->pid);
1841         UNIXCB(skb).uid = scm->creds.uid;
1842         UNIXCB(skb).gid = scm->creds.gid;
1843         UNIXCB(skb).fp = NULL;
1844         unix_get_secdata(scm, skb);
1845         if (scm->fp && send_fds)
1846                 err = unix_attach_fds(scm, skb);
1847
1848         skb->destructor = unix_destruct_scm;
1849         return err;
1850 }
1851
1852 static bool unix_passcred_enabled(const struct socket *sock,
1853                                   const struct sock *other)
1854 {
1855         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1856                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1857                !other->sk_socket ||
1858                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1859                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1860 }
1861
1862 /*
1863  * Some apps rely on write() giving SCM_CREDENTIALS.
1864  * We include credentials if the source or destination socket
1865  * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1866  */
1867 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1868                             const struct sock *other)
1869 {
1870         if (UNIXCB(skb).pid)
1871                 return;
1872         if (unix_passcred_enabled(sock, other)) {
1873                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1874                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1875         }
1876 }
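/* For illustration (userspace sketch, not part of this file): how a receiver
 * opts in to the credentials that maybe_add_creds() attaches. With SO_PASSCRED
 * enabled, recvmsg() delivers an SCM_CREDENTIALS control message carrying the
 * sender's pid/uid/gid. Includes and error handling omitted.
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *	// ... recvmsg() with a control buffer, then walk CMSG_FIRSTHDR()/
 *	// CMSG_NXTHDR() looking for cmsg_level == SOL_SOCKET and
 *	// cmsg_type == SCM_CREDENTIALS, whose payload is a struct ucred.
 */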
1877
1878 static bool unix_skb_scm_eq(struct sk_buff *skb,
1879                             struct scm_cookie *scm)
1880 {
1881         return UNIXCB(skb).pid == scm->pid &&
1882                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1883                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1884                unix_secdata_eq(scm, skb);
1885 }
1886
1887 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1888 {
1889         struct scm_fp_list *fp = UNIXCB(skb).fp;
1890         struct unix_sock *u = unix_sk(sk);
1891
1892         if (unlikely(fp && fp->count)) {
1893                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1894                 unix_add_edges(fp, u);
1895         }
1896 }
1897
1898 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1899 {
1900         struct scm_fp_list *fp = UNIXCB(skb).fp;
1901         struct unix_sock *u = unix_sk(sk);
1902
1903         if (unlikely(fp && fp->count)) {
1904                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1905                 unix_del_edges(fp);
1906         }
1907 }
1908
1909 /*
1910  *      Send AF_UNIX data.
1911  */
1912
1913 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1914                               size_t len)
1915 {
1916         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1917         struct sock *sk = sock->sk, *other = NULL;
1918         struct unix_sock *u = unix_sk(sk);
1919         struct scm_cookie scm;
1920         struct sk_buff *skb;
1921         int data_len = 0;
1922         int sk_locked;
1923         long timeo;
1924         int err;
1925
1926         err = scm_send(sock, msg, &scm, false);
1927         if (err < 0)
1928                 return err;
1929
1930         wait_for_unix_gc(scm.fp);
1931
1932         err = -EOPNOTSUPP;
1933         if (msg->msg_flags&MSG_OOB)
1934                 goto out;
1935
1936         if (msg->msg_namelen) {
1937                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1938                 if (err)
1939                         goto out;
1940
1941                 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1942                                                             msg->msg_name,
1943                                                             &msg->msg_namelen,
1944                                                             NULL);
1945                 if (err)
1946                         goto out;
1947         } else {
1948                 sunaddr = NULL;
1949                 err = -ENOTCONN;
1950                 other = unix_peer_get(sk);
1951                 if (!other)
1952                         goto out;
1953         }
1954
1955         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1956              test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1957             !READ_ONCE(u->addr)) {
1958                 err = unix_autobind(sk);
1959                 if (err)
1960                         goto out;
1961         }
1962
1963         err = -EMSGSIZE;
1964         if (len > READ_ONCE(sk->sk_sndbuf) - 32)
1965                 goto out;
1966
1967         if (len > SKB_MAX_ALLOC) {
1968                 data_len = min_t(size_t,
1969                                  len - SKB_MAX_ALLOC,
1970                                  MAX_SKB_FRAGS * PAGE_SIZE);
1971                 data_len = PAGE_ALIGN(data_len);
1972
1973                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1974         }
1975
1976         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1977                                    msg->msg_flags & MSG_DONTWAIT, &err,
1978                                    PAGE_ALLOC_COSTLY_ORDER);
1979         if (skb == NULL)
1980                 goto out;
1981
1982         err = unix_scm_to_skb(&scm, skb, true);
1983         if (err < 0)
1984                 goto out_free;
1985
1986         skb_put(skb, len - data_len);
1987         skb->data_len = data_len;
1988         skb->len = len;
1989         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1990         if (err)
1991                 goto out_free;
1992
1993         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1994
1995 restart:
1996         if (!other) {
1997                 err = -ECONNRESET;
1998                 if (sunaddr == NULL)
1999                         goto out_free;
2000
2001                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2002                                         sk->sk_type);
2003                 if (IS_ERR(other)) {
2004                         err = PTR_ERR(other);
2005                         other = NULL;
2006                         goto out_free;
2007                 }
2008         }
2009
2010         if (sk_filter(other, skb) < 0) {
2011                 /* Toss the packet but do not return any error to the sender */
2012                 err = len;
2013                 goto out_free;
2014         }
2015
2016         sk_locked = 0;
2017         unix_state_lock(other);
2018 restart_locked:
2019         err = -EPERM;
2020         if (!unix_may_send(sk, other))
2021                 goto out_unlock;
2022
2023         if (unlikely(sock_flag(other, SOCK_DEAD))) {
2024                 /*
2025                  *      Check with POSIX 1003.1g - what should a
2026                  *      datagram send to a dead peer return?
2027                  */
2028                 unix_state_unlock(other);
2029                 sock_put(other);
2030
2031                 if (!sk_locked)
2032                         unix_state_lock(sk);
2033
2034                 err = 0;
2035                 if (sk->sk_type == SOCK_SEQPACKET) {
2036                         /* We are here only when racing with unix_release_sock(),
2037                          * which is clearing @other. Unlike SOCK_DGRAM, never
2038                          * change the state to TCP_CLOSE.
2039                          */
2040                         unix_state_unlock(sk);
2041                         err = -EPIPE;
2042                 } else if (unix_peer(sk) == other) {
2043                         unix_peer(sk) = NULL;
2044                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2045
2046                         WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2047                         unix_state_unlock(sk);
2048
2049                         unix_dgram_disconnected(sk, other);
2050                         sock_put(other);
2051                         err = -ECONNREFUSED;
2052                 } else {
2053                         unix_state_unlock(sk);
2054                 }
2055
2056                 other = NULL;
2057                 if (err)
2058                         goto out_free;
2059                 goto restart;
2060         }
2061
2062         err = -EPIPE;
2063         if (other->sk_shutdown & RCV_SHUTDOWN)
2064                 goto out_unlock;
2065
2066         if (sk->sk_type != SOCK_SEQPACKET) {
2067                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2068                 if (err)
2069                         goto out_unlock;
2070         }
2071
2072         /* other == sk && unix_peer(other) != sk if
2073          * - unix_peer(sk) == NULL, destination address bound to sk
2074          * - unix_peer(sk) == sk by time of get but disconnected before lock
2075          */
2076         if (other != sk &&
2077             unlikely(unix_peer(other) != sk &&
2078             unix_recvq_full_lockless(other))) {
2079                 if (timeo) {
2080                         timeo = unix_wait_for_peer(other, timeo);
2081
2082                         err = sock_intr_errno(timeo);
2083                         if (signal_pending(current))
2084                                 goto out_free;
2085
2086                         goto restart;
2087                 }
2088
2089                 if (!sk_locked) {
2090                         unix_state_unlock(other);
2091                         unix_state_double_lock(sk, other);
2092                 }
2093
2094                 if (unix_peer(sk) != other ||
2095                     unix_dgram_peer_wake_me(sk, other)) {
2096                         err = -EAGAIN;
2097                         sk_locked = 1;
2098                         goto out_unlock;
2099                 }
2100
2101                 if (!sk_locked) {
2102                         sk_locked = 1;
2103                         goto restart_locked;
2104                 }
2105         }
2106
2107         if (unlikely(sk_locked))
2108                 unix_state_unlock(sk);
2109
2110         if (sock_flag(other, SOCK_RCVTSTAMP))
2111                 __net_timestamp(skb);
2112         maybe_add_creds(skb, sock, other);
2113         scm_stat_add(other, skb);
2114         skb_queue_tail(&other->sk_receive_queue, skb);
2115         unix_state_unlock(other);
2116         other->sk_data_ready(other);
2117         sock_put(other);
2118         scm_destroy(&scm);
2119         return len;
2120
2121 out_unlock:
2122         if (sk_locked)
2123                 unix_state_unlock(sk);
2124         unix_state_unlock(other);
2125 out_free:
2126         kfree_skb(skb);
2127 out:
2128         if (other)
2129                 sock_put(other);
2130         scm_destroy(&scm);
2131         return err;
2132 }
2133
2134 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2135  * bytes, with a minimum of a full page.
2136  */
2137 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
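/* For illustration: assuming a 4 KiB PAGE_SIZE (not guaranteed on every
 * architecture), get_order(32768) is 3, so UNIX_SKB_FRAGS_SZ evaluates to
 * 4096 << 3 = 32768 bytes. On systems whose pages are larger than 32 KiB,
 * get_order(32768) is 0 and the limit becomes a single page, matching the
 * "minimum of a full page" note above.
 */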
2138
2139 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2140 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2141                      struct scm_cookie *scm, bool fds_sent)
2142 {
2143         struct unix_sock *ousk = unix_sk(other);
2144         struct sk_buff *skb;
2145         int err = 0;
2146
2147         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2148
2149         if (!skb)
2150                 return err;
2151
2152         err = unix_scm_to_skb(scm, skb, !fds_sent);
2153         if (err < 0) {
2154                 kfree_skb(skb);
2155                 return err;
2156         }
2157         skb_put(skb, 1);
2158         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2159
2160         if (err) {
2161                 kfree_skb(skb);
2162                 return err;
2163         }
2164
2165         unix_state_lock(other);
2166
2167         if (sock_flag(other, SOCK_DEAD) ||
2168             (other->sk_shutdown & RCV_SHUTDOWN)) {
2169                 unix_state_unlock(other);
2170                 kfree_skb(skb);
2171                 return -EPIPE;
2172         }
2173
2174         maybe_add_creds(skb, sock, other);
2175         skb_get(skb);
2176
2177         scm_stat_add(other, skb);
2178
2179         spin_lock(&other->sk_receive_queue.lock);
2180         if (ousk->oob_skb)
2181                 consume_skb(ousk->oob_skb);
2182         WRITE_ONCE(ousk->oob_skb, skb);
2183         __skb_queue_tail(&other->sk_receive_queue, skb);
2184         spin_unlock(&other->sk_receive_queue.lock);
2185
2186         sk_send_sigurg(other);
2187         unix_state_unlock(other);
2188         other->sk_data_ready(other);
2189
2190         return err;
2191 }
2192 #endif
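/* For illustration (userspace sketch, not part of this file): the out-of-band
 * byte that queue_oob() stores in ousk->oob_skb, as seen from user space.
 * Includes and error handling omitted.
 *
 *	send(fd, "x", 1, MSG_OOB);	// sender marks one byte urgent
 *
 *	// receiver: SIGURG can be delivered to the configured owner (F_SETOWN);
 *	// the urgent byte is read with
 *	recv(peer, &c, 1, MSG_OOB);	// -EINVAL if SO_OOBINLINE is set
 *
 * Only the most recent OOB byte is tracked as urgent; a newer one takes over
 * the oob_skb slot, mirroring TCP's single-urgent-byte semantics.
 */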
2193
2194 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2195                                size_t len)
2196 {
2197         struct sock *sk = sock->sk;
2198         struct sock *other = NULL;
2199         int err, size;
2200         struct sk_buff *skb;
2201         int sent = 0;
2202         struct scm_cookie scm;
2203         bool fds_sent = false;
2204         int data_len;
2205
2206         err = scm_send(sock, msg, &scm, false);
2207         if (err < 0)
2208                 return err;
2209
2210         wait_for_unix_gc(scm.fp);
2211
2212         err = -EOPNOTSUPP;
2213         if (msg->msg_flags & MSG_OOB) {
2214 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2215                 if (len)
2216                         len--;
2217                 else
2218 #endif
2219                         goto out_err;
2220         }
2221
2222         if (msg->msg_namelen) {
2223                 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2224                 goto out_err;
2225         } else {
2226                 err = -ENOTCONN;
2227                 other = unix_peer(sk);
2228                 if (!other)
2229                         goto out_err;
2230         }
2231
2232         if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2233                 goto pipe_err;
2234
2235         while (sent < len) {
2236                 size = len - sent;
2237
2238                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2239                         skb = sock_alloc_send_pskb(sk, 0, 0,
2240                                                    msg->msg_flags & MSG_DONTWAIT,
2241                                                    &err, 0);
2242                 } else {
2243                         /* Keep two messages in the pipe so it schedules better */
2244                         size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2245
2246                         /* allow fallback to order-0 allocations */
2247                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2248
2249                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2250
2251                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2252
2253                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2254                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2255                                                    get_order(UNIX_SKB_FRAGS_SZ));
2256                 }
2257                 if (!skb)
2258                         goto out_err;
2259
2260                 /* Only send the fds in the first buffer */
2261                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2262                 if (err < 0) {
2263                         kfree_skb(skb);
2264                         goto out_err;
2265                 }
2266                 fds_sent = true;
2267
2268                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2269                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2270                                                    sk->sk_allocation);
2271                         if (err < 0) {
2272                                 kfree_skb(skb);
2273                                 goto out_err;
2274                         }
2275                         size = err;
2276                         refcount_add(size, &sk->sk_wmem_alloc);
2277                 } else {
2278                         skb_put(skb, size - data_len);
2279                         skb->data_len = data_len;
2280                         skb->len = size;
2281                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2282                         if (err) {
2283                                 kfree_skb(skb);
2284                                 goto out_err;
2285                         }
2286                 }
2287
2288                 unix_state_lock(other);
2289
2290                 if (sock_flag(other, SOCK_DEAD) ||
2291                     (other->sk_shutdown & RCV_SHUTDOWN))
2292                         goto pipe_err_free;
2293
2294                 maybe_add_creds(skb, sock, other);
2295                 scm_stat_add(other, skb);
2296                 skb_queue_tail(&other->sk_receive_queue, skb);
2297                 unix_state_unlock(other);
2298                 other->sk_data_ready(other);
2299                 sent += size;
2300         }
2301
2302 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2303         if (msg->msg_flags & MSG_OOB) {
2304                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2305                 if (err)
2306                         goto out_err;
2307                 sent++;
2308         }
2309 #endif
2310
2311         scm_destroy(&scm);
2312
2313         return sent;
2314
2315 pipe_err_free:
2316         unix_state_unlock(other);
2317         kfree_skb(skb);
2318 pipe_err:
2319         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2320                 send_sig(SIGPIPE, current, 0);
2321         err = -EPIPE;
2322 out_err:
2323         scm_destroy(&scm);
2324         return sent ? : err;
2325 }
2326
2327 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2328                                   size_t len)
2329 {
2330         int err;
2331         struct sock *sk = sock->sk;
2332
2333         err = sock_error(sk);
2334         if (err)
2335                 return err;
2336
2337         if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2338                 return -ENOTCONN;
2339
2340         if (msg->msg_namelen)
2341                 msg->msg_namelen = 0;
2342
2343         return unix_dgram_sendmsg(sock, msg, len);
2344 }
2345
2346 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2347                                   size_t size, int flags)
2348 {
2349         struct sock *sk = sock->sk;
2350
2351         if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2352                 return -ENOTCONN;
2353
2354         return unix_dgram_recvmsg(sock, msg, size, flags);
2355 }
2356
2357 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2358 {
2359         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2360
2361         if (addr) {
2362                 msg->msg_namelen = addr->len;
2363                 memcpy(msg->msg_name, addr->name, addr->len);
2364         }
2365 }
2366
2367 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2368                          int flags)
2369 {
2370         struct scm_cookie scm;
2371         struct socket *sock = sk->sk_socket;
2372         struct unix_sock *u = unix_sk(sk);
2373         struct sk_buff *skb, *last;
2374         long timeo;
2375         int skip;
2376         int err;
2377
2378         err = -EOPNOTSUPP;
2379         if (flags&MSG_OOB)
2380                 goto out;
2381
2382         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2383
2384         do {
2385                 mutex_lock(&u->iolock);
2386
2387                 skip = sk_peek_offset(sk, flags);
2388                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2389                                               &skip, &err, &last);
2390                 if (skb) {
2391                         if (!(flags & MSG_PEEK))
2392                                 scm_stat_del(sk, skb);
2393                         break;
2394                 }
2395
2396                 mutex_unlock(&u->iolock);
2397
2398                 if (err != -EAGAIN)
2399                         break;
2400         } while (timeo &&
2401                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2402                                               &err, &timeo, last));
2403
2404         if (!skb) { /* implies iolock unlocked */
2405                 unix_state_lock(sk);
2406                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2407                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2408                     (sk->sk_shutdown & RCV_SHUTDOWN))
2409                         err = 0;
2410                 unix_state_unlock(sk);
2411                 goto out;
2412         }
2413
2414         if (wq_has_sleeper(&u->peer_wait))
2415                 wake_up_interruptible_sync_poll(&u->peer_wait,
2416                                                 EPOLLOUT | EPOLLWRNORM |
2417                                                 EPOLLWRBAND);
2418
2419         if (msg->msg_name) {
2420                 unix_copy_addr(msg, skb->sk);
2421
2422                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2423                                                       msg->msg_name,
2424                                                       &msg->msg_namelen);
2425         }
2426
2427         if (size > skb->len - skip)
2428                 size = skb->len - skip;
2429         else if (size < skb->len - skip)
2430                 msg->msg_flags |= MSG_TRUNC;
2431
2432         err = skb_copy_datagram_msg(skb, skip, msg, size);
2433         if (err)
2434                 goto out_free;
2435
2436         if (sock_flag(sk, SOCK_RCVTSTAMP))
2437                 __sock_recv_timestamp(msg, sk, skb);
2438
2439         memset(&scm, 0, sizeof(scm));
2440
2441         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2442         unix_set_secdata(&scm, skb);
2443
2444         if (!(flags & MSG_PEEK)) {
2445                 if (UNIXCB(skb).fp)
2446                         unix_detach_fds(&scm, skb);
2447
2448                 sk_peek_offset_bwd(sk, skb->len);
2449         } else {
2450                 /* It is questionable: on PEEK we could:
2451                    - not return fds - good, but too simple 8)
2452                    - return fds, but not return them again on read (old strategy,
2453                      apparently wrong)
2454                    - clone fds (I chose this for now; it is the most universal
2455                      solution)
2456
2457                    POSIX 1003.1g does not actually define this clearly
2458                    at all. POSIX 1003.1g doesn't define a lot of things
2459                    clearly however!
2460
2461                 */
2462
2463                 sk_peek_offset_fwd(sk, size);
2464
2465                 if (UNIXCB(skb).fp)
2466                         unix_peek_fds(&scm, skb);
2467         }
2468         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2469
2470         scm_recv_unix(sock, msg, &scm, flags);
2471
2472 out_free:
2473         skb_free_datagram(sk, skb);
2474         mutex_unlock(&u->iolock);
2475 out:
2476         return err;
2477 }
2478
2479 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2480                               int flags)
2481 {
2482         struct sock *sk = sock->sk;
2483
2484 #ifdef CONFIG_BPF_SYSCALL
2485         const struct proto *prot = READ_ONCE(sk->sk_prot);
2486
2487         if (prot != &unix_dgram_proto)
2488                 return prot->recvmsg(sk, msg, size, flags, NULL);
2489 #endif
2490         return __unix_dgram_recvmsg(sk, msg, size, flags);
2491 }
2492
2493 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2494 {
2495         struct unix_sock *u = unix_sk(sk);
2496         struct sk_buff *skb;
2497         int err;
2498
2499         mutex_lock(&u->iolock);
2500         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2501         mutex_unlock(&u->iolock);
2502         if (!skb)
2503                 return err;
2504
2505         return recv_actor(sk, skb);
2506 }
2507
2508 /*
2509  *      Sleep until more data has arrived. But check for races..
2510  */
2511 static long unix_stream_data_wait(struct sock *sk, long timeo,
2512                                   struct sk_buff *last, unsigned int last_len,
2513                                   bool freezable)
2514 {
2515         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2516         struct sk_buff *tail;
2517         DEFINE_WAIT(wait);
2518
2519         unix_state_lock(sk);
2520
2521         for (;;) {
2522                 prepare_to_wait(sk_sleep(sk), &wait, state);
2523
2524                 tail = skb_peek_tail(&sk->sk_receive_queue);
2525                 if (tail != last ||
2526                     (tail && tail->len != last_len) ||
2527                     sk->sk_err ||
2528                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2529                     signal_pending(current) ||
2530                     !timeo)
2531                         break;
2532
2533                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2534                 unix_state_unlock(sk);
2535                 timeo = schedule_timeout(timeo);
2536                 unix_state_lock(sk);
2537
2538                 if (sock_flag(sk, SOCK_DEAD))
2539                         break;
2540
2541                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2542         }
2543
2544         finish_wait(sk_sleep(sk), &wait);
2545         unix_state_unlock(sk);
2546         return timeo;
2547 }
2548
2549 static unsigned int unix_skb_len(const struct sk_buff *skb)
2550 {
2551         return skb->len - UNIXCB(skb).consumed;
2552 }
2553
2554 struct unix_stream_read_state {
2555         int (*recv_actor)(struct sk_buff *, int, int,
2556                           struct unix_stream_read_state *);
2557         struct socket *socket;
2558         struct msghdr *msg;
2559         struct pipe_inode_info *pipe;
2560         size_t size;
2561         int flags;
2562         unsigned int splice_flags;
2563 };
2564
2565 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2566 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2567 {
2568         struct socket *sock = state->socket;
2569         struct sock *sk = sock->sk;
2570         struct unix_sock *u = unix_sk(sk);
2571         int chunk = 1;
2572         struct sk_buff *oob_skb;
2573
2574         mutex_lock(&u->iolock);
2575         unix_state_lock(sk);
2576         spin_lock(&sk->sk_receive_queue.lock);
2577
2578         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2579                 spin_unlock(&sk->sk_receive_queue.lock);
2580                 unix_state_unlock(sk);
2581                 mutex_unlock(&u->iolock);
2582                 return -EINVAL;
2583         }
2584
2585         oob_skb = u->oob_skb;
2586
2587         if (!(state->flags & MSG_PEEK))
2588                 WRITE_ONCE(u->oob_skb, NULL);
2589         else
2590                 skb_get(oob_skb);
2591
2592         spin_unlock(&sk->sk_receive_queue.lock);
2593         unix_state_unlock(sk);
2594
2595         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2596
2597         if (!(state->flags & MSG_PEEK))
2598                 UNIXCB(oob_skb).consumed += 1;
2599
2600         consume_skb(oob_skb);
2601
2602         mutex_unlock(&u->iolock);
2603
2604         if (chunk < 0)
2605                 return -EFAULT;
2606
2607         state->msg->msg_flags |= MSG_OOB;
2608         return 1;
2609 }
2610
2611 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2612                                   int flags, int copied)
2613 {
2614         struct unix_sock *u = unix_sk(sk);
2615
2616         if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2617                 skb_unlink(skb, &sk->sk_receive_queue);
2618                 consume_skb(skb);
2619                 skb = NULL;
2620         } else {
2621                 struct sk_buff *unlinked_skb = NULL;
2622
2623                 spin_lock(&sk->sk_receive_queue.lock);
2624
2625                 if (skb == u->oob_skb) {
2626                         if (copied) {
2627                                 skb = NULL;
2628                         } else if (sock_flag(sk, SOCK_URGINLINE)) {
2629                                 if (!(flags & MSG_PEEK)) {
2630                                         WRITE_ONCE(u->oob_skb, NULL);
2631                                         consume_skb(skb);
2632                                 }
2633                         } else if (flags & MSG_PEEK) {
2634                                 skb = NULL;
2635                         } else {
2636                                 __skb_unlink(skb, &sk->sk_receive_queue);
2637                                 WRITE_ONCE(u->oob_skb, NULL);
2638                                 unlinked_skb = skb;
2639                                 skb = skb_peek(&sk->sk_receive_queue);
2640                         }
2641                 }
2642
2643                 spin_unlock(&sk->sk_receive_queue.lock);
2644
2645                 if (unlinked_skb) {
2646                         WARN_ON_ONCE(skb_unref(unlinked_skb));
2647                         kfree_skb(unlinked_skb);
2648                 }
2649         }
2650         return skb;
2651 }
2652 #endif
2653
2654 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2655 {
2656         if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2657                 return -ENOTCONN;
2658
2659         return unix_read_skb(sk, recv_actor);
2660 }
2661
2662 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2663                                     bool freezable)
2664 {
2665         struct scm_cookie scm;
2666         struct socket *sock = state->socket;
2667         struct sock *sk = sock->sk;
2668         struct unix_sock *u = unix_sk(sk);
2669         int copied = 0;
2670         int flags = state->flags;
2671         int noblock = flags & MSG_DONTWAIT;
2672         bool check_creds = false;
2673         int target;
2674         int err = 0;
2675         long timeo;
2676         int skip;
2677         size_t size = state->size;
2678         unsigned int last_len;
2679
2680         if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2681                 err = -EINVAL;
2682                 goto out;
2683         }
2684
2685         if (unlikely(flags & MSG_OOB)) {
2686                 err = -EOPNOTSUPP;
2687 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2688                 err = unix_stream_recv_urg(state);
2689 #endif
2690                 goto out;
2691         }
2692
2693         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2694         timeo = sock_rcvtimeo(sk, noblock);
2695
2696         memset(&scm, 0, sizeof(scm));
2697
2698         /* Lock the socket to prevent queue disordering
2699          * while we sleep in memcpy_to_msg().
2700          */
2701         mutex_lock(&u->iolock);
2702
2703         skip = max(sk_peek_offset(sk, flags), 0);
2704
2705         do {
2706                 int chunk;
2707                 bool drop_skb;
2708                 struct sk_buff *skb, *last;
2709
2710 redo:
2711                 unix_state_lock(sk);
2712                 if (sock_flag(sk, SOCK_DEAD)) {
2713                         err = -ECONNRESET;
2714                         goto unlock;
2715                 }
2716                 last = skb = skb_peek(&sk->sk_receive_queue);
2717                 last_len = last ? last->len : 0;
2718
2719 again:
2720 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2721                 if (skb) {
2722                         skb = manage_oob(skb, sk, flags, copied);
2723                         if (!skb && copied) {
2724                                 unix_state_unlock(sk);
2725                                 break;
2726                         }
2727                 }
2728 #endif
2729                 if (skb == NULL) {
2730                         if (copied >= target)
2731                                 goto unlock;
2732
2733                         /*
2734                          *      POSIX 1003.1g mandates this order.
2735                          */
2736
2737                         err = sock_error(sk);
2738                         if (err)
2739                                 goto unlock;
2740                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2741                                 goto unlock;
2742
2743                         unix_state_unlock(sk);
2744                         if (!timeo) {
2745                                 err = -EAGAIN;
2746                                 break;
2747                         }
2748
2749                         mutex_unlock(&u->iolock);
2750
2751                         timeo = unix_stream_data_wait(sk, timeo, last,
2752                                                       last_len, freezable);
2753
2754                         if (signal_pending(current)) {
2755                                 err = sock_intr_errno(timeo);
2756                                 scm_destroy(&scm);
2757                                 goto out;
2758                         }
2759
2760                         mutex_lock(&u->iolock);
2761                         goto redo;
2762 unlock:
2763                         unix_state_unlock(sk);
2764                         break;
2765                 }
2766
2767                 while (skip >= unix_skb_len(skb)) {
2768                         skip -= unix_skb_len(skb);
2769                         last = skb;
2770                         last_len = skb->len;
2771                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2772                         if (!skb)
2773                                 goto again;
2774                 }
2775
2776                 unix_state_unlock(sk);
2777
2778                 if (check_creds) {
2779                         /* Never glue messages from different writers */
2780                         if (!unix_skb_scm_eq(skb, &scm))
2781                                 break;
2782                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2783                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2784                         /* Copy credentials */
2785                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2786                         unix_set_secdata(&scm, skb);
2787                         check_creds = true;
2788                 }
2789
2790                 /* Copy address just once */
2791                 if (state->msg && state->msg->msg_name) {
2792                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2793                                          state->msg->msg_name);
2794                         unix_copy_addr(state->msg, skb->sk);
2795
2796                         BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2797                                                               state->msg->msg_name,
2798                                                               &state->msg->msg_namelen);
2799
2800                         sunaddr = NULL;
2801                 }
2802
2803                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2804                 skb_get(skb);
2805                 chunk = state->recv_actor(skb, skip, chunk, state);
2806                 drop_skb = !unix_skb_len(skb);
2807                 /* skb is only safe to use if !drop_skb */
2808                 consume_skb(skb);
2809                 if (chunk < 0) {
2810                         if (copied == 0)
2811                                 copied = -EFAULT;
2812                         break;
2813                 }
2814                 copied += chunk;
2815                 size -= chunk;
2816
2817                 if (drop_skb) {
2818                         /* the skb was touched by a concurrent reader;
2819                          * we should not expect anything from this skb
2820                          * anymore and assume it invalid - we can be
2821                          * sure it was dropped from the socket queue
2822                          *
2823                          * let's report a short read
2824                          */
2825                         err = 0;
2826                         break;
2827                 }
2828
2829                 /* Mark read part of skb as used */
2830                 if (!(flags & MSG_PEEK)) {
2831                         UNIXCB(skb).consumed += chunk;
2832
2833                         sk_peek_offset_bwd(sk, chunk);
2834
2835                         if (UNIXCB(skb).fp) {
2836                                 scm_stat_del(sk, skb);
2837                                 unix_detach_fds(&scm, skb);
2838                         }
2839
2840                         if (unix_skb_len(skb))
2841                                 break;
2842
2843                         skb_unlink(skb, &sk->sk_receive_queue);
2844                         consume_skb(skb);
2845
2846                         if (scm.fp)
2847                                 break;
2848                 } else {
2849                         /* It is questionable, see note in unix_dgram_recvmsg.
2850                          */
2851                         if (UNIXCB(skb).fp)
2852                                 unix_peek_fds(&scm, skb);
2853
2854                         sk_peek_offset_fwd(sk, chunk);
2855
2856                         if (UNIXCB(skb).fp)
2857                                 break;
2858
2859                         skip = 0;
2860                         last = skb;
2861                         last_len = skb->len;
2862                         unix_state_lock(sk);
2863                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2864                         if (skb)
2865                                 goto again;
2866                         unix_state_unlock(sk);
2867                         break;
2868                 }
2869         } while (size);
2870
2871         mutex_unlock(&u->iolock);
2872         if (state->msg)
2873                 scm_recv_unix(sock, state->msg, &scm, flags);
2874         else
2875                 scm_destroy(&scm);
2876 out:
2877         return copied ? : err;
2878 }
2879
2880 static int unix_stream_read_actor(struct sk_buff *skb,
2881                                   int skip, int chunk,
2882                                   struct unix_stream_read_state *state)
2883 {
2884         int ret;
2885
2886         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2887                                     state->msg, chunk);
2888         return ret ?: chunk;
2889 }
2890
2891 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2892                           size_t size, int flags)
2893 {
2894         struct unix_stream_read_state state = {
2895                 .recv_actor = unix_stream_read_actor,
2896                 .socket = sk->sk_socket,
2897                 .msg = msg,
2898                 .size = size,
2899                 .flags = flags
2900         };
2901
2902         return unix_stream_read_generic(&state, true);
2903 }
2904
2905 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2906                                size_t size, int flags)
2907 {
2908         struct unix_stream_read_state state = {
2909                 .recv_actor = unix_stream_read_actor,
2910                 .socket = sock,
2911                 .msg = msg,
2912                 .size = size,
2913                 .flags = flags
2914         };
2915
2916 #ifdef CONFIG_BPF_SYSCALL
2917         struct sock *sk = sock->sk;
2918         const struct proto *prot = READ_ONCE(sk->sk_prot);
2919
2920         if (prot != &unix_stream_proto)
2921                 return prot->recvmsg(sk, msg, size, flags, NULL);
2922 #endif
2923         return unix_stream_read_generic(&state, true);
2924 }
2925
2926 static int unix_stream_splice_actor(struct sk_buff *skb,
2927                                     int skip, int chunk,
2928                                     struct unix_stream_read_state *state)
2929 {
2930         return skb_splice_bits(skb, state->socket->sk,
2931                                UNIXCB(skb).consumed + skip,
2932                                state->pipe, chunk, state->splice_flags);
2933 }
2934
2935 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2936                                        struct pipe_inode_info *pipe,
2937                                        size_t size, unsigned int flags)
2938 {
2939         struct unix_stream_read_state state = {
2940                 .recv_actor = unix_stream_splice_actor,
2941                 .socket = sock,
2942                 .pipe = pipe,
2943                 .size = size,
2944                 .splice_flags = flags,
2945         };
2946
2947         if (unlikely(*ppos))
2948                 return -ESPIPE;
2949
2950         if (sock->file->f_flags & O_NONBLOCK ||
2951             flags & SPLICE_F_NONBLOCK)
2952                 state.flags = MSG_DONTWAIT;
2953
2954         return unix_stream_read_generic(&state, false);
2955 }
2956
2957 static int unix_shutdown(struct socket *sock, int mode)
2958 {
2959         struct sock *sk = sock->sk;
2960         struct sock *other;
2961
2962         if (mode < SHUT_RD || mode > SHUT_RDWR)
2963                 return -EINVAL;
2964         /* This maps:
2965          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2966          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2967          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2968          */
2969         ++mode;
2970
2971         unix_state_lock(sk);
2972         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2973         other = unix_peer(sk);
2974         if (other)
2975                 sock_hold(other);
2976         unix_state_unlock(sk);
2977         sk->sk_state_change(sk);
2978
2979         if (other &&
2980                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2981
2982                 int peer_mode = 0;
2983                 const struct proto *prot = READ_ONCE(other->sk_prot);
2984
2985                 if (prot->unhash)
2986                         prot->unhash(other);
2987                 if (mode&RCV_SHUTDOWN)
2988                         peer_mode |= SEND_SHUTDOWN;
2989                 if (mode&SEND_SHUTDOWN)
2990                         peer_mode |= RCV_SHUTDOWN;
2991                 unix_state_lock(other);
2992                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2993                 unix_state_unlock(other);
2994                 other->sk_state_change(other);
2995                 if (peer_mode == SHUTDOWN_MASK)
2996                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2997                 else if (peer_mode & RCV_SHUTDOWN)
2998                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2999         }
3000         if (other)
3001                 sock_put(other);
3002
3003         return 0;
3004 }
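/* For illustration (userspace sketch, not part of this file): the half-close
 * behaviour implemented above. SHUT_WR on one end of a connected SOCK_STREAM
 * pair propagates RCV_SHUTDOWN to the peer, so the peer's read() returns 0
 * (EOF) once queued data is drained, while data may still flow the other way.
 * Includes and error handling omitted; a and b are the two connected ends.
 *
 *	shutdown(a, SHUT_WR);		// no more writes from a
 *	// on the peer:
 *	n = read(b, buf, sizeof(buf));	// drains pending data, then returns 0
 *	write(b, "still fine", 10);	// the b -> a direction keeps working
 */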
3005
3006 long unix_inq_len(struct sock *sk)
3007 {
3008         struct sk_buff *skb;
3009         long amount = 0;
3010
3011         if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3012                 return -EINVAL;
3013
3014         spin_lock(&sk->sk_receive_queue.lock);
3015         if (sk->sk_type == SOCK_STREAM ||
3016             sk->sk_type == SOCK_SEQPACKET) {
3017                 skb_queue_walk(&sk->sk_receive_queue, skb)
3018                         amount += unix_skb_len(skb);
3019         } else {
3020                 skb = skb_peek(&sk->sk_receive_queue);
3021                 if (skb)
3022                         amount = skb->len;
3023         }
3024         spin_unlock(&sk->sk_receive_queue.lock);
3025
3026         return amount;
3027 }
3028 EXPORT_SYMBOL_GPL(unix_inq_len);
3029
3030 long unix_outq_len(struct sock *sk)
3031 {
3032         return sk_wmem_alloc_get(sk);
3033 }
3034 EXPORT_SYMBOL_GPL(unix_outq_len);
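/* For illustration (userspace sketch, not part of this file): the ioctls that
 * unix_inq_len()/unix_outq_len() back. SIOCINQ reports unread bytes queued on
 * the receive side (summed over skbs for SOCK_STREAM/SOCK_SEQPACKET);
 * SIOCOUTQ reports bytes sent but not yet consumed by the receiver, i.e.
 * sk_wmem_alloc. Includes and error handling omitted.
 *
 *	int pending;
 *
 *	ioctl(fd, SIOCINQ, &pending);	// a.k.a. FIONREAD
 *	ioctl(fd, SIOCOUTQ, &pending);	// a.k.a. TIOCOUTQ
 */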
3035
3036 static int unix_open_file(struct sock *sk)
3037 {
3038         struct path path;
3039         struct file *f;
3040         int fd;
3041
3042         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3043                 return -EPERM;
3044
3045         if (!smp_load_acquire(&unix_sk(sk)->addr))
3046                 return -ENOENT;
3047
3048         path = unix_sk(sk)->path;
3049         if (!path.dentry)
3050                 return -ENOENT;
3051
3052         path_get(&path);
3053
3054         fd = get_unused_fd_flags(O_CLOEXEC);
3055         if (fd < 0)
3056                 goto out;
3057
3058         f = dentry_open(&path, O_PATH, current_cred());
3059         if (IS_ERR(f)) {
3060                 put_unused_fd(fd);
3061                 fd = PTR_ERR(f);
3062                 goto out;
3063         }
3064
3065         fd_install(fd, f);
3066 out:
3067         path_put(&path);
3068
3069         return fd;
3070 }
3071
3072 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3073 {
3074         struct sock *sk = sock->sk;
3075         long amount = 0;
3076         int err;
3077
3078         switch (cmd) {
3079         case SIOCOUTQ:
3080                 amount = unix_outq_len(sk);
3081                 err = put_user(amount, (int __user *)arg);
3082                 break;
3083         case SIOCINQ:
3084                 amount = unix_inq_len(sk);
3085                 if (amount < 0)
3086                         err = amount;
3087                 else
3088                         err = put_user(amount, (int __user *)arg);
3089                 break;
3090         case SIOCUNIXFILE:
3091                 err = unix_open_file(sk);
3092                 break;
3093 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3094         case SIOCATMARK:
3095                 {
3096                         struct sk_buff *skb;
3097                         int answ = 0;
3098
3099                         skb = skb_peek(&sk->sk_receive_queue);
3100                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3101                                 answ = 1;
3102                         err = put_user(answ, (int __user *)arg);
3103                 }
3104                 break;
3105 #endif
3106         default:
3107                 err = -ENOIOCTLCMD;
3108                 break;
3109         }
3110         return err;
3111 }
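
/*
 * Illustrative userspace sketch, not part of this file (the function name is
 * made up, guarded out because it is userspace code): with
 * CONFIG_AF_UNIX_OOB, SIOCATMARK reports whether the skb at the head of the
 * receive queue is the pending out-of-band one, mirroring the oob_skb check
 * above.
 */
#if 0
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/sockios.h>      /* SIOCATMARK */

static void drain_oob(int fd)
{
        int at_mark = 0;
        char c;

        if (ioctl(fd, SIOCATMARK, &at_mark) == 0 && at_mark &&
            recv(fd, &c, 1, MSG_OOB) == 1)
                printf("OOB byte: 0x%02x\n", (unsigned char)c);
}
#endif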
3112
3113 #ifdef CONFIG_COMPAT
3114 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3115 {
3116         return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3117 }
3118 #endif
3119
3120 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3121 {
3122         struct sock *sk = sock->sk;
3123         unsigned char state;
3124         __poll_t mask;
3125         u8 shutdown;
3126
3127         sock_poll_wait(file, sock, wait);
3128         mask = 0;
3129         shutdown = READ_ONCE(sk->sk_shutdown);
3130         state = READ_ONCE(sk->sk_state);
3131
3132         /* exceptional events? */
3133         if (READ_ONCE(sk->sk_err))
3134                 mask |= EPOLLERR;
3135         if (shutdown == SHUTDOWN_MASK)
3136                 mask |= EPOLLHUP;
3137         if (shutdown & RCV_SHUTDOWN)
3138                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3139
3140         /* readable? */
3141         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3142                 mask |= EPOLLIN | EPOLLRDNORM;
3143         if (sk_is_readable(sk))
3144                 mask |= EPOLLIN | EPOLLRDNORM;
3145 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3146         if (READ_ONCE(unix_sk(sk)->oob_skb))
3147                 mask |= EPOLLPRI;
3148 #endif
3149
3150         /* Connection-based sockets need to check for termination and startup */
3151         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3152             state == TCP_CLOSE)
3153                 mask |= EPOLLHUP;
3154
3155         /*
3156          * We set the socket writable also when the other side has shut
3157          * down the connection. This prevents stuck sockets.
3158          */
3159         if (unix_writable(sk, state))
3160                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3161
3162         return mask;
3163 }
3164
3165 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3166                                     poll_table *wait)
3167 {
3168         struct sock *sk = sock->sk, *other;
3169         unsigned int writable;
3170         unsigned char state;
3171         __poll_t mask;
3172         u8 shutdown;
3173
3174         sock_poll_wait(file, sock, wait);
3175         mask = 0;
3176         shutdown = READ_ONCE(sk->sk_shutdown);
3177         state = READ_ONCE(sk->sk_state);
3178
3179         /* exceptional events? */
3180         if (READ_ONCE(sk->sk_err) ||
3181             !skb_queue_empty_lockless(&sk->sk_error_queue))
3182                 mask |= EPOLLERR |
3183                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3184
3185         if (shutdown & RCV_SHUTDOWN)
3186                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3187         if (shutdown == SHUTDOWN_MASK)
3188                 mask |= EPOLLHUP;
3189
3190         /* readable? */
3191         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3192                 mask |= EPOLLIN | EPOLLRDNORM;
3193         if (sk_is_readable(sk))
3194                 mask |= EPOLLIN | EPOLLRDNORM;
3195
3196         /* Connection-based sockets need to check for termination and startup */
3197         if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3198                 mask |= EPOLLHUP;
3199
3200         /* No write status requested, avoid expensive OUT tests. */
3201         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3202                 return mask;
3203
3204         writable = unix_writable(sk, state);
3205         if (writable) {
3206                 unix_state_lock(sk);
3207
3208                 other = unix_peer(sk);
3209                 if (other && unix_peer(other) != sk &&
3210                     unix_recvq_full_lockless(other) &&
3211                     unix_dgram_peer_wake_me(sk, other))
3212                         writable = 0;
3213
3214                 unix_state_unlock(sk);
3215         }
3216
3217         if (writable)
3218                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3219         else
3220                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3221
3222         return mask;
3223 }
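
/*
 * Illustrative userspace sketch, not part of this file (the function name is
 * made up, guarded out because it is userspace code): for a connected
 * SOCK_DGRAM socket, unix_dgram_poll() above withholds the writable bits
 * while the peer's receive queue is full, so a sender can poll for POLLOUT
 * instead of spinning on -EAGAIN.
 */
#if 0
#include <poll.h>

static int wait_writable(int fd, int timeout_ms)
{
        struct pollfd pfd = { .fd = fd, .events = POLLOUT };

        return poll(&pfd, 1, timeout_ms) > 0 && (pfd.revents & POLLOUT);
}
#endif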
3224
3225 #ifdef CONFIG_PROC_FS
3226
3227 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3228
3229 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3230 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3231 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
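
/*
 * The seq_file position is a packed cookie: the hash bucket index lives in
 * the high bits and a 1-based offset within that bucket in the low
 * BUCKET_SPACE bits, so iteration can resume mid-table across
 * stop()/start() cycles.
 */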
3232
3233 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3234 {
3235         unsigned long offset = get_offset(*pos);
3236         unsigned long bucket = get_bucket(*pos);
3237         unsigned long count = 0;
3238         struct sock *sk;
3239
3240         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3241              sk; sk = sk_next(sk)) {
3242                 if (++count == offset)
3243                         break;
3244         }
3245
3246         return sk;
3247 }
3248
3249 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3250 {
3251         unsigned long bucket = get_bucket(*pos);
3252         struct net *net = seq_file_net(seq);
3253         struct sock *sk;
3254
3255         while (bucket < UNIX_HASH_SIZE) {
3256                 spin_lock(&net->unx.table.locks[bucket]);
3257
3258                 sk = unix_from_bucket(seq, pos);
3259                 if (sk)
3260                         return sk;
3261
3262                 spin_unlock(&net->unx.table.locks[bucket]);
3263
3264                 *pos = set_bucket_offset(++bucket, 1);
3265         }
3266
3267         return NULL;
3268 }
3269
3270 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3271                                   loff_t *pos)
3272 {
3273         unsigned long bucket = get_bucket(*pos);
3274
3275         sk = sk_next(sk);
3276         if (sk)
3277                 return sk;
3278
3279
3280         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3281
3282         *pos = set_bucket_offset(++bucket, 1);
3283
3284         return unix_get_first(seq, pos);
3285 }
3286
3287 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3288 {
3289         if (!*pos)
3290                 return SEQ_START_TOKEN;
3291
3292         return unix_get_first(seq, pos);
3293 }
3294
3295 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3296 {
3297         ++*pos;
3298
3299         if (v == SEQ_START_TOKEN)
3300                 return unix_get_first(seq, pos);
3301
3302         return unix_get_next(seq, v, pos);
3303 }
3304
3305 static void unix_seq_stop(struct seq_file *seq, void *v)
3306 {
3307         struct sock *sk = v;
3308
3309         if (sk)
3310                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3311 }
3312
3313 static int unix_seq_show(struct seq_file *seq, void *v)
3314 {
3315
3316         if (v == SEQ_START_TOKEN)
3317                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3318                          "Inode Path\n");
3319         else {
3320                 struct sock *s = v;
3321                 struct unix_sock *u = unix_sk(s);
3322                 unix_state_lock(s);
3323
3324                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3325                         s,
3326                         refcount_read(&s->sk_refcnt),
3327                         0,
3328                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3329                         s->sk_type,
3330                         s->sk_socket ?
3331                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3332                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3333                         sock_i_ino(s));
3334
3335                 if (u->addr) {  /* under a hash table lock here */
3336                         int i, len;
3337                         seq_putc(seq, ' ');
3338
3339                         i = 0;
3340                         len = u->addr->len -
3341                                 offsetof(struct sockaddr_un, sun_path);
3342                         if (u->addr->name->sun_path[0]) {
3343                                 len--;
3344                         } else {
3345                                 seq_putc(seq, '@');
3346                                 i++;
3347                         }
3348                         for ( ; i < len; i++)
3349                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3350                                          '@');
3351                 }
3352                 unix_state_unlock(s);
3353                 seq_putc(seq, '\n');
3354         }
3355
3356         return 0;
3357 }
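
/*
 * The function above emits one /proc/net/unix line per socket: a hashed
 * kernel address (%pK), the refcount, a protocol field that is always 0,
 * __SO_ACCEPTCON for listeners, the socket type, an SS_* state, the inode
 * number and, for bound sockets, the path ('@' stands for abstract-namespace
 * names and for embedded NUL bytes).
 */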
3358
3359 static const struct seq_operations unix_seq_ops = {
3360         .start  = unix_seq_start,
3361         .next   = unix_seq_next,
3362         .stop   = unix_seq_stop,
3363         .show   = unix_seq_show,
3364 };
3365
3366 #ifdef CONFIG_BPF_SYSCALL
3367 struct bpf_unix_iter_state {
3368         struct seq_net_private p;
3369         unsigned int cur_sk;
3370         unsigned int end_sk;
3371         unsigned int max_sk;
3372         struct sock **batch;
3373         bool st_bucket_done;
3374 };
3375
3376 struct bpf_iter__unix {
3377         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3378         __bpf_md_ptr(struct unix_sock *, unix_sk);
3379         uid_t uid __aligned(8);
3380 };
3381
3382 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3383                               struct unix_sock *unix_sk, uid_t uid)
3384 {
3385         struct bpf_iter__unix ctx;
3386
3387         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3388         ctx.meta = meta;
3389         ctx.unix_sk = unix_sk;
3390         ctx.uid = uid;
3391         return bpf_iter_run_prog(prog, &ctx);
3392 }
3393
3394 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3395
3396 {
3397         struct bpf_unix_iter_state *iter = seq->private;
3398         unsigned int expected = 1;
3399         struct sock *sk;
3400
3401         sock_hold(start_sk);
3402         iter->batch[iter->end_sk++] = start_sk;
3403
3404         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3405                 if (iter->end_sk < iter->max_sk) {
3406                         sock_hold(sk);
3407                         iter->batch[iter->end_sk++] = sk;
3408                 }
3409
3410                 expected++;
3411         }
3412
3413         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3414
3415         return expected;
3416 }
3417
3418 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3419 {
3420         while (iter->cur_sk < iter->end_sk)
3421                 sock_put(iter->batch[iter->cur_sk++]);
3422 }
3423
3424 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3425                                        unsigned int new_batch_sz)
3426 {
3427         struct sock **new_batch;
3428
3429         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3430                              GFP_USER | __GFP_NOWARN);
3431         if (!new_batch)
3432                 return -ENOMEM;
3433
3434         bpf_iter_unix_put_batch(iter);
3435         kvfree(iter->batch);
3436         iter->batch = new_batch;
3437         iter->max_sk = new_batch_sz;
3438
3439         return 0;
3440 }
3441
3442 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3443                                         loff_t *pos)
3444 {
3445         struct bpf_unix_iter_state *iter = seq->private;
3446         unsigned int expected;
3447         bool resized = false;
3448         struct sock *sk;
3449
3450         if (iter->st_bucket_done)
3451                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3452
3453 again:
3454         /* Get a new batch */
3455         iter->cur_sk = 0;
3456         iter->end_sk = 0;
3457
3458         sk = unix_get_first(seq, pos);
3459         if (!sk)
3460                 return NULL; /* Done */
3461
3462         expected = bpf_iter_unix_hold_batch(seq, sk);
3463
3464         if (iter->end_sk == expected) {
3465                 iter->st_bucket_done = true;
3466                 return sk;
3467         }
3468
3469         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3470                 resized = true;
3471                 goto again;
3472         }
3473
3474         return sk;
3475 }
3476
3477 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3478 {
3479         if (!*pos)
3480                 return SEQ_START_TOKEN;
3481
3482         /* bpf iter does not support lseek, so it always
3483          * continues from where it was stop()-ped.
3484          */
3485         return bpf_iter_unix_batch(seq, pos);
3486 }
3487
3488 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3489 {
3490         struct bpf_unix_iter_state *iter = seq->private;
3491         struct sock *sk;
3492
3493         /* Whenever seq_next() is called, the iter->cur_sk is
3494          * done with seq_show(), so advance to the next sk in
3495          * the batch.
3496          */
3497         if (iter->cur_sk < iter->end_sk)
3498                 sock_put(iter->batch[iter->cur_sk++]);
3499
3500         ++*pos;
3501
3502         if (iter->cur_sk < iter->end_sk)
3503                 sk = iter->batch[iter->cur_sk];
3504         else
3505                 sk = bpf_iter_unix_batch(seq, pos);
3506
3507         return sk;
3508 }
3509
3510 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3511 {
3512         struct bpf_iter_meta meta;
3513         struct bpf_prog *prog;
3514         struct sock *sk = v;
3515         uid_t uid;
3516         bool slow;
3517         int ret;
3518
3519         if (v == SEQ_START_TOKEN)
3520                 return 0;
3521
3522         slow = lock_sock_fast(sk);
3523
3524         if (unlikely(sk_unhashed(sk))) {
3525                 ret = SEQ_SKIP;
3526                 goto unlock;
3527         }
3528
3529         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3530         meta.seq = seq;
3531         prog = bpf_iter_get_info(&meta, false);
3532         ret = unix_prog_seq_show(prog, &meta, v, uid);
3533 unlock:
3534         unlock_sock_fast(sk, slow);
3535         return ret;
3536 }
3537
3538 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3539 {
3540         struct bpf_unix_iter_state *iter = seq->private;
3541         struct bpf_iter_meta meta;
3542         struct bpf_prog *prog;
3543
3544         if (!v) {
3545                 meta.seq = seq;
3546                 prog = bpf_iter_get_info(&meta, true);
3547                 if (prog)
3548                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3549         }
3550
3551         if (iter->cur_sk < iter->end_sk)
3552                 bpf_iter_unix_put_batch(iter);
3553 }
3554
3555 static const struct seq_operations bpf_iter_unix_seq_ops = {
3556         .start  = bpf_iter_unix_seq_start,
3557         .next   = bpf_iter_unix_seq_next,
3558         .stop   = bpf_iter_unix_seq_stop,
3559         .show   = bpf_iter_unix_seq_show,
3560 };
3561 #endif
3562 #endif
3563
3564 static const struct net_proto_family unix_family_ops = {
3565         .family = PF_UNIX,
3566         .create = unix_create,
3567         .owner  = THIS_MODULE,
3568 };
3569
3570
3571 static int __net_init unix_net_init(struct net *net)
3572 {
3573         int i;
3574
3575         net->unx.sysctl_max_dgram_qlen = 10;
3576         if (unix_sysctl_register(net))
3577                 goto out;
3578
3579 #ifdef CONFIG_PROC_FS
3580         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3581                              sizeof(struct seq_net_private)))
3582                 goto err_sysctl;
3583 #endif
3584
3585         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3586                                               sizeof(spinlock_t), GFP_KERNEL);
3587         if (!net->unx.table.locks)
3588                 goto err_proc;
3589
3590         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3591                                                 sizeof(struct hlist_head),
3592                                                 GFP_KERNEL);
3593         if (!net->unx.table.buckets)
3594                 goto free_locks;
3595
3596         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3597                 spin_lock_init(&net->unx.table.locks[i]);
3598                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3599         }
3600
3601         return 0;
3602
3603 free_locks:
3604         kvfree(net->unx.table.locks);
3605 err_proc:
3606 #ifdef CONFIG_PROC_FS
3607         remove_proc_entry("unix", net->proc_net);
3608 err_sysctl:
3609 #endif
3610         unix_sysctl_unregister(net);
3611 out:
3612         return -ENOMEM;
3613 }
3614
3615 static void __net_exit unix_net_exit(struct net *net)
3616 {
3617         kvfree(net->unx.table.buckets);
3618         kvfree(net->unx.table.locks);
3619         unix_sysctl_unregister(net);
3620         remove_proc_entry("unix", net->proc_net);
3621 }
3622
3623 static struct pernet_operations unix_net_ops = {
3624         .init = unix_net_init,
3625         .exit = unix_net_exit,
3626 };
3627
3628 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3629 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3630                      struct unix_sock *unix_sk, uid_t uid)
3631
3632 #define INIT_BATCH_SZ 16
3633
3634 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3635 {
3636         struct bpf_unix_iter_state *iter = priv_data;
3637         int err;
3638
3639         err = bpf_iter_init_seq_net(priv_data, aux);
3640         if (err)
3641                 return err;
3642
3643         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3644         if (err) {
3645                 bpf_iter_fini_seq_net(priv_data);
3646                 return err;
3647         }
3648
3649         return 0;
3650 }
3651
3652 static void bpf_iter_fini_unix(void *priv_data)
3653 {
3654         struct bpf_unix_iter_state *iter = priv_data;
3655
3656         bpf_iter_fini_seq_net(priv_data);
3657         kvfree(iter->batch);
3658 }
3659
3660 static const struct bpf_iter_seq_info unix_seq_info = {
3661         .seq_ops                = &bpf_iter_unix_seq_ops,
3662         .init_seq_private       = bpf_iter_init_unix,
3663         .fini_seq_private       = bpf_iter_fini_unix,
3664         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3665 };
3666
3667 static const struct bpf_func_proto *
3668 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3669                              const struct bpf_prog *prog)
3670 {
3671         switch (func_id) {
3672         case BPF_FUNC_setsockopt:
3673                 return &bpf_sk_setsockopt_proto;
3674         case BPF_FUNC_getsockopt:
3675                 return &bpf_sk_getsockopt_proto;
3676         default:
3677                 return NULL;
3678         }
3679 }
3680
3681 static struct bpf_iter_reg unix_reg_info = {
3682         .target                 = "unix",
3683         .ctx_arg_info_size      = 1,
3684         .ctx_arg_info           = {
3685                 { offsetof(struct bpf_iter__unix, unix_sk),
3686                   PTR_TO_BTF_ID_OR_NULL },
3687         },
3688         .get_func_proto         = bpf_iter_unix_get_func_proto,
3689         .seq_info               = &unix_seq_info,
3690 };
3691
3692 static void __init bpf_iter_register(void)
3693 {
3694         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3695         if (bpf_iter_reg_target(&unix_reg_info))
3696                 pr_warn("Warning: could not register bpf iterator unix\n");
3697 }
3698 #endif
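
/*
 * Illustrative BPF-side sketch, not part of this file: a program for the
 * "unix" iterator target registered above, built as a separate object with
 * the usual libbpf workflow (vmlinux.h generated by bpftool).  The context
 * layout follows struct bpf_iter__unix; the program and section names here
 * are only an example.
 */
#if 0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/unix")
int dump_unix(struct bpf_iter__unix *ctx)
{
        struct unix_sock *unix_sk = ctx->unix_sk;
        struct seq_file *seq = ctx->meta->seq;

        if (!unix_sk)           /* final invocation from stop() */
                return 0;

        BPF_SEQ_PRINTF(seq, "uid=%u type=%u\n",
                       ctx->uid, unix_sk->sk.sk_type);
        return 0;
}
#endif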
3699
3700 static int __init af_unix_init(void)
3701 {
3702         int i, rc = -1;
3703
3704         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3705
3706         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3707                 spin_lock_init(&bsd_socket_locks[i]);
3708                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3709         }
3710
3711         rc = proto_register(&unix_dgram_proto, 1);
3712         if (rc != 0) {
3713                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3714                 goto out;
3715         }
3716
3717         rc = proto_register(&unix_stream_proto, 1);
3718         if (rc != 0) {
3719                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3720                 proto_unregister(&unix_dgram_proto);
3721                 goto out;
3722         }
3723
3724         sock_register(&unix_family_ops);
3725         register_pernet_subsys(&unix_net_ops);
3726         unix_bpf_build_proto();
3727
3728 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3729         bpf_iter_register();
3730 #endif
3731
3732 out:
3733         return rc;
3734 }
3735
3736 /* Later than subsys_initcall() because we depend on stuff initialised there */
3737 fs_initcall(af_unix_init);