net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
  28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  29  *                                      by above two patches.
  30  *           Andrea Arcangeli   :       If possible we block in connect(2)
  31  *                                      if the max backlog of the listen socket
  32  *                                      has been reached. This won't break
  33  *                                      old apps and it will avoid a huge amount
  34  *                                      of socks hashed (this is for unix_gc()
  35  *                                      performance reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/filter.h>
  93 #include <linux/termios.h>
  94 #include <linux/sockios.h>
  95 #include <linux/net.h>
  96 #include <linux/in.h>
  97 #include <linux/fs.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <net/net_namespace.h>
 103 #include <net/sock.h>
 104 #include <net/tcp_states.h>
 105 #include <net/af_unix.h>
 106 #include <linux/proc_fs.h>
 107 #include <linux/seq_file.h>
 108 #include <net/scm.h>
 109 #include <linux/init.h>
 110 #include <linux/poll.h>
 111 #include <linux/rtnetlink.h>
 112 #include <linux/mount.h>
 113 #include <net/checksum.h>
 114 #include <linux/security.h>
 115 #include <linux/freezer.h>
 116 #include <linux/file.h>
 117 #include <linux/btf_ids.h>
 118
 119 #include "scm.h"
 120
     /* Global count of AF_UNIX sockets; unix_sock_destructor() decrements it. */
 121 static atomic_long_t unix_nr_socks;
     /* Hash table of sockets linked through sk_bind_node, with one spinlock
      * per bucket.  unix_find_socket_byinode() selects the bucket with
      * unix_bsd_hash() of the inode.
      */
 122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
 123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
 124
 125 /* SMP locking strategy:
 126  *    hash table is protected with spinlock.
 127  *    each socket state is protected by separate spinlock.
 128  */
 129
 130 static unsigned int unix_unbound_hash(struct sock *sk)
 131 {
 132         unsigned long hash = (unsigned long)sk;
 133
 134         hash ^= hash >> 16;
 135         hash ^= hash >> 8;
 136         hash ^= sk->sk_type;
 137
 138         return hash & UNIX_HASH_MOD;
 139 }
 140
 141 static unsigned int unix_bsd_hash(struct inode *i)
 142 {
 143         return i->i_ino & UNIX_HASH_MOD;
 144 }
 145
 146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 147                                        int addr_len, int type)
 148 {
 149         __wsum csum = csum_partial(sunaddr, addr_len, 0);
 150         unsigned int hash;
 151
 152         hash = (__force unsigned int)csum_fold(csum);
 153         hash ^= hash >> 8;
 154         hash ^= type;
 155
 156         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 157 }
 158
 159 static void unix_table_double_lock(struct net *net,
 160                                    unsigned int hash1, unsigned int hash2)
 161 {
 162         if (hash1 == hash2) {
 163                 spin_lock(&net->unx.table.locks[hash1]);
 164                 return;
 165         }
 166
 167         if (hash1 > hash2)
 168                 swap(hash1, hash2);
 169
 170         spin_lock(&net->unx.table.locks[hash1]);
 171         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
 172 }
 173
 174 static void unix_table_double_unlock(struct net *net,
 175                                      unsigned int hash1, unsigned int hash2)
 176 {
 177         if (hash1 == hash2) {
 178                 spin_unlock(&net->unx.table.locks[hash1]);
 179                 return;
 180         }
 181
 182         spin_unlock(&net->unx.table.locks[hash1]);
 183         spin_unlock(&net->unx.table.locks[hash2]);
 184 }
 185
 186 #ifdef CONFIG_SECURITY_NETWORK
 187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 188 {
 189         UNIXCB(skb).secid = scm->secid;
 190 }
 191
 192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 193 {
 194         scm->secid = UNIXCB(skb).secid;
 195 }
 196
 197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 198 {
 199         return (scm->secid == UNIXCB(skb).secid);
 200 }
 201 #else
 202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 203 { }
 204
 205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 206 { }
 207
 208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 209 {
 210         return true;
 211 }
 212 #endif /* CONFIG_SECURITY_NETWORK */
 213
     /* Peer pointer of an AF_UNIX socket.  Expands to an lvalue, so it is used
      * both for reading and for assignment (e.g. unix_peer(sk) = NULL).
      */
 214 #define unix_peer(sk) (unix_sk(sk)->peer)
 215
 216 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 217 {
 218         return unix_peer(osk) == sk;
 219 }
 220
 221 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 222 {
 223         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 224 }
 225
 226 static inline int unix_recvq_full(const struct sock *sk)
 227 {
 228         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 229 }
 230
 231 static inline int unix_recvq_full_lockless(const struct sock *sk)
 232 {
 233         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 234                 READ_ONCE(sk->sk_max_ack_backlog);
 235 }
 236
 237 struct sock *unix_peer_get(struct sock *s)
 238 {
 239         struct sock *peer;
 240
 241         unix_state_lock(s);
 242         peer = unix_peer(s);
 243         if (peer)
 244                 sock_hold(peer);
 245         unix_state_unlock(s);
 246         return peer;
 247 }
 248 EXPORT_SYMBOL_GPL(unix_peer_get);
 249
 250 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 251                                              int addr_len)
 252 {
 253         struct unix_address *addr;
 254
 255         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
 256         if (!addr)
 257                 return NULL;
 258
 259         refcount_set(&addr->refcnt, 1);
 260         addr->len = addr_len;
 261         memcpy(addr->name, sunaddr, addr_len);
 262
 263         return addr;
 264 }
 265
 266 static inline void unix_release_addr(struct unix_address *addr)
 267 {
 268         if (refcount_dec_and_test(&addr->refcnt))
 269                 kfree(addr);
 270 }
 271
 272 /*
 273  *      Check unix socket name:
 274  *              - should be not zero length.
 275  *              - if started by not zero, should be NULL terminated (FS object)
 276  *              - if started by zero, it is abstract name.
 277  */
 278
 279 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
 280 {
 281         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
 282             addr_len > sizeof(*sunaddr))
 283                 return -EINVAL;
 284
 285         if (sunaddr->sun_family != AF_UNIX)
 286                 return -EINVAL;
 287
 288         return 0;
 289 }
 290
 291 static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
 292 {
 293         /* This may look like an off by one error but it is a bit more
 294          * subtle.  108 is the longest valid AF_UNIX path for a binding.
 295          * sun_path[108] doesn't as such exist.  However in kernel space
 296          * we are guaranteed that it is a valid memory location in our
 297          * kernel address buffer because syscall functions always pass
 298          * a pointer of struct sockaddr_storage which has a bigger buffer
 299          * than 108.
 300          */
 301         ((char *)sunaddr)[addr_len] = 0;
 302 }
 303
 304 static void __unix_remove_socket(struct sock *sk)
 305 {
 306         sk_del_node_init(sk);
 307 }
 308
 309 static void __unix_insert_socket(struct net *net, struct sock *sk)
 310 {
 311         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 312         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
 313 }
 314
 315 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
 316                                  struct unix_address *addr, unsigned int hash)
 317 {
 318         __unix_remove_socket(sk);
 319         smp_store_release(&unix_sk(sk)->addr, addr);
 320
 321         sk->sk_hash = hash;
 322         __unix_insert_socket(net, sk);
 323 }
 324
 325 static void unix_remove_socket(struct net *net, struct sock *sk)
 326 {
 327         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 328         __unix_remove_socket(sk);
 329         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 330 }
 331
 332 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 333 {
 334         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 335         __unix_insert_socket(net, sk);
 336         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 337 }
 338
 339 static void unix_insert_bsd_socket(struct sock *sk)
 340 {
 341         spin_lock(&bsd_socket_locks[sk->sk_hash]);
 342         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
 343         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 344 }
 345
 346 static void unix_remove_bsd_socket(struct sock *sk)
 347 {
 348         if (!hlist_unhashed(&sk->sk_bind_node)) {
 349                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
 350                 __sk_del_bind_node(sk);
 351                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 352
 353                 sk_node_init(&sk->sk_bind_node);
 354         }
 355 }
 356
 357 static struct sock *__unix_find_socket_byname(struct net *net,
 358                                               struct sockaddr_un *sunname,
 359                                               int len, unsigned int hash)
 360 {
 361         struct sock *s;
 362
 363         sk_for_each(s, &net->unx.table.buckets[hash]) {
 364                 struct unix_sock *u = unix_sk(s);
 365
 366                 if (u->addr->len == len &&
 367                     !memcmp(u->addr->name, sunname, len))
 368                         return s;
 369         }
 370         return NULL;
 371 }
 372
 373 static inline struct sock *unix_find_socket_byname(struct net *net,
 374                                                    struct sockaddr_un *sunname,
 375                                                    int len, unsigned int hash)
 376 {
 377         struct sock *s;
 378
 379         spin_lock(&net->unx.table.locks[hash]);
 380         s = __unix_find_socket_byname(net, sunname, len, hash);
 381         if (s)
 382                 sock_hold(s);
 383         spin_unlock(&net->unx.table.locks[hash]);
 384         return s;
 385 }
 386
 387 static struct sock *unix_find_socket_byinode(struct inode *i)
 388 {
 389         unsigned int hash = unix_bsd_hash(i);
 390         struct sock *s;
 391
 392         spin_lock(&bsd_socket_locks[hash]);
 393         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
 394                 struct dentry *dentry = unix_sk(s)->path.dentry;
 395
 396                 if (dentry && d_backing_inode(dentry) == i) {
 397                         sock_hold(s);
 398                         spin_unlock(&bsd_socket_locks[hash]);
 399                         return s;
 400                 }
 401         }
 402         spin_unlock(&bsd_socket_locks[hash]);
 403         return NULL;
 404 }
 405
 406 /* Support code for asymmetrically connected dgram sockets
 407  *
 408  * If a datagram socket is connected to a socket not itself connected
 409  * to the first socket (eg, /dev/log), clients may only enqueue more
 410  * messages if the present receive queue of the server socket is not
 411  * "too large". This means there's a second writeability condition
 412  * poll and sendmsg need to test. The dgram recv code will do a wake
 413  * up on the peer_wait wait queue of a socket upon reception of a
 414  * datagram which needs to be propagated to sleeping would-be writers
 415  * since these might not have sent anything so far. This can't be
 416  * accomplished via poll_wait because the lifetime of the server
 417  * socket might be less than that of its clients if these break their
 418  * association with it or if the server socket is closed while clients
 419  * are still connected to it and there's no way to inform "a polling
 420  * implementation" that it should let go of a certain wait queue
 421  *
 422  * In order to propagate a wake up, a wait_queue_entry_t of the client
 423  * socket is enqueued on the peer_wait queue of the server socket
 424  * whose wake function does a wake_up on the ordinary client socket
 425  * wait queue. This connection is established whenever a write (or
 426  * poll for write) hit the flow control condition and broken when the
 427  * association to the server socket is dissolved or after a wake up
 428  * was relayed.
 429  */
 430
 431 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 432                                       void *key)
 433 {
 434         struct unix_sock *u;
 435         wait_queue_head_t *u_sleep;
 436
 437         u = container_of(q, struct unix_sock, peer_wake);
 438
 439         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 440                             q);
 441         u->peer_wake.private = NULL;
 442
 443         /* relaying can only happen while the wq still exists */
 444         u_sleep = sk_sleep(&u->sk);
 445         if (u_sleep)
 446                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 447
 448         return 0;
 449 }
 450
 451 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 452 {
 453         struct unix_sock *u, *u_other;
 454         int rc;
 455
 456         u = unix_sk(sk);
 457         u_other = unix_sk(other);
 458         rc = 0;
 459         spin_lock(&u_other->peer_wait.lock);
 460
 461         if (!u->peer_wake.private) {
 462                 u->peer_wake.private = other;
 463                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 464
 465                 rc = 1;
 466         }
 467
 468         spin_unlock(&u_other->peer_wait.lock);
 469         return rc;
 470 }
 471
 472 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 473                                             struct sock *other)
 474 {
 475         struct unix_sock *u, *u_other;
 476
 477         u = unix_sk(sk);
 478         u_other = unix_sk(other);
 479         spin_lock(&u_other->peer_wait.lock);
 480
 481         if (u->peer_wake.private == other) {
 482                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 483                 u->peer_wake.private = NULL;
 484         }
 485
 486         spin_unlock(&u_other->peer_wait.lock);
 487 }
 488
 489 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 490                                                    struct sock *other)
 491 {
 492         unix_dgram_peer_wake_disconnect(sk, other);
 493         wake_up_interruptible_poll(sk_sleep(sk),
 494                                    EPOLLOUT |
 495                                    EPOLLWRNORM |
 496                                    EPOLLWRBAND);
 497 }
 498
 499 /* preconditions:
 500  *      - unix_peer(sk) == other
 501  *      - association is stable
 502  */
 503 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 504 {
 505         int connected;
 506
 507         connected = unix_dgram_peer_wake_connect(sk, other);
 508
 509         /* If other is SOCK_DEAD, we want to make sure we signal
 510          * POLLOUT, such that a subsequent write() can get a
 511          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 512          * to other and its full, we will hang waiting for POLLOUT.
 513          */
 514         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 515                 return 1;
 516
 517         if (connected)
 518                 unix_dgram_peer_wake_disconnect(sk, other);
 519
 520         return 0;
 521 }
 522
 523 static int unix_writable(const struct sock *sk)
 524 {
 525         return sk->sk_state != TCP_LISTEN &&
 526                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 527 }
 528
 529 static void unix_write_space(struct sock *sk)
 530 {
 531         struct socket_wq *wq;
 532
 533         rcu_read_lock();
 534         if (unix_writable(sk)) {
 535                 wq = rcu_dereference(sk->sk_wq);
 536                 if (skwq_has_sleeper(wq))
 537                         wake_up_interruptible_sync_poll(&wq->wait,
 538                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 539                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 540         }
 541         rcu_read_unlock();
 542 }
 543
 544 /* When dgram socket disconnects (or changes its peer), we clear its receive
 545  * queue of packets arrived from previous peer. First, it allows to do
 546  * flow control based only on wmem_alloc; second, sk connected to peer
 547  * may receive messages only from that peer. */
 548 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 549 {
 550         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 551                 skb_queue_purge(&sk->sk_receive_queue);
 552                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 553
 554                 /* If one link of bidirectional dgram pipe is disconnected,
 555                  * we signal error. Messages are lost. Do not make this,
 556                  * when peer was not connected to us.
 557                  */
 558                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 559                         other->sk_err = ECONNRESET;
 560                         sk_error_report(other);
 561                 }
 562         }
 563         other->sk_state = TCP_CLOSE;
 564 }
 565
 566 static void unix_sock_destructor(struct sock *sk)
 567 {
 568         struct unix_sock *u = unix_sk(sk);
 569
 570         skb_queue_purge(&sk->sk_receive_queue);
 571
 572         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 573         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 574         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
 575         if (!sock_flag(sk, SOCK_DEAD)) {
 576                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 577                 return;
 578         }
 579
 580         if (u->addr)
 581                 unix_release_addr(u->addr);
 582
 583         atomic_long_dec(&unix_nr_socks);
 584         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 585 #ifdef UNIX_REFCNT_DEBUG
 586         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 587                 atomic_long_read(&unix_nr_socks));
 588 #endif
 589 }
 590
     /* Tear down an AF_UNIX socket.
      *
      * Steps, in order: unhash the socket from both tables; under the state
      * lock orphan it, mark it fully shut down and closed, and detach its path
      * and peer; notify the peer; flush the receive queue; drop the path
      * reference and our own socket reference; finally run the fd garbage
      * collector if any fds are in flight.
      *
      * @sk:      socket being released
      * @embrion: non-zero when called recursively for a socket found on the
      *           receive queue of a listening socket; the peer then gets
      *           ECONNRESET even if nothing is queued on @sk.
      */
 591 static void unix_release_sock(struct sock *sk, int embrion)
 592 {
 593         struct unix_sock *u = unix_sk(sk);
 594         struct sock *skpair;
 595         struct sk_buff *skb;
 596         struct path path;
 597         int state;
 598
 599         unix_remove_socket(sock_net(sk), sk);
 600         unix_remove_bsd_socket(sk);
 601
 602         /* Clear state */
 603         unix_state_lock(sk);
 604         sock_orphan(sk);
 605         sk->sk_shutdown = SHUTDOWN_MASK;
             /* Take over the path; it is put after the lock is dropped. */
 606         path         = u->path;
 607         u->path.dentry = NULL;
 608         u->path.mnt = NULL;
             /* Remember the old state: a listener's queue needs special care. */
 609         state = sk->sk_state;
 610         sk->sk_state = TCP_CLOSE;
 611
 612         skpair = unix_peer(sk);
 613         unix_peer(sk) = NULL;
 614
 615         unix_state_unlock(sk);
 616
 617 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 618         if (u->oob_skb) {
 619                 kfree_skb(u->oob_skb);
 620                 u->oob_skb = NULL;
 621         }
 622 #endif
 623
 624         wake_up_interruptible_all(&u->peer_wait);
 625
 626         if (skpair != NULL) {
 627                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 628                         unix_state_lock(skpair);
 629                         /* No more writes */
 630                         skpair->sk_shutdown = SHUTDOWN_MASK;
                             /* Unread data on our side (or an embryo) means a reset. */
 631                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 632                                 skpair->sk_err = ECONNRESET;
 633                         unix_state_unlock(skpair);
 634                         skpair->sk_state_change(skpair);
 635                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 636                 }
 637
 638                 unix_dgram_peer_wake_disconnect(sk, skpair);
 639                 sock_put(skpair); /* It may now die */
 640         }
 641
 642         /* Try to flush out this socket. Throw out buffers at least */
 643
 644         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                     /* On a listener each queued skb stands for a pending
                      * connection; release that socket as an embryo.
                      */
 645                 if (state == TCP_LISTEN)
 646                         unix_release_sock(skb->sk, 1);
 647                 /* passed fds are erased in the kfree_skb hook        */
 648                 UNIXCB(skb).consumed = skb->len;
 649                 kfree_skb(skb);
 650         }
 651
 652         if (path.dentry)
 653                 path_put(&path);
 654
 655         sock_put(sk);
 656
 657         /* ---- Socket is dead now and most probably destroyed ---- */
 658
 659         /*
 660          * Fixme: BSD difference: In BSD all sockets connected to us get
 661          *        ECONNRESET and we die on the spot. In Linux we behave
 662          *        like files and pipes do and wait for the last
 663          *        dereference.
 664          *
 665          * Can't we simply set sock->err?
 666          *
 667          *        What the above comment does talk about? --ANK(980817)
 668          */
 669
 670         if (unix_tot_inflight)
 671                 unix_gc();              /* Garbage collect fds */
 672 }
 673
 674 static void init_peercred(struct sock *sk)
 675 {
 676         const struct cred *old_cred;
 677         struct pid *old_pid;
 678
 679         spin_lock(&sk->sk_peer_lock);
 680         old_pid = sk->sk_peer_pid;
 681         old_cred = sk->sk_peer_cred;
 682         sk->sk_peer_pid  = get_pid(task_tgid(current));
 683         sk->sk_peer_cred = get_current_cred();
 684         spin_unlock(&sk->sk_peer_lock);
 685
 686         put_pid(old_pid);
 687         put_cred(old_cred);
 688 }
 689
 690 static void copy_peercred(struct sock *sk, struct sock *peersk)
 691 {
 692         const struct cred *old_cred;
 693         struct pid *old_pid;
 694
 695         if (sk < peersk) {
 696                 spin_lock(&sk->sk_peer_lock);
 697                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 698         } else {
 699                 spin_lock(&peersk->sk_peer_lock);
 700                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 701         }
 702         old_pid = sk->sk_peer_pid;
 703         old_cred = sk->sk_peer_cred;
 704         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 705         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 706
 707         spin_unlock(&sk->sk_peer_lock);
 708         spin_unlock(&peersk->sk_peer_lock);
 709
 710         put_pid(old_pid);
 711         put_cred(old_cred);
 712 }
 713
 714 static int unix_listen(struct socket *sock, int backlog)
 715 {
 716         int err;
 717         struct sock *sk = sock->sk;
 718         struct unix_sock *u = unix_sk(sk);
 719
 720         err = -EOPNOTSUPP;
 721         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 722                 goto out;       /* Only stream/seqpacket sockets accept */
 723         err = -EINVAL;
 724         if (!u->addr)
 725                 goto out;       /* No listens on an unbound socket */
 726         unix_state_lock(sk);
 727         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 728                 goto out_unlock;
 729         if (backlog > sk->sk_max_ack_backlog)
 730                 wake_up_interruptible_all(&u->peer_wait);
 731         sk->sk_max_ack_backlog  = backlog;
 732         sk->sk_state            = TCP_LISTEN;
 733         /* set credentials so connect can copy them */
 734         init_peercred(sk);
 735         err = 0;
 736
 737 out_unlock:
 738         unix_state_unlock(sk);
 739 out:
 740         return err;
 741 }
 742
     /* Forward declarations of the socket-operation handlers.  They are needed
      * because the proto_ops tables that follow refer to these functions.
      */
 743 static int unix_release(struct socket *);
 744 static int unix_bind(struct socket *, struct sockaddr *, int);
 745 static int unix_stream_connect(struct socket *, struct sockaddr *,
 746                                int addr_len, int flags);
 747 static int unix_socketpair(struct socket *, struct socket *);
 748 static int unix_accept(struct socket *, struct socket *, int, bool);
 749 static int unix_getname(struct socket *, struct sockaddr *, int);
 750 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 751 static __poll_t unix_dgram_poll(struct file *, struct socket *,
 752                                     poll_table *);
 753 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 754 #ifdef CONFIG_COMPAT
 755 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 756 #endif
 757 static int unix_shutdown(struct socket *, int);
 758 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 759 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 760 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
 761                                     size_t size, int flags);
 762 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 763                                        struct pipe_inode_info *, size_t size,
 764                                        unsigned int flags);
 765 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 766 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 767 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 768 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 769 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 770                               int, int);
 771 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 772 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 773                                   int);
 774
 775 static int unix_set_peek_off(struct sock *sk, int val)
 776 {
 777         struct unix_sock *u = unix_sk(sk);
 778
 779         if (mutex_lock_interruptible(&u->iolock))
 780                 return -EINTR;
 781
 782         sk->sk_peek_off = val;
 783         mutex_unlock(&u->iolock);
 784
 785         return 0;
 786 }
 787
 788 #ifdef CONFIG_PROC_FS
 789 static int unix_count_nr_fds(struct sock *sk)
 790 {
 791         struct sk_buff *skb;
 792         struct unix_sock *u;
 793         int nr_fds = 0;
 794
 795         spin_lock(&sk->sk_receive_queue.lock);
 796         skb = skb_peek(&sk->sk_receive_queue);
 797         while (skb) {
 798                 u = unix_sk(skb->sk);
 799                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
 800                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
 801         }
 802         spin_unlock(&sk->sk_receive_queue.lock);
 803
 804         return nr_fds;
 805 }
 806
 /*
  * Emit the "scm_fds" line for @sock into seq file @m.
  *
  * Nothing is printed when no sock is attached.  SOCK_DGRAM sockets
  * report their own scm_stat.nr_fds without taking the state lock.
  * Other types take the state lock: a listening socket reports the sum
  * computed by unix_count_nr_fds(), anything else its own counter.
  */
 807 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 808 {
 809         struct sock *sk = sock->sk;
 810         struct unix_sock *u;
 811         int nr_fds;
 812
 813         if (!sk)
 814                 return;
 815
 816         u = unix_sk(sk);
 817         if (sock->type == SOCK_DGRAM) {
 818                 nr_fds = atomic_read(&u->scm_stat.nr_fds);
 819         } else {
 820                 unix_state_lock(sk);
 821                 if (sk->sk_state == TCP_LISTEN)
 822                         nr_fds = unix_count_nr_fds(sk);
 823                 else
 824                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
 825                 unix_state_unlock(sk);
 826         }
 827
 828         seq_printf(m, "scm_fds: %u\n", nr_fds);
 829 }
 830 #else
 831 #define unix_show_fdinfo NULL
 832 #endif
 833
 /*
  * Socket-layer operations that unix_create() installs for SOCK_STREAM
  * sockets.  Compared with the datagram and seqpacket tables in this
  * file, this one polls through unix_poll (not unix_dgram_poll) and
  * supplies real .sendpage and .splice_read handlers.
  */
 834 static const struct proto_ops unix_stream_ops = {
 835         .family =       PF_UNIX,
 836         .owner =        THIS_MODULE,
 837         .release =      unix_release,
 838         .bind =         unix_bind,
 839         .connect =      unix_stream_connect,
 840         .socketpair =   unix_socketpair,
 841         .accept =       unix_accept,
 842         .getname =      unix_getname,
 843         .poll =         unix_poll,
 844         .ioctl =        unix_ioctl,
 845 #ifdef CONFIG_COMPAT
 846         .compat_ioctl = unix_compat_ioctl,
 847 #endif
 848         .listen =       unix_listen,
 849         .shutdown =     unix_shutdown,
 850         .sendmsg =      unix_stream_sendmsg,
 851         .recvmsg =      unix_stream_recvmsg,
 852         .read_skb =     unix_stream_read_skb,
 853         .mmap =         sock_no_mmap,
 854         .sendpage =     unix_stream_sendpage,
 855         .splice_read =  unix_stream_splice_read,
 856         .set_peek_off = unix_set_peek_off,
 857         .show_fdinfo =  unix_show_fdinfo,
 858 };
 859
 /*
  * Socket-layer operations that unix_create() installs for SOCK_DGRAM
  * sockets (SOCK_RAW requests are converted to SOCK_DGRAM there and end
  * up with this table as well).  .accept, .listen, .mmap and .sendpage
  * point at the generic sock_no_*() handlers.
  */
 860 static const struct proto_ops unix_dgram_ops = {
 861         .family =       PF_UNIX,
 862         .owner =        THIS_MODULE,
 863         .release =      unix_release,
 864         .bind =         unix_bind,
 865         .connect =      unix_dgram_connect,
 866         .socketpair =   unix_socketpair,
 867         .accept =       sock_no_accept,
 868         .getname =      unix_getname,
 869         .poll =         unix_dgram_poll,
 870         .ioctl =        unix_ioctl,
 871 #ifdef CONFIG_COMPAT
 872         .compat_ioctl = unix_compat_ioctl,
 873 #endif
 874         .listen =       sock_no_listen,
 875         .shutdown =     unix_shutdown,
 876         .sendmsg =      unix_dgram_sendmsg,
 877         .read_skb =     unix_read_skb,
 878         .recvmsg =      unix_dgram_recvmsg,
 879         .mmap =         sock_no_mmap,
 880         .sendpage =     sock_no_sendpage,
 881         .set_peek_off = unix_set_peek_off,
 882         .show_fdinfo =  unix_show_fdinfo,
 883 };
 884
 /*
  * Socket-layer operations that unix_create() installs for
  * SOCK_SEQPACKET sockets.  Connection setup (.connect, .accept,
  * .listen) uses the same handlers as the stream table, polling uses
  * unix_dgram_poll, and send/receive go through the seqpacket-specific
  * wrappers.  No .read_skb or .splice_read handler is provided.
  */
 885 static const struct proto_ops unix_seqpacket_ops = {
 886         .family =       PF_UNIX,
 887         .owner =        THIS_MODULE,
 888         .release =      unix_release,
 889         .bind =         unix_bind,
 890         .connect =      unix_stream_connect,
 891         .socketpair =   unix_socketpair,
 892         .accept =       unix_accept,
 893         .getname =      unix_getname,
 894         .poll =         unix_dgram_poll,
 895         .ioctl =        unix_ioctl,
 896 #ifdef CONFIG_COMPAT
 897         .compat_ioctl = unix_compat_ioctl,
 898 #endif
 899         .listen =       unix_listen,
 900         .shutdown =     unix_shutdown,
 901         .sendmsg =      unix_seqpacket_sendmsg,
 902         .recvmsg =      unix_seqpacket_recvmsg,
 903         .mmap =         sock_no_mmap,
 904         .sendpage =     sock_no_sendpage,
 905         .set_peek_off = unix_set_peek_off,
 906         .show_fdinfo =  unix_show_fdinfo,
 907 };
 908
 /*
  * Intentionally empty ->close() hook; both @sk and @timeout are unused.
  * unix_release() in this file invokes it through sk->sk_prot->close()
  * before releasing the sock.
  */
 909 static void unix_close(struct sock *sk, long timeout)
 910 {
 911         /* Nothing to do here, unix socket does not need a ->close().
 912          * This is merely for sockmap.
 913          */
 914 }
 915
 /*
  * Intentionally empty ->unhash() hook; @sk is unused.  Only
  * unix_stream_proto in this file wires it up.
  */
 916 static void unix_unhash(struct sock *sk)
 917 {
 918         /* Nothing to do here, unix socket does not need a ->unhash().
 919          * This is merely for sockmap.
 920          */
 921 }
 922
 /*
  * Protocol descriptor handed to sk_alloc() by unix_create1() for every
  * socket type other than SOCK_STREAM (i.e. datagram and seqpacket).
  * Unlike unix_stream_proto it has no ->unhash hook.
  */
 923 struct proto unix_dgram_proto = {
 924         .name                   = "UNIX",
 925         .owner                  = THIS_MODULE,
 926         .obj_size               = sizeof(struct unix_sock),
 927         .close                  = unix_close,
 928 #ifdef CONFIG_BPF_SYSCALL
 929         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
 930 #endif
 931 };
 932
 /*
  * Protocol descriptor handed to sk_alloc() by unix_create1() for
  * SOCK_STREAM sockets.  Both ->close and ->unhash are the empty stubs
  * defined above in this file.
  */
 933 struct proto unix_stream_proto = {
 934         .name                   = "UNIX-STREAM",
 935         .owner                  = THIS_MODULE,
 936         .obj_size               = sizeof(struct unix_sock),
 937         .close                  = unix_close,
 938         .unhash                 = unix_unhash,
 939 #ifdef CONFIG_BPF_SYSCALL
 940         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
 941 #endif
 942 };
 943
 /*
  * Allocate and initialise a new AF_UNIX sock.
  *
  * @net:  network namespace the sock is allocated in
  * @sock: struct socket to attach via sock_init_data(); callers in this
  *        file also pass NULL (see unix_stream_connect())
  * @kern: forwarded unchanged to sk_alloc()
  * @type: socket type; SOCK_STREAM selects unix_stream_proto, any other
  *        value selects unix_dgram_proto
  *
  * Returns the new sock, or ERR_PTR(-ENFILE) when the global socket
  * count exceeds twice get_max_files(), or ERR_PTR(-ENOMEM) when
  * sk_alloc() fails.
  */
 944 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 945 {
 946         struct unix_sock *u;
 947         struct sock *sk;
 948         int err;
 949
             /* Count the socket first; the err path below undoes this. */
 950         atomic_long_inc(&unix_nr_socks);
 951         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
 952                 err = -ENFILE;
 953                 goto err;
 954         }
 955
 956         if (type == SOCK_STREAM)
 957                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
 958         else /* dgram and seqpacket */
 959                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
 960
 961         if (!sk) {
 962                 err = -ENOMEM;
 963                 goto err;
 964         }
 965
 966         sock_init_data(sock, sk);
 967
 968         sk->sk_hash             = unix_unbound_hash(sk);
 969         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 970         sk->sk_write_space      = unix_write_space;
 971         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 972         sk->sk_destruct         = unix_sock_destructor;
 973         u         = unix_sk(sk);
 974         u->path.dentry = NULL;
 975         u->path.mnt = NULL;
 976         spin_lock_init(&u->lock);
 977         atomic_long_set(&u->inflight, 0);
 978         INIT_LIST_HEAD(&u->link);
 979         mutex_init(&u->iolock); /* single task reading lock */
 980         mutex_init(&u->bindlock); /* single task binding lock */
 981         init_waitqueue_head(&u->peer_wait);
 982         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 983         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
             /* Insert into the table only after every field above is set. */
 984         unix_insert_unbound_socket(net, sk);
 985
 986         sock_prot_inuse_add(net, sk->sk_prot, 1);
 987
 988         return sk;
 989
 990 err:
             /* Both failure paths arrive here with the counter already bumped. */
 991         atomic_long_dec(&unix_nr_socks);
 992         return ERR_PTR(err);
 993 }
 994
 /*
  * Create callback for AF_UNIX sockets: pick the proto_ops table that
  * matches sock->type and allocate the sock through unix_create1().
  *
  * @protocol must be 0 or PF_UNIX, otherwise -EPROTONOSUPPORT is
  * returned.  An unsupported socket type yields -ESOCKTNOSUPPORT (with
  * sock->state already set to SS_UNCONNECTED).  Failures of
  * unix_create1() are passed through; success returns 0.
  */
 995 static int unix_create(struct net *net, struct socket *sock, int protocol,
 996                        int kern)
 997 {
 998         const struct proto_ops *ops;
 999         struct sock *sk;
1000
1001         if (protocol != 0 && protocol != PF_UNIX)
1002                 return -EPROTONOSUPPORT;
1003
1004         sock->state = SS_UNCONNECTED;
1005
1006         /*
1007          *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1008          *      nothing uses it.  It is handled as SOCK_DGRAM.
1009          */
1010         if (sock->type == SOCK_RAW)
1011                 sock->type = SOCK_DGRAM;
1012
1013         switch (sock->type) {
1014         case SOCK_STREAM:
1015                 ops = &unix_stream_ops;
1016                 break;
1017         case SOCK_DGRAM:
1018                 ops = &unix_dgram_ops;
1019                 break;
1020         case SOCK_SEQPACKET:
1021                 ops = &unix_seqpacket_ops;
1022                 break;
1023         default:
1024                 return -ESOCKTNOSUPPORT;
1025         }
1026         sock->ops = ops;
1027
1028         sk = unix_create1(net, sock, kern, sock->type);
1029
1030         return IS_ERR(sk) ? PTR_ERR(sk) : 0;
1031 }
1032
/*
 * ->release handler shared by all three proto_ops tables.
 *
 * When a sock is attached, call its protocol ->close() hook, hand it to
 * unix_release_sock() and clear sock->sk.  Always returns 0.
 */
1033 static int unix_release(struct socket *sock)
1034 {
1035         struct sock *sk = sock->sk;
1036
1037         /* Nothing to tear down when no sock is attached. */
1038         if (sk) {
1039                 sk->sk_prot->close(sk, 0);
1040                 unix_release_sock(sk, 0);
1041                 sock->sk = NULL;
1042         }
1043
1044         return 0;
1045 }
1046
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The path is resolved with LOOKUP_FOLLOW and must be writable by the
 * caller (MAY_WRITE).  Returns the sock found via
 * unix_find_socket_byinode() after touching the path's atime, or an
 * ERR_PTR():
 *   - the kern_path() / path_permission() error,
 *   - -ECONNREFUSED if the inode is not a socket or no sock is found,
 *   - -EPROTOTYPE if the sock's type differs from @type.
 */
1047 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1048                                   int type)
1049 {
1050         struct inode *inode;
1051         struct path path;
1052         struct sock *sk;
1053         int err;
1054
1055         unix_mkname_bsd(sunaddr, addr_len);
1056         err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1057         if (err)
1058                 return ERR_PTR(err);
1059
1060         err = path_permission(&path, MAY_WRITE);
1061         if (err)
1062                 goto drop_path;
1063
1064         inode = d_backing_inode(path.dentry);
1065         if (!S_ISSOCK(inode->i_mode)) {
1066                 err = -ECONNREFUSED;
1067                 goto drop_path;
1068         }
1069
1070         sk = unix_find_socket_byinode(inode);
1071         if (!sk) {
1072                 err = -ECONNREFUSED;
1073                 goto drop_path;
1074         }
1075
1076         if (sk->sk_type != type) {
1077                 err = -EPROTOTYPE;
1078                 sock_put(sk);
1079                 goto drop_path;
1080         }
1081
1082         touch_atime(&path);
1083         path_put(&path);
1084         return sk;
1085
1086 drop_path:
1087         path_put(&path);
1088         return ERR_PTR(err);
1089 }
1090
/*
 * Look up a socket by name in @net, using the hash that
 * unix_abstract_hash() computes from the address and @type.
 *
 * Returns the sock found by unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) when there is none.  If the sock found has a
 * dentry recorded in its path, that path's atime is touched.
 */
1091 static struct sock *unix_find_abstract(struct net *net,
1092                                        struct sockaddr_un *sunaddr,
1093                                        int addr_len, int type)
1094 {
1095         unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1096         struct sock *found;
1097         struct path *bound;
1098
1099         found = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1100         if (!found)
1101                 return ERR_PTR(-ECONNREFUSED);
1102
1103         /* Only touch the atime when a dentry is actually recorded. */
1104         bound = &unix_sk(found)->path;
1105         if (bound->dentry)
1106                 touch_atime(bound);
1107         return found;
1108 }
1109
/*
 * Resolve @sunaddr to a sock of the given @type.
 *
 * Dispatches on the first byte of sun_path: non-zero goes to
 * unix_find_bsd(), zero goes to unix_find_abstract() in @net.  The
 * helper's result (sock or ERR_PTR) is returned unchanged.
 */
1110 static struct sock *unix_find_other(struct net *net,
1111                                     struct sockaddr_un *sunaddr,
1112                                     int addr_len, int type)
1113 {
1114         /* A non-NUL first byte selects the filesystem path lookup. */
1115         if (sunaddr->sun_path[0])
1116                 return unix_find_bsd(sunaddr, addr_len, type);
1117
1118         /*
1119          * Leading NUL byte: look the name up by hash in @net instead.
1120          */
1121         return unix_find_abstract(net, sunaddr, addr_len, type);
1122 }
1123
/*
 * Bind @sk to an automatically chosen name.
 *
 * The generated name has sun_path[0] == 0 (the buffer comes from
 * kzalloc()) followed by five lowercase hex digits, which is why
 * addr->len is offsetof(sun_path) + 6.  Candidates start at a random
 * 20-bit value and are tried in increasing order, wrapping at 0xFFFFF.
 *
 * Runs under u->bindlock.  Returns 0 on success and also when the
 * socket already has an address; otherwise the error from
 * mutex_lock_interruptible(), -ENOMEM if the address cannot be
 * allocated, or -ENOSPC once every candidate was found in use.
 */
1124 static int unix_autobind(struct sock *sk)
1125 {
1126         unsigned int new_hash, old_hash = sk->sk_hash;
1127         struct unix_sock *u = unix_sk(sk);
1128         struct net *net = sock_net(sk);
1129         struct unix_address *addr;
1130         u32 lastnum, ordernum;
1131         int err;
1132
1133         err = mutex_lock_interruptible(&u->bindlock);
1134         if (err)
1135                 return err;
1136
             /* Already bound: leave with err == 0 from the lock call above. */
1137         if (u->addr)
1138                 goto out;
1139
1140         err = -ENOMEM;
1141         addr = kzalloc(sizeof(*addr) +
1142                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1143         if (!addr)
1144                 goto out;
1145
1146         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1147         addr->name->sun_family = AF_UNIX;
1148         refcount_set(&addr->refcnt, 1);
1149
             /* lastnum marks where the search wraps back to its start. */
1150         ordernum = get_random_u32();
1151         lastnum = ordernum & 0xFFFFF;
1152 retry:
1153         ordernum = (ordernum + 1) & 0xFFFFF;
1154         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1155
1156         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1157         unix_table_double_lock(net, old_hash, new_hash);
1158
1159         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1160                 unix_table_double_unlock(net, old_hash, new_hash);
1161
1162                 /* __unix_find_socket_byname() may take long time if many names
1163                  * are already in use.
1164                  */
1165                 cond_resched();
1166
1167                 if (ordernum == lastnum) {
1168                         /* Give up if all names seems to be in use. */
1169                         err = -ENOSPC;
1170                         unix_release_addr(addr);
1171                         goto out;
1172                 }
1173
1174                 goto retry;
1175         }
1176
             /* Name is free: install it while the table locks are still held. */
1177         __unix_set_addr_hash(net, sk, addr, new_hash);
1178         unix_table_double_unlock(net, old_hash, new_hash);
1179         err = 0;
1180
1181 out:    mutex_unlock(&u->bindlock);
1182         return err;
1183 }
1184
/*
 * Bind @sk to the filesystem path in @sunaddr.
 *
 * Creates a socket node at the path with vfs_mknod(), using the mode of
 * the socket's inode masked by the current umask, then records the
 * path and the new address on the socket under u->bindlock and the
 * hash table locks.
 *
 * Returns 0 on success.  On failure returns the error from
 * kern_path_create(), security_path_mknod(), vfs_mknod() or
 * mutex_lock_interruptible(), -ENOMEM if the address cannot be
 * allocated, or -EINVAL if the socket already has an address.  An
 * -EEXIST result is reported to the caller as -EADDRINUSE.
 */
1185 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1186                          int addr_len)
1187 {
1188         umode_t mode = S_IFSOCK |
1189                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1190         unsigned int new_hash, old_hash = sk->sk_hash;
1191         struct unix_sock *u = unix_sk(sk);
1192         struct net *net = sock_net(sk);
1193         struct user_namespace *ns; // barf...
1194         struct unix_address *addr;
1195         struct dentry *dentry;
1196         struct path parent;
1197         int err;
1198
             /* Recompute the length from the string: header + path + NUL. */
1199         unix_mkname_bsd(sunaddr, addr_len);
1200         addr_len = strlen(sunaddr->sun_path) +
1201                 offsetof(struct sockaddr_un, sun_path) + 1;
1202
1203         addr = unix_create_addr(sunaddr, addr_len);
1204         if (!addr)
1205                 return -ENOMEM;
1206
1207         /*
1208          * Get the parent directory, calculate the hash for last
1209          * component.
1210          */
1211         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1212         if (IS_ERR(dentry)) {
1213                 err = PTR_ERR(dentry);
1214                 goto out;
1215         }
1216
1217         /*
1218          * All right, let's create it.
1219          */
1220         ns = mnt_user_ns(parent.mnt);
1221         err = security_path_mknod(&parent, dentry, mode, 0);
1222         if (!err)
1223                 err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
1224         if (err)
1225                 goto out_path;
1226         err = mutex_lock_interruptible(&u->bindlock);
1227         if (err)
1228                 goto out_unlink;
             /* Already bound: out_unlock turns this into -EINVAL. */
1229         if (u->addr)
1230                 goto out_unlock;
1231
1232         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1233         unix_table_double_lock(net, old_hash, new_hash);
1234         u->path.mnt = mntget(parent.mnt);
1235         u->path.dentry = dget(dentry);
1236         __unix_set_addr_hash(net, sk, addr, new_hash);
1237         unix_table_double_unlock(net, old_hash, new_hash);
1238         unix_insert_bsd_socket(sk);
1239         mutex_unlock(&u->bindlock);
1240         done_path_create(&parent, dentry);
1241         return 0;
1242
1243 out_unlock:
1244         mutex_unlock(&u->bindlock);
1245         err = -EINVAL;
1246 out_unlink:
1247         /* failed after successful mknod?  unlink what we'd created... */
1248         vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
1249 out_path:
1250         done_path_create(&parent, dentry);
1251 out:
1252         unix_release_addr(addr);
1253         return err == -EEXIST ? -EADDRINUSE : err;
1254 }
1255
/*
 * Bind @sk to the name in @sunaddr without creating a filesystem node
 * (unix_bind() calls this when sun_path[0] is zero).
 *
 * The name is checked for existing users and installed under
 * u->bindlock and the hash table double lock.
 *
 * Returns 0 on success, -ENOMEM if the address cannot be allocated,
 * the error from mutex_lock_interruptible(), -EINVAL if the socket
 * already has an address, or -EADDRINUSE if the name is already
 * present in the table.  The address is released on every failure.
 */
1256 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1257                               int addr_len)
1258 {
1259         unsigned int new_hash, old_hash = sk->sk_hash;
1260         struct unix_sock *u = unix_sk(sk);
1261         struct net *net = sock_net(sk);
1262         struct unix_address *addr;
1263         int err;
1264
1265         addr = unix_create_addr(sunaddr, addr_len);
1266         if (!addr)
1267                 return -ENOMEM;
1268
1269         err = mutex_lock_interruptible(&u->bindlock);
1270         if (err)
1271                 goto out;
1272
1273         if (u->addr) {
1274                 err = -EINVAL;
1275                 goto out_mutex;
1276         }
1277
1278         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1279         unix_table_double_lock(net, old_hash, new_hash);
1280
             /* out_spin sets -EADDRINUSE after dropping the table locks. */
1281         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1282                 goto out_spin;
1283
1284         __unix_set_addr_hash(net, sk, addr, new_hash);
1285         unix_table_double_unlock(net, old_hash, new_hash);
1286         mutex_unlock(&u->bindlock);
1287         return 0;
1288
1289 out_spin:
1290         unix_table_double_unlock(net, old_hash, new_hash);
1291         err = -EADDRINUSE;
1292 out_mutex:
1293         mutex_unlock(&u->bindlock);
1294 out:
1295         unix_release_addr(addr);
1296         return err;
1297 }
1298
/*
 * ->bind handler shared by all three proto_ops tables.
 *
 * An address consisting only of the AF_UNIX family field (addr_len ==
 * offsetof(sun_path)) requests an automatic name via unix_autobind().
 * Anything else is validated with unix_validate_addr() and then bound
 * by unix_bind_bsd() or unix_bind_abstract(), depending on whether
 * sun_path starts with a non-zero byte.
 *
 * Returns 0 on success or the negative error of the step that failed.
 */
1299 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1300 {
1301         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1302         struct sock *sk = sock->sk;
1303         int err;
1304
1305         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1306             sunaddr->sun_family == AF_UNIX)
1307                 return unix_autobind(sk);
1308
1309         err = unix_validate_addr(sunaddr, addr_len);
1310         if (err)
1311                 return err;
1312
1313         /* A leading NUL byte selects the name-only bind. */
1314         if (!sunaddr->sun_path[0])
1315                 return unix_bind_abstract(sk, sunaddr, addr_len);
1316
1317         /* Otherwise the name is a path and a node gets created. */
1318         return unix_bind_bsd(sk, sunaddr, addr_len);
1319 }
1320
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * If @sk2 is NULL or the same sock as @sk1, only @sk1 is locked.
 * Otherwise the sock at the lower address is locked first and the
 * other one with the _nested variant, so that every caller acquires
 * the pair in the same order.
 */
1321 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1322 {
1323         struct sock *first, *second;
1324
1325         if (unlikely(sk1 == sk2) || !sk2) {
1326                 unix_state_lock(sk1);
1327                 return;
1328         }
1329
1330         first = sk1 < sk2 ? sk1 : sk2;
1331         second = sk1 < sk2 ? sk2 : sk1;
1332         unix_state_lock(first);
1333         unix_state_lock_nested(second);
1334 }
1335
/*
 * Counterpart of unix_state_double_lock(): always unlocks @sk1, and
 * unlocks @sk2 as well unless it is NULL or the same sock as @sk1.
 */
1336 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1337 {
1338         bool single = unlikely(sk1 == sk2) || !sk2;
1339
1340         unix_state_unlock(sk1);
1341
1342         if (!single)
1343                 unix_state_unlock(sk2);
1344 }
1345
/*
 * ->connect handler of unix_dgram_ops: set, replace or clear the peer
 * of a datagram socket.
 *
 * With an AF_UNSPEC address the current association (if any) is
 * dissolved and sk_state becomes TCP_CLOSE.  Otherwise the target is
 * looked up with unix_find_other(), permission-checked, and installed
 * as the new peer; both socks are then marked TCP_ESTABLISHED.
 *
 * Returns 0 on success.  Errors: -EINVAL when @alen does not cover
 * sa_family, the error from unix_validate_addr(), unix_autobind() or
 * unix_find_other(), -EPERM when unix_may_send() refuses, or the error
 * from security_unix_may_send().
 */
1346 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1347                               int alen, int flags)
1348 {
1349         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1350         struct sock *sk = sock->sk;
1351         struct sock *other;
1352         int err;
1353
1354         err = -EINVAL;
1355         if (alen < offsetofend(struct sockaddr, sa_family))
1356                 goto out;
1357
1358         if (addr->sa_family != AF_UNSPEC) {
1359                 err = unix_validate_addr(sunaddr, alen);
1360                 if (err)
1361                         goto out;
1362
                     /* With SOCK_PASSCRED set, an unbound socket gets a name first. */
1363                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1364                     !unix_sk(sk)->addr) {
1365                         err = unix_autobind(sk);
1366                         if (err)
1367                                 goto out;
1368                 }
1369
1370 restart:
1371                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1372                 if (IS_ERR(other)) {
1373                         err = PTR_ERR(other);
1374                         goto out;
1375                 }
1376
1377                 unix_state_double_lock(sk, other);
1378
1379                 /* Apparently VFS overslept socket death. Retry. */
1380                 if (sock_flag(other, SOCK_DEAD)) {
1381                         unix_state_double_unlock(sk, other);
1382                         sock_put(other);
1383                         goto restart;
1384                 }
1385
1386                 err = -EPERM;
1387                 if (!unix_may_send(sk, other))
1388                         goto out_unlock;
1389
1390                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1391                 if (err)
1392                         goto out_unlock;
1393
1394                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1395         } else {
1396                 /*
1397                  *      1003.1g breaking connected state with AF_UNSPEC
1398                  */
1399                 other = NULL;
1400                 unix_state_double_lock(sk, other);
1401         }
1402
1403         /*
1404          * If it was connected, reconnect.
1405          */
1406         if (unix_peer(sk)) {
1407                 struct sock *old_peer = unix_peer(sk);
1408
1409                 unix_peer(sk) = other;
1410                 if (!other)
1411                         sk->sk_state = TCP_CLOSE;
1412                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1413
1414                 unix_state_double_unlock(sk, other);
1415
                     /* Done after unlocking; skipped when reconnecting to the same peer. */
1416                 if (other != old_peer)
1417                         unix_dgram_disconnected(sk, old_peer);
1418                 sock_put(old_peer);
1419         } else {
1420                 unix_peer(sk) = other;
1421                 unix_state_double_unlock(sk, other);
1422         }
1423
             /*
              * NOTE(review): on success no sock_put(other) is done here, so the
              * reference from unix_find_other() presumably stays with
              * unix_peer(sk) - confirm against unix_find_socket_by*().
              */
1424         return 0;
1425
1426 out_unlock:
1427         unix_state_double_unlock(sk, other);
1428         sock_put(other);
1429 out:
1430         return err;
1431 }
1432
/*
 * Sleep until @other's peer_wait queue is woken or @timeo expires.
 *
 * Must be entered with @other's state lock held; the lock is dropped
 * here (see the __releases annotation) and is NOT re-taken.  The task
 * is queued on peer_wait before the conditions are sampled and before
 * the lock is released.  It only actually sleeps if @other is not
 * dead, has not shut down receiving, and its receive queue is full.
 *
 * Returns the remaining timeout as reported by schedule_timeout(), or
 * @timeo unchanged when no sleep took place.
 */
1433 static long unix_wait_for_peer(struct sock *other, long timeo)
1434         __releases(&unix_sk(other)->lock)
1435 {
1436         struct unix_sock *u = unix_sk(other);
1437         int sched;
1438         DEFINE_WAIT(wait);
1439
1440         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1441
1442         sched = !sock_flag(other, SOCK_DEAD) &&
1443                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1444                 unix_recvq_full(other);
1445
1446         unix_state_unlock(other);
1447
1448         if (sched)
1449                 timeo = schedule_timeout(timeo);
1450
1451         finish_wait(&u->peer_wait, &wait);
1452         return timeo;
1453 }
1454
/*
 * ->connect handler of unix_stream_ops and unix_seqpacket_ops.
 *
 * Allocates the sock that will become the server-side end (newsk) and
 * a one-byte skb owned by it, finds the listening sock, links sk and
 * newsk as peers, and queues the skb on the listener's receive queue,
 * where unix_accept() picks newsk up through skb->sk.
 *
 * Returns 0 on success.  Errors visible here:
 *   - the error from unix_validate_addr(), unix_autobind(),
 *     unix_create1() or unix_find_other();
 *   - -ENOMEM if the skb cannot be allocated;
 *   - -ECONNREFUSED if the target is not in TCP_LISTEN or has
 *     RCV_SHUTDOWN set;
 *   - -EAGAIN if the listener's queue is full and the send timeout is 0;
 *   - sock_intr_errno(timeo) if a signal is pending after waiting;
 *   - -EISCONN if sk is already TCP_ESTABLISHED, -EINVAL for any other
 *     state except TCP_CLOSE;
 *   - the error from security_unix_stream_connect().
 */
1455 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1456                                int addr_len, int flags)
1457 {
1458         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1459         struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1460         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1461         struct net *net = sock_net(sk);
1462         struct sk_buff *skb = NULL;
1463         long timeo;
1464         int err;
1465         int st;
1466
1467         err = unix_validate_addr(sunaddr, addr_len);
1468         if (err)
1469                 goto out;
1470
1471         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1472                 err = unix_autobind(sk);
1473                 if (err)
1474                         goto out;
1475         }
1476
1477         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1478
1479         /* First of all allocate resources.
1480            If we will make it after state is locked,
1481            we will have to recheck all again in any case.
1482          */
1483
1484         /* create new sock for complete connection */
1485         newsk = unix_create1(net, NULL, 0, sock->type);
1486         if (IS_ERR(newsk)) {
1487                 err = PTR_ERR(newsk);
1488                 newsk = NULL;
1489                 goto out;
1490         }
1491
1492         err = -ENOMEM;
1493
1494         /* Allocate skb for sending to listening sock */
1495         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1496         if (skb == NULL)
1497                 goto out;
1498
1499 restart:
1500         /*  Find listening sock. */
1501         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1502         if (IS_ERR(other)) {
1503                 err = PTR_ERR(other);
1504                 other = NULL;
1505                 goto out;
1506         }
1507
1508         /* Latch state of peer */
1509         unix_state_lock(other);
1510
1511         /* Apparently VFS overslept socket death. Retry. */
1512         if (sock_flag(other, SOCK_DEAD)) {
1513                 unix_state_unlock(other);
1514                 sock_put(other);
1515                 goto restart;
1516         }
1517
1518         err = -ECONNREFUSED;
1519         if (other->sk_state != TCP_LISTEN)
1520                 goto out_unlock;
1521         if (other->sk_shutdown & RCV_SHUTDOWN)
1522                 goto out_unlock;
1523
1524         if (unix_recvq_full(other)) {
1525                 err = -EAGAIN;
1526                 if (!timeo)
1527                         goto out_unlock;
1528
                     /*
                      * unix_wait_for_peer() returns with other's state lock
                      * released, hence "goto out" (not out_unlock) below.
                      */
1529                 timeo = unix_wait_for_peer(other, timeo);
1530
1531                 err = sock_intr_errno(timeo);
1532                 if (signal_pending(current))
1533                         goto out;
1534                 sock_put(other);
1535                 goto restart;
1536         }
1537
1538         /* Latch our state.
1539
1540            It is tricky place. We need to grab our state lock and cannot
1541            drop lock on peer. It is dangerous because deadlock is
1542            possible. Connect to self case and simultaneous
1543            attempt to connect are eliminated by checking socket
1544            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1545            check this before attempt to grab lock.
1546
1547            Well, and we have to recheck the state after socket locked.
1548          */
1549         st = sk->sk_state;
1550
1551         switch (st) {
1552         case TCP_CLOSE:
1553                 /* This is ok... continue with connect */
1554                 break;
1555         case TCP_ESTABLISHED:
1556                 /* Socket is already connected */
1557                 err = -EISCONN;
1558                 goto out_unlock;
1559         default:
1560                 err = -EINVAL;
1561                 goto out_unlock;
1562         }
1563
1564         unix_state_lock_nested(sk);
1565
             /* State changed between the unlocked read and the lock: start over. */
1566         if (sk->sk_state != st) {
1567                 unix_state_unlock(sk);
1568                 unix_state_unlock(other);
1569                 sock_put(other);
1570                 goto restart;
1571         }
1572
1573         err = security_unix_stream_connect(sk, other, newsk);
1574         if (err) {
1575                 unix_state_unlock(sk);
1576                 goto out_unlock;
1577         }
1578
1579         /* The way is open! Fastly set all the necessary fields... */
1580
1581         sock_hold(sk);
1582         unix_peer(newsk)        = sk;
1583         newsk->sk_state         = TCP_ESTABLISHED;
1584         newsk->sk_type          = sk->sk_type;
1585         init_peercred(newsk);
1586         newu = unix_sk(newsk);
1587         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1588         otheru = unix_sk(other);
1589
1590         /* copy address information from listening to new sock
1591          *
1592          * The contents of *(otheru->addr) and otheru->path
1593          * are seen fully set up here, since we have found
1594          * otheru in hash under its lock.  Insertion into the
1595          * hash chain we'd found it in had been done in an
1596          * earlier critical area protected by the chain's lock,
1597          * the same one where we'd set *(otheru->addr) contents,
1598          * as well as otheru->path and otheru->addr itself.
1599          *
1600          * Using smp_store_release() here to set newu->addr
1601          * is enough to make those stores, as well as stores
1602          * to newu->path visible to anyone who gets newu->addr
1603          * by smp_load_acquire().  IOW, the same warranties
1604          * as for unix_sock instances bound in unix_bind() or
1605          * in unix_autobind().
1606          */
1607         if (otheru->path.dentry) {
1608                 path_get(&otheru->path);
1609                 newu->path = otheru->path;
1610         }
1611         refcount_inc(&otheru->addr->refcnt);
1612         smp_store_release(&newu->addr, otheru->addr);
1613
1614         /* Set credentials */
1615         copy_peercred(sk, other);
1616
1617         sock->state     = SS_CONNECTED;
1618         sk->sk_state    = TCP_ESTABLISHED;
1619         sock_hold(newsk);
1620
1621         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1622         unix_peer(sk)   = newsk;
1623
1624         unix_state_unlock(sk);
1625
1626         /* take ten and send info to listening sock */
1627         spin_lock(&other->sk_receive_queue.lock);
1628         __skb_queue_tail(&other->sk_receive_queue, skb);
1629         spin_unlock(&other->sk_receive_queue.lock);
1630         unix_state_unlock(other);
1631         other->sk_data_ready(other);
1632         sock_put(other);
1633         return 0;
1634
1635 out_unlock:
1636         if (other)
1637                 unix_state_unlock(other);
1638
1639 out:
             /* Shared cleanup: skb, newsk and other may each still be NULL here. */
1640         kfree_skb(skb);
1641         if (newsk)
1642                 unix_release_sock(newsk, 0);
1643         if (other)
1644                 sock_put(other);
1645         return err;
1646 }
1647
/*
 * ->socketpair handler: connect the socks of @socka and @sockb to each
 * other.
 *
 * Each sock takes a reference on the other, becomes the other's peer,
 * gets its peer credentials initialised, and both the sock and socket
 * state are switched to connected.  Always returns 0.
 */
1648 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1649 {
1650         struct sock *first = socka->sk;
1651         struct sock *second = sockb->sk;
1652
1653         /* Join our sockets back to back */
1654         sock_hold(first);
1655         sock_hold(second);
1656         unix_peer(first) = second;
1657         unix_peer(second) = first;
1658         init_peercred(first);
1659         init_peercred(second);
1660         first->sk_state = TCP_ESTABLISHED;
1661         second->sk_state = TCP_ESTABLISHED;
1662         socka->state = SS_CONNECTED;
1663         sockb->state = SS_CONNECTED;
1664         return 0;
1665 }
1666
/*
 * Copy the SOCK_PASSCRED and SOCK_PASSSEC flag bits from @old to @new.
 * Bits are only ever set in @new, never cleared.  unix_accept() uses
 * this to pass the listener's settings on to the accepted socket.
 */
1667 static void unix_sock_inherit_flags(const struct socket *old,
1668                                     struct socket *new)
1669 {
1670         if (test_bit(SOCK_PASSCRED, &old->flags))
1671                 set_bit(SOCK_PASSCRED, &new->flags);
1672         if (test_bit(SOCK_PASSSEC, &old->flags))
1673                 set_bit(SOCK_PASSSEC, &new->flags);
1674 }
1675
/*
 * ->accept handler of unix_stream_ops and unix_seqpacket_ops.
 *
 * Dequeues one skb from the listener's receive queue; the sock that
 * owns it (skb->sk) is the already-connected server-side end created
 * by unix_stream_connect().  That sock is grafted onto @newsock.
 * @kern is not used.
 *
 * Returns 0 on success, -EOPNOTSUPP for socket types other than
 * SOCK_STREAM / SOCK_SEQPACKET, -EINVAL if @sock is not listening or
 * skb_recv_datagram() returns no skb with a zero error, or otherwise
 * the error reported by skb_recv_datagram().
 */
1676 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1677                        bool kern)
1678 {
1679         struct sock *sk = sock->sk;
1680         struct sock *tsk;
1681         struct sk_buff *skb;
1682         int err;
1683
1684         err = -EOPNOTSUPP;
1685         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1686                 goto out;
1687
1688         err = -EINVAL;
1689         if (sk->sk_state != TCP_LISTEN)
1690                 goto out;
1691
1692         /* If socket state is TCP_LISTEN it cannot change (for now...),
1693          * so that no locks are necessary.
1694          */
1695
1696         skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1697                                 &err);
1698         if (!skb) {
1699                 /* This means receive shutdown. */
1700                 if (err == 0)
1701                         err = -EINVAL;
1702                 goto out;
1703         }
1704
1705         tsk = skb->sk;
1706         skb_free_datagram(sk, skb);
             /* A queue slot was freed: wake whoever sleeps on peer_wait. */
1707         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1708
1709         /* attach accepted sock to socket */
1710         unix_state_lock(tsk);
1711         newsock->state = SS_CONNECTED;
1712         unix_sock_inherit_flags(sock, newsock);
1713         sock_graft(tsk, newsock);
1714         unix_state_unlock(tsk);
1715         return 0;
1716
1717 out:
1718         return err;
1719 }
1720
1721
/*
 * ->getname handler: copy the address of @sock (or of its peer when
 * @peer is non-zero) into @uaddr.
 *
 * Returns the number of address bytes written: addr->len for a sock
 * that has an address, otherwise offsetof(struct sockaddr_un,
 * sun_path) with only the family and an empty first path byte filled
 * in.  Returns -ENOTCONN when the peer is requested and
 * unix_peer_get() yields none.
 */
1722 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1723 {
1724         struct sock *sk = sock->sk;
1725         struct unix_address *addr;
1726         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1727         int len;
1728
1729         /* Pick the sock to read; both paths are paired with sock_put(). */
1730         if (!peer) {
1731                 sock_hold(sk);
1732         } else {
1733                 sk = unix_peer_get(sk);
1734                 if (!sk)
1735                         return -ENOTCONN;
1736         }
1737
1738         addr = smp_load_acquire(&unix_sk(sk)->addr);
1739         if (addr) {
1740                 len = addr->len;
1741                 memcpy(sunaddr, addr->name, addr->len);
1742         } else {
1743                 /* No address set: report the bare family only. */
1744                 sunaddr->sun_family = AF_UNIX;
1745                 sunaddr->sun_path[0] = 0;
1746                 len = offsetof(struct sockaddr_un, sun_path);
1747         }
1748
1749         sock_put(sk);
1750
1751         return len;
1752 }
1753
1754 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1755 {
1756         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1757
1758         /*
1759          * Garbage collection of unix sockets starts by selecting a set of
1760          * candidate sockets which have reference only from being in flight
1761          * (total_refs == inflight_refs).  This condition is checked once during
1762          * the candidate collection phase, and candidates are marked as such, so
1763          * that non-candidates can later be ignored.  While inflight_refs is
1764          * protected by unix_gc_lock, total_refs (file count) is not, hence this
1765          * is an instantaneous decision.
1766          *
1767          * Once a candidate, however, the socket must not be reinstalled into a
1768          * file descriptor while the garbage collection is in progress.
1769          *
1770          * If the above conditions are met, then the directed graph of
1771          * candidates (*) does not change while unix_gc_lock is held.
1772          *
1773          * Any operations that changes the file count through file descriptors
1774          * (dup, close, sendmsg) does not change the graph since candidates are
1775          * not installed in fds.
1776          *
1777          * Dequeing a candidate via recvmsg would install it into an fd, but
1778          * that takes unix_gc_lock to decrement the inflight count, so it's
1779          * serialized with garbage collection.
1780          *
1781          * MSG_PEEK is special in that it does not change the inflight count,
1782          * yet does install the socket into an fd.  The following lock/unlock
1783          * pair is to ensure serialization with garbage collection.  It must be
1784          * done between incrementing the file count and installing the file into
1785          * an fd.
1786          *
1787          * If garbage collection starts after the barrier provided by the
1788          * lock/unlock, then it will see the elevated refcount and not mark this
1789          * as a candidate.  If a garbage collection is already in progress
1790          * before the file count was incremented, then the lock/unlock pair will
1791          * ensure that garbage collection is finished before progressing to
1792          * installing the fd.
1793          *
1794          * (*) A -> B where B is on the queue of A or B is on the queue of C
1795          * which is on the queue of listening socket A.
1796          */
1797         spin_lock(&unix_gc_lock);
1798         spin_unlock(&unix_gc_lock);
1799 }
1800
1801 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1802 {
1803         int err = 0;
1804
1805         UNIXCB(skb).pid  = get_pid(scm->pid);
1806         UNIXCB(skb).uid = scm->creds.uid;
1807         UNIXCB(skb).gid = scm->creds.gid;
1808         UNIXCB(skb).fp = NULL;
1809         unix_get_secdata(scm, skb);
1810         if (scm->fp && send_fds)
1811                 err = unix_attach_fds(scm, skb);
1812
1813         skb->destructor = unix_destruct_scm;
1814         return err;
1815 }
1816
1817 static bool unix_passcred_enabled(const struct socket *sock,
1818                                   const struct sock *other)
1819 {
1820         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1821                !other->sk_socket ||
1822                test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1823 }
1824
1825 /*
1826  * Some apps rely on write() giving SCM_CREDENTIALS
1827  * We include credentials if source or destination socket
1828  * asserted SOCK_PASSCRED.
1829  */
1830 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1831                             const struct sock *other)
1832 {
1833         if (UNIXCB(skb).pid)
1834                 return;
1835         if (unix_passcred_enabled(sock, other)) {
1836                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1837                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1838         }
1839 }
1840
1841 static int maybe_init_creds(struct scm_cookie *scm,
1842                             struct socket *socket,
1843                             const struct sock *other)
1844 {
1845         int err;
1846         struct msghdr msg = { .msg_controllen = 0 };
1847
1848         err = scm_send(socket, &msg, scm, false);
1849         if (err)
1850                 return err;
1851
1852         if (unix_passcred_enabled(socket, other)) {
1853                 scm->pid = get_pid(task_tgid(current));
1854                 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1855         }
1856         return err;
1857 }
1858
1859 static bool unix_skb_scm_eq(struct sk_buff *skb,
1860                             struct scm_cookie *scm)
1861 {
1862         return UNIXCB(skb).pid == scm->pid &&
1863                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1864                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1865                unix_secdata_eq(scm, skb);
1866 }
1867
1868 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1869 {
1870         struct scm_fp_list *fp = UNIXCB(skb).fp;
1871         struct unix_sock *u = unix_sk(sk);
1872
1873         if (unlikely(fp && fp->count))
1874                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1875 }
1876
1877 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1878 {
1879         struct scm_fp_list *fp = UNIXCB(skb).fp;
1880         struct unix_sock *u = unix_sk(sk);
1881
1882         if (unlikely(fp && fp->count))
1883                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1884 }
1885
1886 /*
1887  *      Send AF_UNIX data.
1888  */
1889
1890 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1891                               size_t len)
1892 {
1893         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1894         struct sock *sk = sock->sk, *other = NULL;
1895         struct unix_sock *u = unix_sk(sk);
1896         struct scm_cookie scm;
1897         struct sk_buff *skb;
1898         int data_len = 0;
1899         int sk_locked;
1900         long timeo;
1901         int err;
1902
1903         wait_for_unix_gc();
1904         err = scm_send(sock, msg, &scm, false);
1905         if (err < 0)
1906                 return err;
1907
1908         err = -EOPNOTSUPP;
1909         if (msg->msg_flags&MSG_OOB)
1910                 goto out;
1911
1912         if (msg->msg_namelen) {
1913                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1914                 if (err)
1915                         goto out;
1916         } else {
1917                 sunaddr = NULL;
1918                 err = -ENOTCONN;
1919                 other = unix_peer_get(sk);
1920                 if (!other)
1921                         goto out;
1922         }
1923
1924         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1925                 err = unix_autobind(sk);
1926                 if (err)
1927                         goto out;
1928         }
1929
1930         err = -EMSGSIZE;
1931         if (len > sk->sk_sndbuf - 32)
1932                 goto out;
1933
1934         if (len > SKB_MAX_ALLOC) {
1935                 data_len = min_t(size_t,
1936                                  len - SKB_MAX_ALLOC,
1937                                  MAX_SKB_FRAGS * PAGE_SIZE);
1938                 data_len = PAGE_ALIGN(data_len);
1939
1940                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1941         }
1942
1943         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1944                                    msg->msg_flags & MSG_DONTWAIT, &err,
1945                                    PAGE_ALLOC_COSTLY_ORDER);
1946         if (skb == NULL)
1947                 goto out;
1948
1949         err = unix_scm_to_skb(&scm, skb, true);
1950         if (err < 0)
1951                 goto out_free;
1952
1953         skb_put(skb, len - data_len);
1954         skb->data_len = data_len;
1955         skb->len = len;
1956         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1957         if (err)
1958                 goto out_free;
1959
1960         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1961
1962 restart:
1963         if (!other) {
1964                 err = -ECONNRESET;
1965                 if (sunaddr == NULL)
1966                         goto out_free;
1967
1968                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1969                                         sk->sk_type);
1970                 if (IS_ERR(other)) {
1971                         err = PTR_ERR(other);
1972                         other = NULL;
1973                         goto out_free;
1974                 }
1975         }
1976
1977         if (sk_filter(other, skb) < 0) {
1978                 /* Toss the packet but do not return any error to the sender */
1979                 err = len;
1980                 goto out_free;
1981         }
1982
1983         sk_locked = 0;
1984         unix_state_lock(other);
1985 restart_locked:
1986         err = -EPERM;
1987         if (!unix_may_send(sk, other))
1988                 goto out_unlock;
1989
1990         if (unlikely(sock_flag(other, SOCK_DEAD))) {
1991                 /*
1992                  *      Check with 1003.1g - what should
1993                  *      datagram error
1994                  */
1995                 unix_state_unlock(other);
1996                 sock_put(other);
1997
1998                 if (!sk_locked)
1999                         unix_state_lock(sk);
2000
2001                 err = 0;
2002                 if (unix_peer(sk) == other) {
2003                         unix_peer(sk) = NULL;
2004                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2005
2006                         unix_state_unlock(sk);
2007
2008                         sk->sk_state = TCP_CLOSE;
2009                         unix_dgram_disconnected(sk, other);
2010                         sock_put(other);
2011                         err = -ECONNREFUSED;
2012                 } else {
2013                         unix_state_unlock(sk);
2014                 }
2015
2016                 other = NULL;
2017                 if (err)
2018                         goto out_free;
2019                 goto restart;
2020         }
2021
2022         err = -EPIPE;
2023         if (other->sk_shutdown & RCV_SHUTDOWN)
2024                 goto out_unlock;
2025
2026         if (sk->sk_type != SOCK_SEQPACKET) {
2027                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2028                 if (err)
2029                         goto out_unlock;
2030         }
2031
2032         /* other == sk && unix_peer(other) != sk if
2033          * - unix_peer(sk) == NULL, destination address bound to sk
2034          * - unix_peer(sk) == sk by time of get but disconnected before lock
2035          */
2036         if (other != sk &&
2037             unlikely(unix_peer(other) != sk &&
2038             unix_recvq_full_lockless(other))) {
2039                 if (timeo) {
2040                         timeo = unix_wait_for_peer(other, timeo);
2041
2042                         err = sock_intr_errno(timeo);
2043                         if (signal_pending(current))
2044                                 goto out_free;
2045
2046                         goto restart;
2047                 }
2048
2049                 if (!sk_locked) {
2050                         unix_state_unlock(other);
2051                         unix_state_double_lock(sk, other);
2052                 }
2053
2054                 if (unix_peer(sk) != other ||
2055                     unix_dgram_peer_wake_me(sk, other)) {
2056                         err = -EAGAIN;
2057                         sk_locked = 1;
2058                         goto out_unlock;
2059                 }
2060
2061                 if (!sk_locked) {
2062                         sk_locked = 1;
2063                         goto restart_locked;
2064                 }
2065         }
2066
2067         if (unlikely(sk_locked))
2068                 unix_state_unlock(sk);
2069
2070         if (sock_flag(other, SOCK_RCVTSTAMP))
2071                 __net_timestamp(skb);
2072         maybe_add_creds(skb, sock, other);
2073         scm_stat_add(other, skb);
2074         skb_queue_tail(&other->sk_receive_queue, skb);
2075         unix_state_unlock(other);
2076         other->sk_data_ready(other);
2077         sock_put(other);
2078         scm_destroy(&scm);
2079         return len;
2080
2081 out_unlock:
2082         if (sk_locked)
2083                 unix_state_unlock(sk);
2084         unix_state_unlock(other);
2085 out_free:
2086         kfree_skb(skb);
2087 out:
2088         if (other)
2089                 sock_put(other);
2090         scm_destroy(&scm);
2091         return err;
2092 }
2093
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 *
 * This is the upper bound on the paged part of each skb built by
 * unix_stream_sendmsg(), and its order is the largest page order that
 * function asks the allocator for.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2098
2099 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2100 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
2101 {
2102         struct unix_sock *ousk = unix_sk(other);
2103         struct sk_buff *skb;
2104         int err = 0;
2105
2106         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2107
2108         if (!skb)
2109                 return err;
2110
2111         skb_put(skb, 1);
2112         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2113
2114         if (err) {
2115                 kfree_skb(skb);
2116                 return err;
2117         }
2118
2119         unix_state_lock(other);
2120
2121         if (sock_flag(other, SOCK_DEAD) ||
2122             (other->sk_shutdown & RCV_SHUTDOWN)) {
2123                 unix_state_unlock(other);
2124                 kfree_skb(skb);
2125                 return -EPIPE;
2126         }
2127
2128         maybe_add_creds(skb, sock, other);
2129         skb_get(skb);
2130
2131         if (ousk->oob_skb)
2132                 consume_skb(ousk->oob_skb);
2133
2134         WRITE_ONCE(ousk->oob_skb, skb);
2135
2136         scm_stat_add(other, skb);
2137         skb_queue_tail(&other->sk_receive_queue, skb);
2138         sk_send_sigurg(other);
2139         unix_state_unlock(other);
2140         other->sk_data_ready(other);
2141
2142         return err;
2143 }
2144 #endif
2145
2146 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2147                                size_t len)
2148 {
2149         struct sock *sk = sock->sk;
2150         struct sock *other = NULL;
2151         int err, size;
2152         struct sk_buff *skb;
2153         int sent = 0;
2154         struct scm_cookie scm;
2155         bool fds_sent = false;
2156         int data_len;
2157
2158         wait_for_unix_gc();
2159         err = scm_send(sock, msg, &scm, false);
2160         if (err < 0)
2161                 return err;
2162
2163         err = -EOPNOTSUPP;
2164         if (msg->msg_flags & MSG_OOB) {
2165 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2166                 if (len)
2167                         len--;
2168                 else
2169 #endif
2170                         goto out_err;
2171         }
2172
2173         if (msg->msg_namelen) {
2174                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2175                 goto out_err;
2176         } else {
2177                 err = -ENOTCONN;
2178                 other = unix_peer(sk);
2179                 if (!other)
2180                         goto out_err;
2181         }
2182
2183         if (sk->sk_shutdown & SEND_SHUTDOWN)
2184                 goto pipe_err;
2185
2186         while (sent < len) {
2187                 size = len - sent;
2188
2189                 /* Keep two messages in the pipe so it schedules better */
2190                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2191
2192                 /* allow fallback to order-0 allocations */
2193                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2194
2195                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2196
2197                 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2198
2199                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2200                                            msg->msg_flags & MSG_DONTWAIT, &err,
2201                                            get_order(UNIX_SKB_FRAGS_SZ));
2202                 if (!skb)
2203                         goto out_err;
2204
2205                 /* Only send the fds in the first buffer */
2206                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2207                 if (err < 0) {
2208                         kfree_skb(skb);
2209                         goto out_err;
2210                 }
2211                 fds_sent = true;
2212
2213                 skb_put(skb, size - data_len);
2214                 skb->data_len = data_len;
2215                 skb->len = size;
2216                 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2217                 if (err) {
2218                         kfree_skb(skb);
2219                         goto out_err;
2220                 }
2221
2222                 unix_state_lock(other);
2223
2224                 if (sock_flag(other, SOCK_DEAD) ||
2225                     (other->sk_shutdown & RCV_SHUTDOWN))
2226                         goto pipe_err_free;
2227
2228                 maybe_add_creds(skb, sock, other);
2229                 scm_stat_add(other, skb);
2230                 skb_queue_tail(&other->sk_receive_queue, skb);
2231                 unix_state_unlock(other);
2232                 other->sk_data_ready(other);
2233                 sent += size;
2234         }
2235
2236 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2237         if (msg->msg_flags & MSG_OOB) {
2238                 err = queue_oob(sock, msg, other);
2239                 if (err)
2240                         goto out_err;
2241                 sent++;
2242         }
2243 #endif
2244
2245         scm_destroy(&scm);
2246
2247         return sent;
2248
2249 pipe_err_free:
2250         unix_state_unlock(other);
2251         kfree_skb(skb);
2252 pipe_err:
2253         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2254                 send_sig(SIGPIPE, current, 0);
2255         err = -EPIPE;
2256 out_err:
2257         scm_destroy(&scm);
2258         return sent ? : err;
2259 }
2260
2261 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2262                                     int offset, size_t size, int flags)
2263 {
2264         int err;
2265         bool send_sigpipe = false;
2266         bool init_scm = true;
2267         struct scm_cookie scm;
2268         struct sock *other, *sk = socket->sk;
2269         struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2270
2271         if (flags & MSG_OOB)
2272                 return -EOPNOTSUPP;
2273
2274         other = unix_peer(sk);
2275         if (!other || sk->sk_state != TCP_ESTABLISHED)
2276                 return -ENOTCONN;
2277
2278         if (false) {
2279 alloc_skb:
2280                 unix_state_unlock(other);
2281                 mutex_unlock(&unix_sk(other)->iolock);
2282                 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2283                                               &err, 0);
2284                 if (!newskb)
2285                         goto err;
2286         }
2287
2288         /* we must acquire iolock as we modify already present
2289          * skbs in the sk_receive_queue and mess with skb->len
2290          */
2291         err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2292         if (err) {
2293                 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2294                 goto err;
2295         }
2296
2297         if (sk->sk_shutdown & SEND_SHUTDOWN) {
2298                 err = -EPIPE;
2299                 send_sigpipe = true;
2300                 goto err_unlock;
2301         }
2302
2303         unix_state_lock(other);
2304
2305         if (sock_flag(other, SOCK_DEAD) ||
2306             other->sk_shutdown & RCV_SHUTDOWN) {
2307                 err = -EPIPE;
2308                 send_sigpipe = true;
2309                 goto err_state_unlock;
2310         }
2311
2312         if (init_scm) {
2313                 err = maybe_init_creds(&scm, socket, other);
2314                 if (err)
2315                         goto err_state_unlock;
2316                 init_scm = false;
2317         }
2318
2319         skb = skb_peek_tail(&other->sk_receive_queue);
2320         if (tail && tail == skb) {
2321                 skb = newskb;
2322         } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2323                 if (newskb) {
2324                         skb = newskb;
2325                 } else {
2326                         tail = skb;
2327                         goto alloc_skb;
2328                 }
2329         } else if (newskb) {
2330                 /* this is fast path, we don't necessarily need to
2331                  * call to kfree_skb even though with newskb == NULL
2332                  * this - does no harm
2333                  */
2334                 consume_skb(newskb);
2335                 newskb = NULL;
2336         }
2337
2338         if (skb_append_pagefrags(skb, page, offset, size)) {
2339                 tail = skb;
2340                 goto alloc_skb;
2341         }
2342
2343         skb->len += size;
2344         skb->data_len += size;
2345         skb->truesize += size;
2346         refcount_add(size, &sk->sk_wmem_alloc);
2347
2348         if (newskb) {
2349                 err = unix_scm_to_skb(&scm, skb, false);
2350                 if (err)
2351                         goto err_state_unlock;
2352                 spin_lock(&other->sk_receive_queue.lock);
2353                 __skb_queue_tail(&other->sk_receive_queue, newskb);
2354                 spin_unlock(&other->sk_receive_queue.lock);
2355         }
2356
2357         unix_state_unlock(other);
2358         mutex_unlock(&unix_sk(other)->iolock);
2359
2360         other->sk_data_ready(other);
2361         scm_destroy(&scm);
2362         return size;
2363
2364 err_state_unlock:
2365         unix_state_unlock(other);
2366 err_unlock:
2367         mutex_unlock(&unix_sk(other)->iolock);
2368 err:
2369         kfree_skb(newskb);
2370         if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2371                 send_sig(SIGPIPE, current, 0);
2372         if (!init_scm)
2373                 scm_destroy(&scm);
2374         return err;
2375 }
2376
2377 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2378                                   size_t len)
2379 {
2380         int err;
2381         struct sock *sk = sock->sk;
2382
2383         err = sock_error(sk);
2384         if (err)
2385                 return err;
2386
2387         if (sk->sk_state != TCP_ESTABLISHED)
2388                 return -ENOTCONN;
2389
2390         if (msg->msg_namelen)
2391                 msg->msg_namelen = 0;
2392
2393         return unix_dgram_sendmsg(sock, msg, len);
2394 }
2395
2396 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2397                                   size_t size, int flags)
2398 {
2399         struct sock *sk = sock->sk;
2400
2401         if (sk->sk_state != TCP_ESTABLISHED)
2402                 return -ENOTCONN;
2403
2404         return unix_dgram_recvmsg(sock, msg, size, flags);
2405 }
2406
2407 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2408 {
2409         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2410
2411         if (addr) {
2412                 msg->msg_namelen = addr->len;
2413                 memcpy(msg->msg_name, addr->name, addr->len);
2414         }
2415 }
2416
2417 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2418                          int flags)
2419 {
2420         struct scm_cookie scm;
2421         struct socket *sock = sk->sk_socket;
2422         struct unix_sock *u = unix_sk(sk);
2423         struct sk_buff *skb, *last;
2424         long timeo;
2425         int skip;
2426         int err;
2427
2428         err = -EOPNOTSUPP;
2429         if (flags&MSG_OOB)
2430                 goto out;
2431
2432         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2433
2434         do {
2435                 mutex_lock(&u->iolock);
2436
2437                 skip = sk_peek_offset(sk, flags);
2438                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2439                                               &skip, &err, &last);
2440                 if (skb) {
2441                         if (!(flags & MSG_PEEK))
2442                                 scm_stat_del(sk, skb);
2443                         break;
2444                 }
2445
2446                 mutex_unlock(&u->iolock);
2447
2448                 if (err != -EAGAIN)
2449                         break;
2450         } while (timeo &&
2451                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2452                                               &err, &timeo, last));
2453
2454         if (!skb) { /* implies iolock unlocked */
2455                 unix_state_lock(sk);
2456                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2457                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2458                     (sk->sk_shutdown & RCV_SHUTDOWN))
2459                         err = 0;
2460                 unix_state_unlock(sk);
2461                 goto out;
2462         }
2463
2464         if (wq_has_sleeper(&u->peer_wait))
2465                 wake_up_interruptible_sync_poll(&u->peer_wait,
2466                                                 EPOLLOUT | EPOLLWRNORM |
2467                                                 EPOLLWRBAND);
2468
2469         if (msg->msg_name)
2470                 unix_copy_addr(msg, skb->sk);
2471
2472         if (size > skb->len - skip)
2473                 size = skb->len - skip;
2474         else if (size < skb->len - skip)
2475                 msg->msg_flags |= MSG_TRUNC;
2476
2477         err = skb_copy_datagram_msg(skb, skip, msg, size);
2478         if (err)
2479                 goto out_free;
2480
2481         if (sock_flag(sk, SOCK_RCVTSTAMP))
2482                 __sock_recv_timestamp(msg, sk, skb);
2483
2484         memset(&scm, 0, sizeof(scm));
2485
2486         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2487         unix_set_secdata(&scm, skb);
2488
2489         if (!(flags & MSG_PEEK)) {
2490                 if (UNIXCB(skb).fp)
2491                         unix_detach_fds(&scm, skb);
2492
2493                 sk_peek_offset_bwd(sk, skb->len);
2494         } else {
2495                 /* It is questionable: on PEEK we could:
2496                    - do not return fds - good, but too simple 8)
2497                    - return fds, and do not return them on read (old strategy,
2498                      apparently wrong)
2499                    - clone fds (I chose it for now, it is the most universal
2500                      solution)
2501
2502                    POSIX 1003.1g does not actually define this clearly
2503                    at all. POSIX 1003.1g doesn't define a lot of things
2504                    clearly however!
2505
2506                 */
2507
2508                 sk_peek_offset_fwd(sk, size);
2509
2510                 if (UNIXCB(skb).fp)
2511                         unix_peek_fds(&scm, skb);
2512         }
2513         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2514
2515         scm_recv(sock, msg, &scm, flags);
2516
2517 out_free:
2518         skb_free_datagram(sk, skb);
2519         mutex_unlock(&u->iolock);
2520 out:
2521         return err;
2522 }
2523
2524 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2525                               int flags)
2526 {
2527         struct sock *sk = sock->sk;
2528
2529 #ifdef CONFIG_BPF_SYSCALL
2530         const struct proto *prot = READ_ONCE(sk->sk_prot);
2531
2532         if (prot != &unix_dgram_proto)
2533                 return prot->recvmsg(sk, msg, size, flags, NULL);
2534 #endif
2535         return __unix_dgram_recvmsg(sk, msg, size, flags);
2536 }
2537
2538 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2539 {
2540         struct unix_sock *u = unix_sk(sk);
2541         struct sk_buff *skb;
2542         int err, copied;
2543
2544         mutex_lock(&u->iolock);
2545         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2546         mutex_unlock(&u->iolock);
2547         if (!skb)
2548                 return err;
2549
2550         copied = recv_actor(sk, skb);
2551         kfree_skb(skb);
2552
2553         return copied;
2554 }
2555
2556 /*
2557  *      Sleep until more data has arrived. But check for races..
2558  */
2559 static long unix_stream_data_wait(struct sock *sk, long timeo,
2560                                   struct sk_buff *last, unsigned int last_len,
2561                                   bool freezable)
2562 {
2563         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2564         struct sk_buff *tail;
2565         DEFINE_WAIT(wait);
2566
2567         unix_state_lock(sk);
2568
2569         for (;;) {
2570                 prepare_to_wait(sk_sleep(sk), &wait, state);
2571
2572                 tail = skb_peek_tail(&sk->sk_receive_queue);
2573                 if (tail != last ||
2574                     (tail && tail->len != last_len) ||
2575                     sk->sk_err ||
2576                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2577                     signal_pending(current) ||
2578                     !timeo)
2579                         break;
2580
2581                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2582                 unix_state_unlock(sk);
2583                 timeo = schedule_timeout(timeo);
2584                 unix_state_lock(sk);
2585
2586                 if (sock_flag(sk, SOCK_DEAD))
2587                         break;
2588
2589                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2590         }
2591
2592         finish_wait(sk_sleep(sk), &wait);
2593         unix_state_unlock(sk);
2594         return timeo;
2595 }
2596
2597 static unsigned int unix_skb_len(const struct sk_buff *skb)
2598 {
2599         return skb->len - UNIXCB(skb).consumed;
2600 }
2601
/* Parameters of one stream read operation, handed to the code that walks
 * the receive queue and to the actor that consumes the data.
 */
struct unix_stream_read_state {
        /* Consumes data from an skb.  unix_stream_recv_urg() calls it as
         * recv_actor(skb, 0, 1, state) and treats a negative return as a
         * fault.  NOTE(review): the two ints look like (offset, length) -
         * confirm against the actor implementations.
         */
        int (*recv_actor)(struct sk_buff *, int, int,
                          struct unix_stream_read_state *);
        /* Socket being read from. */
        struct socket *socket;
        /* Message being filled in; msg_flags is updated by the readers. */
        struct msghdr *msg;
        /* NOTE(review): presumably the destination pipe of a splice read -
         * confirm against the splice path.
         */
        struct pipe_inode_info *pipe;
        /* NOTE(review): presumably the number of bytes requested - confirm. */
        size_t size;
        /* recvmsg-style MSG_* flags (MSG_PEEK is tested by the readers). */
        int flags;
        /* NOTE(review): presumably the flags of a splice read - confirm. */
        unsigned int splice_flags;
};
2612
2613 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2614 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2615 {
2616         struct socket *sock = state->socket;
2617         struct sock *sk = sock->sk;
2618         struct unix_sock *u = unix_sk(sk);
2619         int chunk = 1;
2620         struct sk_buff *oob_skb;
2621
2622         mutex_lock(&u->iolock);
2623         unix_state_lock(sk);
2624
2625         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2626                 unix_state_unlock(sk);
2627                 mutex_unlock(&u->iolock);
2628                 return -EINVAL;
2629         }
2630
2631         oob_skb = u->oob_skb;
2632
2633         if (!(state->flags & MSG_PEEK))
2634                 WRITE_ONCE(u->oob_skb, NULL);
2635
2636         unix_state_unlock(sk);
2637
2638         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2639
2640         if (!(state->flags & MSG_PEEK)) {
2641                 UNIXCB(oob_skb).consumed += 1;
2642                 kfree_skb(oob_skb);
2643         }
2644
2645         mutex_unlock(&u->iolock);
2646
2647         if (chunk < 0)
2648                 return -EFAULT;
2649
2650         state->msg->msg_flags |= MSG_OOB;
2651         return 1;
2652 }
2653
2654 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2655                                   int flags, int copied)
2656 {
2657         struct unix_sock *u = unix_sk(sk);
2658
2659         if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2660                 skb_unlink(skb, &sk->sk_receive_queue);
2661                 consume_skb(skb);
2662                 skb = NULL;
2663         } else {
2664                 if (skb == u->oob_skb) {
2665                         if (copied) {
2666                                 skb = NULL;
2667                         } else if (sock_flag(sk, SOCK_URGINLINE)) {
2668                                 if (!(flags & MSG_PEEK)) {
2669                                         WRITE_ONCE(u->oob_skb, NULL);
2670                                         consume_skb(skb);
2671                                 }
2672                         } else if (!(flags & MSG_PEEK)) {
2673                                 skb_unlink(skb, &sk->sk_receive_queue);
2674                                 consume_skb(skb);
2675                                 skb = skb_peek(&sk->sk_receive_queue);
2676                         }
2677                 }
2678         }
2679         return skb;
2680 }
2681 #endif
2682
2683 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2684 {
2685         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2686                 return -ENOTCONN;
2687
2688         return unix_read_skb(sk, recv_actor);
2689 }
2690
/*
 * Common receive loop for stream sockets, shared by the recvmsg and
 * splice_read entry points.
 *
 * Walks sk->sk_receive_queue and hands each skb to state->recv_actor until
 * state->size bytes were transferred or a stop condition is hit.
 *
 * @state:     describes destination, size and flags of this read
 * @freezable: forwarded to unix_stream_data_wait() when the call has to sleep
 *
 * Returns the number of bytes transferred if that is non-zero (or -EFAULT if
 * the actor failed before anything was transferred); otherwise the value of
 * 'err', which is 0 or a negative errno.
 */
2691 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2692                                     bool freezable)
2693 {
2694         struct scm_cookie scm;
2695         struct socket *sock = state->socket;
2696         struct sock *sk = sock->sk;
2697         struct unix_sock *u = unix_sk(sk);
2698         int copied = 0;
2699         int flags = state->flags;
2700         int noblock = flags & MSG_DONTWAIT;
2701         bool check_creds = false;
2702         int target;
2703         int err = 0;
2704         long timeo;
2705         int skip;
2706         size_t size = state->size;
2707         unsigned int last_len;
2708
2709         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2710                 err = -EINVAL;
2711                 goto out;
2712         }
2713
             /* MSG_OOB is served by a dedicated helper; without
              * CONFIG_AF_UNIX_OOB the request fails with -EOPNOTSUPP.
              */
2714         if (unlikely(flags & MSG_OOB)) {
2715                 err = -EOPNOTSUPP;
2716 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2717                 err = unix_stream_recv_urg(state);
2718 #endif
2719                 goto out;
2720         }
2721
2722         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2723         timeo = sock_rcvtimeo(sk, noblock);
2724
2725         memset(&scm, 0, sizeof(scm));
2726
2727         /* Lock the socket to prevent queue disordering
2728          * while sleeps in memcpy_tomsg
2729          */
2730         mutex_lock(&u->iolock);
2731
             /* Bytes at the front of the queue to step over (peek offset);
              * clamped so it is never negative.
              */
2732         skip = max(sk_peek_offset(sk, flags), 0);
2733
2734         do {
2735                 int chunk;
2736                 bool drop_skb;
2737                 struct sk_buff *skb, *last;
2738
2739 redo:
2740                 unix_state_lock(sk);
2741                 if (sock_flag(sk, SOCK_DEAD)) {
2742                         err = -ECONNRESET;
2743                         goto unlock;
2744                 }
                     /* 'last'/'last_len' record the last skb we looked at and
                      * its length; unix_stream_data_wait() compares them with
                      * the queue tail to notice newly arrived data.
                      */
2745                 last = skb = skb_peek(&sk->sk_receive_queue);
2746                 last_len = last ? last->len : 0;
2747
2748 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2749                 if (skb) {
2750                         skb = manage_oob(skb, sk, flags, copied);
                             /* NULL: stop here if something was already
                              * copied, otherwise re-examine the queue.
                              */
2751                         if (!skb) {
2752                                 unix_state_unlock(sk);
2753                                 if (copied)
2754                                         break;
2755                                 goto redo;
2756                         }
2757                 }
2758 #endif
2759 again:
                     /* Nothing (more) to read right now. */
2760                 if (skb == NULL) {
2761                         if (copied >= target)
2762                                 goto unlock;
2763
2764                         /*
2765                          *      POSIX 1003.1g mandates this order.
2766                          */
2767
2768                         err = sock_error(sk);
2769                         if (err)
2770                                 goto unlock;
2771                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2772                                 goto unlock;
2773
2774                         unix_state_unlock(sk);
2775                         if (!timeo) {
2776                                 err = -EAGAIN;
2777                                 break;
2778                         }
2779
                             /* iolock is dropped across the sleep and
                              * retaken before the queue is looked at again.
                              */
2780                         mutex_unlock(&u->iolock);
2781
2782                         timeo = unix_stream_data_wait(sk, timeo, last,
2783                                                       last_len, freezable);
2784
                             /* Interrupted: leave through 'out', which skips
                              * both the iolock unlock (already dropped) and
                              * scm_recv(), hence the explicit scm_destroy().
                              */
2785                         if (signal_pending(current)) {
2786                                 err = sock_intr_errno(timeo);
2787                                 scm_destroy(&scm);
2788                                 goto out;
2789                         }
2790
2791                         mutex_lock(&u->iolock);
2792                         goto redo;
2793 unlock:
2794                         unix_state_unlock(sk);
2795                         break;
2796                 }
2797
                     /* Step over whole skbs that lie before the peek offset. */
2798                 while (skip >= unix_skb_len(skb)) {
2799                         skip -= unix_skb_len(skb);
2800                         last = skb;
2801                         last_len = skb->len;
2802                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2803                         if (!skb)
2804                                 goto again;
2805                 }
2806
2807                 unix_state_unlock(sk);
2808
2809                 if (check_creds) {
2810                         /* Never glue messages from different writers */
2811                         if (!unix_skb_scm_eq(skb, &scm))
2812                                 break;
2813                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2814                         /* Copy credentials */
2815                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2816                         unix_set_secdata(&scm, skb);
2817                         check_creds = true;
2818                 }
2819
2820                 /* Copy address just once */
2821                 if (state->msg && state->msg->msg_name) {
2822                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2823                                          state->msg->msg_name);
2824                         unix_copy_addr(state->msg, skb->sk);
                             /* NOTE(review): this clears only the local
                              * variable, not state->msg->msg_name.
                              */
2825                         sunaddr = NULL;
2826                 }
2827
2828                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
                     /* Hold our own reference while the actor runs without
                      * the state lock.
                      */
2829                 skb_get(skb);
2830                 chunk = state->recv_actor(skb, skip, chunk, state);
2831                 drop_skb = !unix_skb_len(skb);
2832                 /* skb is only safe to use if !drop_skb */
2833                 consume_skb(skb);
2834                 if (chunk < 0) {
2835                         if (copied == 0)
2836                                 copied = -EFAULT;
2837                         break;
2838                 }
2839                 copied += chunk;
2840                 size -= chunk;
2841
2842                 if (drop_skb) {
2843                         /* the skb was touched by a concurrent reader;
2844                          * we should not expect anything from this skb
2845                          * anymore and assume it invalid - we can be
2846                          * sure it was dropped from the socket queue
2847                          *
2848                          * let's report a short read
2849                          */
2850                         err = 0;
2851                         break;
2852                 }
2853
2854                 /* Mark read part of skb as used */
2855                 if (!(flags & MSG_PEEK)) {
2856                         UNIXCB(skb).consumed += chunk;
2857
2858                         sk_peek_offset_bwd(sk, chunk);
2859
2860                         if (UNIXCB(skb).fp) {
2861                                 scm_stat_del(sk, skb);
2862                                 unix_detach_fds(&scm, skb);
2863                         }
2864
                             /* skb not exhausted: the request is satisfied. */
2865                         if (unix_skb_len(skb))
2866                                 break;
2867
2868                         skb_unlink(skb, &sk->sk_receive_queue);
2869                         consume_skb(skb);
2870
                             /* Stop after an skb that carried file descriptors. */
2871                         if (scm.fp)
2872                                 break;
2873                 } else {
2874                         /* It is questionable, see note in unix_dgram_recvmsg.
2875                          */
2876                         if (UNIXCB(skb).fp)
2877                                 unix_peek_fds(&scm, skb);
2878
2879                         sk_peek_offset_fwd(sk, chunk);
2880
2881                         if (UNIXCB(skb).fp)
2882                                 break;
2883
                             /* Peeking: move on to the next skb in the queue,
                              * if any, without consuming this one.
                              */
2884                         skip = 0;
2885                         last = skb;
2886                         last_len = skb->len;
2887                         unix_state_lock(sk);
2888                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2889                         if (skb)
2890                                 goto again;
2891                         unix_state_unlock(sk);
2892                         break;
2893                 }
2894         } while (size);
2895
2896         mutex_unlock(&u->iolock);
             /* Deliver collected ancillary data to the caller, or release it
              * when there is no msghdr to deliver it to.
              */
2897         if (state->msg)
2898                 scm_recv(sock, state->msg, &scm, flags);
2899         else
2900                 scm_destroy(&scm);
2901 out:
2902         return copied ? : err;
2903 }
2904
2905 static int unix_stream_read_actor(struct sk_buff *skb,
2906                                   int skip, int chunk,
2907                                   struct unix_stream_read_state *state)
2908 {
2909         int ret;
2910
2911         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2912                                     state->msg, chunk);
2913         return ret ?: chunk;
2914 }
2915
2916 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2917                           size_t size, int flags)
2918 {
2919         struct unix_stream_read_state state = {
2920                 .recv_actor = unix_stream_read_actor,
2921                 .socket = sk->sk_socket,
2922                 .msg = msg,
2923                 .size = size,
2924                 .flags = flags
2925         };
2926
2927         return unix_stream_read_generic(&state, true);
2928 }
2929
2930 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2931                                size_t size, int flags)
2932 {
2933         struct unix_stream_read_state state = {
2934                 .recv_actor = unix_stream_read_actor,
2935                 .socket = sock,
2936                 .msg = msg,
2937                 .size = size,
2938                 .flags = flags
2939         };
2940
2941 #ifdef CONFIG_BPF_SYSCALL
2942         struct sock *sk = sock->sk;
2943         const struct proto *prot = READ_ONCE(sk->sk_prot);
2944
2945         if (prot != &unix_stream_proto)
2946                 return prot->recvmsg(sk, msg, size, flags, NULL);
2947 #endif
2948         return unix_stream_read_generic(&state, true);
2949 }
2950
2951 static int unix_stream_splice_actor(struct sk_buff *skb,
2952                                     int skip, int chunk,
2953                                     struct unix_stream_read_state *state)
2954 {
2955         return skb_splice_bits(skb, state->socket->sk,
2956                                UNIXCB(skb).consumed + skip,
2957                                state->pipe, chunk, state->splice_flags);
2958 }
2959
2960 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2961                                        struct pipe_inode_info *pipe,
2962                                        size_t size, unsigned int flags)
2963 {
2964         struct unix_stream_read_state state = {
2965                 .recv_actor = unix_stream_splice_actor,
2966                 .socket = sock,
2967                 .pipe = pipe,
2968                 .size = size,
2969                 .splice_flags = flags,
2970         };
2971
2972         if (unlikely(*ppos))
2973                 return -ESPIPE;
2974
2975         if (sock->file->f_flags & O_NONBLOCK ||
2976             flags & SPLICE_F_NONBLOCK)
2977                 state.flags = MSG_DONTWAIT;
2978
2979         return unix_stream_read_generic(&state, false);
2980 }
2981
2982 static int unix_shutdown(struct socket *sock, int mode)
2983 {
2984         struct sock *sk = sock->sk;
2985         struct sock *other;
2986
2987         if (mode < SHUT_RD || mode > SHUT_RDWR)
2988                 return -EINVAL;
2989         /* This maps:
2990          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2991          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2992          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2993          */
2994         ++mode;
2995
2996         unix_state_lock(sk);
2997         sk->sk_shutdown |= mode;
2998         other = unix_peer(sk);
2999         if (other)
3000                 sock_hold(other);
3001         unix_state_unlock(sk);
3002         sk->sk_state_change(sk);
3003
3004         if (other &&
3005                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3006
3007                 int peer_mode = 0;
3008                 const struct proto *prot = READ_ONCE(other->sk_prot);
3009
3010                 if (prot->unhash)
3011                         prot->unhash(other);
3012                 if (mode&RCV_SHUTDOWN)
3013                         peer_mode |= SEND_SHUTDOWN;
3014                 if (mode&SEND_SHUTDOWN)
3015                         peer_mode |= RCV_SHUTDOWN;
3016                 unix_state_lock(other);
3017                 other->sk_shutdown |= peer_mode;
3018                 unix_state_unlock(other);
3019                 other->sk_state_change(other);
3020                 if (peer_mode == SHUTDOWN_MASK)
3021                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3022                 else if (peer_mode & RCV_SHUTDOWN)
3023                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3024         }
3025         if (other)
3026                 sock_put(other);
3027
3028         return 0;
3029 }
3030
3031 long unix_inq_len(struct sock *sk)
3032 {
3033         struct sk_buff *skb;
3034         long amount = 0;
3035
3036         if (sk->sk_state == TCP_LISTEN)
3037                 return -EINVAL;
3038
3039         spin_lock(&sk->sk_receive_queue.lock);
3040         if (sk->sk_type == SOCK_STREAM ||
3041             sk->sk_type == SOCK_SEQPACKET) {
3042                 skb_queue_walk(&sk->sk_receive_queue, skb)
3043                         amount += unix_skb_len(skb);
3044         } else {
3045                 skb = skb_peek(&sk->sk_receive_queue);
3046                 if (skb)
3047                         amount = skb->len;
3048         }
3049         spin_unlock(&sk->sk_receive_queue.lock);
3050
3051         return amount;
3052 }
3053 EXPORT_SYMBOL_GPL(unix_inq_len);
3054
/* Amount of send-side memory currently accounted to @sk. */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
3059 EXPORT_SYMBOL_GPL(unix_outq_len);
3060
3061 static int unix_open_file(struct sock *sk)
3062 {
3063         struct path path;
3064         struct file *f;
3065         int fd;
3066
3067         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3068                 return -EPERM;
3069
3070         if (!smp_load_acquire(&unix_sk(sk)->addr))
3071                 return -ENOENT;
3072
3073         path = unix_sk(sk)->path;
3074         if (!path.dentry)
3075                 return -ENOENT;
3076
3077         path_get(&path);
3078
3079         fd = get_unused_fd_flags(O_CLOEXEC);
3080         if (fd < 0)
3081                 goto out;
3082
3083         f = dentry_open(&path, O_PATH, current_cred());
3084         if (IS_ERR(f)) {
3085                 put_unused_fd(fd);
3086                 fd = PTR_ERR(f);
3087                 goto out;
3088         }
3089
3090         fd_install(fd, f);
3091 out:
3092         path_put(&path);
3093
3094         return fd;
3095 }
3096
3097 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3098 {
3099         struct sock *sk = sock->sk;
3100         long amount = 0;
3101         int err;
3102
3103         switch (cmd) {
3104         case SIOCOUTQ:
3105                 amount = unix_outq_len(sk);
3106                 err = put_user(amount, (int __user *)arg);
3107                 break;
3108         case SIOCINQ:
3109                 amount = unix_inq_len(sk);
3110                 if (amount < 0)
3111                         err = amount;
3112                 else
3113                         err = put_user(amount, (int __user *)arg);
3114                 break;
3115         case SIOCUNIXFILE:
3116                 err = unix_open_file(sk);
3117                 break;
3118 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3119         case SIOCATMARK:
3120                 {
3121                         struct sk_buff *skb;
3122                         int answ = 0;
3123
3124                         skb = skb_peek(&sk->sk_receive_queue);
3125                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3126                                 answ = 1;
3127                         err = put_user(answ, (int __user *)arg);
3128                 }
3129                 break;
3130 #endif
3131         default:
3132                 err = -ENOIOCTLCMD;
3133                 break;
3134         }
3135         return err;
3136 }
3137
3138 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert the compat user pointer and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3143 #endif
3144
3145 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3146 {
3147         struct sock *sk = sock->sk;
3148         __poll_t mask;
3149
3150         sock_poll_wait(file, sock, wait);
3151         mask = 0;
3152
3153         /* exceptional events? */
3154         if (sk->sk_err)
3155                 mask |= EPOLLERR;
3156         if (sk->sk_shutdown == SHUTDOWN_MASK)
3157                 mask |= EPOLLHUP;
3158         if (sk->sk_shutdown & RCV_SHUTDOWN)
3159                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3160
3161         /* readable? */
3162         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3163                 mask |= EPOLLIN | EPOLLRDNORM;
3164         if (sk_is_readable(sk))
3165                 mask |= EPOLLIN | EPOLLRDNORM;
3166 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3167         if (READ_ONCE(unix_sk(sk)->oob_skb))
3168                 mask |= EPOLLPRI;
3169 #endif
3170
3171         /* Connection-based need to check for termination and startup */
3172         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3173             sk->sk_state == TCP_CLOSE)
3174                 mask |= EPOLLHUP;
3175
3176         /*
3177          * we set writable also when the other side has shut down the
3178          * connection. This prevents stuck sockets.
3179          */
3180         if (unix_writable(sk))
3181                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3182
3183         return mask;
3184 }
3185
3186 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3187                                     poll_table *wait)
3188 {
3189         struct sock *sk = sock->sk, *other;
3190         unsigned int writable;
3191         __poll_t mask;
3192
3193         sock_poll_wait(file, sock, wait);
3194         mask = 0;
3195
3196         /* exceptional events? */
3197         if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
3198                 mask |= EPOLLERR |
3199                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3200
3201         if (sk->sk_shutdown & RCV_SHUTDOWN)
3202                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3203         if (sk->sk_shutdown == SHUTDOWN_MASK)
3204                 mask |= EPOLLHUP;
3205
3206         /* readable? */
3207         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3208                 mask |= EPOLLIN | EPOLLRDNORM;
3209         if (sk_is_readable(sk))
3210                 mask |= EPOLLIN | EPOLLRDNORM;
3211
3212         /* Connection-based need to check for termination and startup */
3213         if (sk->sk_type == SOCK_SEQPACKET) {
3214                 if (sk->sk_state == TCP_CLOSE)
3215                         mask |= EPOLLHUP;
3216                 /* connection hasn't started yet? */
3217                 if (sk->sk_state == TCP_SYN_SENT)
3218                         return mask;
3219         }
3220
3221         /* No write status requested, avoid expensive OUT tests. */
3222         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3223                 return mask;
3224
3225         writable = unix_writable(sk);
3226         if (writable) {
3227                 unix_state_lock(sk);
3228
3229                 other = unix_peer(sk);
3230                 if (other && unix_peer(other) != sk &&
3231                     unix_recvq_full_lockless(other) &&
3232                     unix_dgram_peer_wake_me(sk, other))
3233                         writable = 0;
3234
3235                 unix_state_unlock(sk);
3236         }
3237
3238         if (writable)
3239                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3240         else
3241                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3242
3243         return mask;
3244 }
3245
3246 #ifdef CONFIG_PROC_FS
3247
/*
 * The seq_file position used by the iterators below packs two values into
 * one integer: the hash-bucket index in the bits above BUCKET_SPACE and the
 * offset of the socket within that bucket in the low BUCKET_SPACE bits.
 * The offset is 1-based: the iterators start every bucket at offset 1.
 */
3248 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3249
     /* Extract the bucket index / in-bucket offset from a position. */
3250 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3251 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
     /* Build a position from bucket index 'b' and in-bucket offset 'o'. */
3252 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3253
3254 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3255 {
3256         unsigned long offset = get_offset(*pos);
3257         unsigned long bucket = get_bucket(*pos);
3258         unsigned long count = 0;
3259         struct sock *sk;
3260
3261         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3262              sk; sk = sk_next(sk)) {
3263                 if (++count == offset)
3264                         break;
3265         }
3266
3267         return sk;
3268 }
3269
3270 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3271 {
3272         unsigned long bucket = get_bucket(*pos);
3273         struct net *net = seq_file_net(seq);
3274         struct sock *sk;
3275
3276         while (bucket < UNIX_HASH_SIZE) {
3277                 spin_lock(&net->unx.table.locks[bucket]);
3278
3279                 sk = unix_from_bucket(seq, pos);
3280                 if (sk)
3281                         return sk;
3282
3283                 spin_unlock(&net->unx.table.locks[bucket]);
3284
3285                 *pos = set_bucket_offset(++bucket, 1);
3286         }
3287
3288         return NULL;
3289 }
3290
3291 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3292                                   loff_t *pos)
3293 {
3294         unsigned long bucket = get_bucket(*pos);
3295
3296         sk = sk_next(sk);
3297         if (sk)
3298                 return sk;
3299
3300
3301         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3302
3303         *pos = set_bucket_offset(++bucket, 1);
3304
3305         return unix_get_first(seq, pos);
3306 }
3307
3308 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3309 {
3310         if (!*pos)
3311                 return SEQ_START_TOKEN;
3312
3313         return unix_get_first(seq, pos);
3314 }
3315
3316 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3317 {
3318         ++*pos;
3319
3320         if (v == SEQ_START_TOKEN)
3321                 return unix_get_first(seq, pos);
3322
3323         return unix_get_next(seq, v, pos);
3324 }
3325
3326 static void unix_seq_stop(struct seq_file *seq, void *v)
3327 {
3328         struct sock *sk = v;
3329
3330         if (sk)
3331                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3332 }
3333
3334 static int unix_seq_show(struct seq_file *seq, void *v)
3335 {
3336
3337         if (v == SEQ_START_TOKEN)
3338                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3339                          "Inode Path\n");
3340         else {
3341                 struct sock *s = v;
3342                 struct unix_sock *u = unix_sk(s);
3343                 unix_state_lock(s);
3344
3345                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3346                         s,
3347                         refcount_read(&s->sk_refcnt),
3348                         0,
3349                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3350                         s->sk_type,
3351                         s->sk_socket ?
3352                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3353                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3354                         sock_i_ino(s));
3355
3356                 if (u->addr) {  // under a hash table lock here
3357                         int i, len;
3358                         seq_putc(seq, ' ');
3359
3360                         i = 0;
3361                         len = u->addr->len -
3362                                 offsetof(struct sockaddr_un, sun_path);
3363                         if (u->addr->name->sun_path[0]) {
3364                                 len--;
3365                         } else {
3366                                 seq_putc(seq, '@');
3367                                 i++;
3368                         }
3369                         for ( ; i < len; i++)
3370                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3371                                          '@');
3372                 }
3373                 unix_state_unlock(s);
3374                 seq_putc(seq, '\n');
3375         }
3376
3377         return 0;
3378 }
3379
/*
 * seq_file operations for the plain-text socket listing produced by
 * unix_seq_show().  ->start/->next return a socket with its hash-bucket lock
 * held; ->stop releases that lock.
 */
3380 static const struct seq_operations unix_seq_ops = {
3381         .start  = unix_seq_start,
3382         .next   = unix_seq_next,
3383         .stop   = unix_seq_stop,
3384         .show   = unix_seq_show,
3385 };
3386
3387 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/*
 * Private state of the BPF unix socket iterator (seq->private).  Sockets of
 * one hash bucket are collected into 'batch' with a reference held on each,
 * so that they can be shown after the bucket lock has been released.
 */
3388 struct bpf_unix_iter_state {
3389         struct seq_net_private p;       /* NOTE(review): presumably must stay first for seq_file_net() - confirm */
3390         unsigned int cur_sk;            /* index of the batch entry currently being shown */
3391         unsigned int end_sk;            /* number of sockets held in 'batch' */
3392         unsigned int max_sk;            /* capacity of 'batch', in entries */
3393         struct sock **batch;            /* array of referenced sockets */
3394         bool st_bucket_done;            /* set once a whole bucket fit into 'batch' */
3395 };
3396
/* Context object passed to the BPF program for each socket visited. */
3397 struct bpf_iter__unix {
3398         __bpf_md_ptr(struct bpf_iter_meta *, meta);     /* iterator metadata */
3399         __bpf_md_ptr(struct unix_sock *, unix_sk);      /* socket being shown; NULL on the final call from ->stop */
3400         uid_t uid __aligned(8);                         /* owner uid of the socket; 0 on the final call */
3401 };
3402
3403 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3404                               struct unix_sock *unix_sk, uid_t uid)
3405 {
3406         struct bpf_iter__unix ctx;
3407
3408         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3409         ctx.meta = meta;
3410         ctx.unix_sk = unix_sk;
3411         ctx.uid = uid;
3412         return bpf_iter_run_prog(prog, &ctx);
3413 }
3414
3415 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3416
3417 {
3418         struct bpf_unix_iter_state *iter = seq->private;
3419         unsigned int expected = 1;
3420         struct sock *sk;
3421
3422         sock_hold(start_sk);
3423         iter->batch[iter->end_sk++] = start_sk;
3424
3425         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3426                 if (iter->end_sk < iter->max_sk) {
3427                         sock_hold(sk);
3428                         iter->batch[iter->end_sk++] = sk;
3429                 }
3430
3431                 expected++;
3432         }
3433
3434         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3435
3436         return expected;
3437 }
3438
3439 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3440 {
3441         while (iter->cur_sk < iter->end_sk)
3442                 sock_put(iter->batch[iter->cur_sk++]);
3443 }
3444
3445 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3446                                        unsigned int new_batch_sz)
3447 {
3448         struct sock **new_batch;
3449
3450         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3451                              GFP_USER | __GFP_NOWARN);
3452         if (!new_batch)
3453                 return -ENOMEM;
3454
3455         bpf_iter_unix_put_batch(iter);
3456         kvfree(iter->batch);
3457         iter->batch = new_batch;
3458         iter->max_sk = new_batch_sz;
3459
3460         return 0;
3461 }
3462
3463 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3464                                         loff_t *pos)
3465 {
3466         struct bpf_unix_iter_state *iter = seq->private;
3467         unsigned int expected;
3468         bool resized = false;
3469         struct sock *sk;
3470
3471         if (iter->st_bucket_done)
3472                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3473
3474 again:
3475         /* Get a new batch */
3476         iter->cur_sk = 0;
3477         iter->end_sk = 0;
3478
3479         sk = unix_get_first(seq, pos);
3480         if (!sk)
3481                 return NULL; /* Done */
3482
3483         expected = bpf_iter_unix_hold_batch(seq, sk);
3484
3485         if (iter->end_sk == expected) {
3486                 iter->st_bucket_done = true;
3487                 return sk;
3488         }
3489
3490         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3491                 resized = true;
3492                 goto again;
3493         }
3494
3495         return sk;
3496 }
3497
3498 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3499 {
3500         if (!*pos)
3501                 return SEQ_START_TOKEN;
3502
3503         /* bpf iter does not support lseek, so it always
3504          * continue from where it was stop()-ped.
3505          */
3506         return bpf_iter_unix_batch(seq, pos);
3507 }
3508
3509 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3510 {
3511         struct bpf_unix_iter_state *iter = seq->private;
3512         struct sock *sk;
3513
3514         /* Whenever seq_next() is called, the iter->cur_sk is
3515          * done with seq_show(), so advance to the next sk in
3516          * the batch.
3517          */
3518         if (iter->cur_sk < iter->end_sk)
3519                 sock_put(iter->batch[iter->cur_sk++]);
3520
3521         ++*pos;
3522
3523         if (iter->cur_sk < iter->end_sk)
3524                 sk = iter->batch[iter->cur_sk];
3525         else
3526                 sk = bpf_iter_unix_batch(seq, pos);
3527
3528         return sk;
3529 }
3530
3531 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3532 {
3533         struct bpf_iter_meta meta;
3534         struct bpf_prog *prog;
3535         struct sock *sk = v;
3536         uid_t uid;
3537         bool slow;
3538         int ret;
3539
3540         if (v == SEQ_START_TOKEN)
3541                 return 0;
3542
3543         slow = lock_sock_fast(sk);
3544
3545         if (unlikely(sk_unhashed(sk))) {
3546                 ret = SEQ_SKIP;
3547                 goto unlock;
3548         }
3549
3550         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3551         meta.seq = seq;
3552         prog = bpf_iter_get_info(&meta, false);
3553         ret = unix_prog_seq_show(prog, &meta, v, uid);
3554 unlock:
3555         unlock_sock_fast(sk, slow);
3556         return ret;
3557 }
3558
3559 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3560 {
3561         struct bpf_unix_iter_state *iter = seq->private;
3562         struct bpf_iter_meta meta;
3563         struct bpf_prog *prog;
3564
3565         if (!v) {
3566                 meta.seq = seq;
3567                 prog = bpf_iter_get_info(&meta, true);
3568                 if (prog)
3569                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3570         }
3571
3572         if (iter->cur_sk < iter->end_sk)
3573                 bpf_iter_unix_put_batch(iter);
3574 }
3575
3576 static const struct seq_operations bpf_iter_unix_seq_ops = {
3577         .start  = bpf_iter_unix_seq_start,
3578         .next   = bpf_iter_unix_seq_next,
3579         .stop   = bpf_iter_unix_seq_stop,
3580         .show   = bpf_iter_unix_seq_show,
3581 };
3582 #endif
3583 #endif
3584
3585 static const struct net_proto_family unix_family_ops = {
3586         .family = PF_UNIX,
3587         .create = unix_create,
3588         .owner  = THIS_MODULE,
3589 };
3590
3591
3592 static int __net_init unix_net_init(struct net *net)
3593 {
3594         int i;
3595
3596         net->unx.sysctl_max_dgram_qlen = 10;
3597         if (unix_sysctl_register(net))
3598                 goto out;
3599
3600 #ifdef CONFIG_PROC_FS
3601         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3602                              sizeof(struct seq_net_private)))
3603                 goto err_sysctl;
3604 #endif
3605
3606         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3607                                               sizeof(spinlock_t), GFP_KERNEL);
3608         if (!net->unx.table.locks)
3609                 goto err_proc;
3610
3611         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3612                                                 sizeof(struct hlist_head),
3613                                                 GFP_KERNEL);
3614         if (!net->unx.table.buckets)
3615                 goto free_locks;
3616
3617         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3618                 spin_lock_init(&net->unx.table.locks[i]);
3619                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3620         }
3621
3622         return 0;
3623
3624 free_locks:
3625         kvfree(net->unx.table.locks);
3626 err_proc:
3627 #ifdef CONFIG_PROC_FS
3628         remove_proc_entry("unix", net->proc_net);
3629 err_sysctl:
3630 #endif
3631         unix_sysctl_unregister(net);
3632 out:
3633         return -ENOMEM;
3634 }
3635
3636 static void __net_exit unix_net_exit(struct net *net)
3637 {
3638         kvfree(net->unx.table.buckets);
3639         kvfree(net->unx.table.locks);
3640         unix_sysctl_unregister(net);
3641         remove_proc_entry("unix", net->proc_net);
3642 }
3643
3644 static struct pernet_operations unix_net_ops = {
3645         .init = unix_net_init,
3646         .exit = unix_net_exit,
3647 };
3648
3649 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3650 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3651                      struct unix_sock *unix_sk, uid_t uid)
3652
3653 #define INIT_BATCH_SZ 16
3654
3655 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3656 {
3657         struct bpf_unix_iter_state *iter = priv_data;
3658         int err;
3659
3660         err = bpf_iter_init_seq_net(priv_data, aux);
3661         if (err)
3662                 return err;
3663
3664         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3665         if (err) {
3666                 bpf_iter_fini_seq_net(priv_data);
3667                 return err;
3668         }
3669
3670         return 0;
3671 }
3672
3673 static void bpf_iter_fini_unix(void *priv_data)
3674 {
3675         struct bpf_unix_iter_state *iter = priv_data;
3676
3677         bpf_iter_fini_seq_net(priv_data);
3678         kvfree(iter->batch);
3679 }
3680
3681 static const struct bpf_iter_seq_info unix_seq_info = {
3682         .seq_ops                = &bpf_iter_unix_seq_ops,
3683         .init_seq_private       = bpf_iter_init_unix,
3684         .fini_seq_private       = bpf_iter_fini_unix,
3685         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3686 };
3687
3688 static const struct bpf_func_proto *
3689 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3690                              const struct bpf_prog *prog)
3691 {
3692         switch (func_id) {
3693         case BPF_FUNC_setsockopt:
3694                 return &bpf_sk_setsockopt_proto;
3695         case BPF_FUNC_getsockopt:
3696                 return &bpf_sk_getsockopt_proto;
3697         default:
3698                 return NULL;
3699         }
3700 }
3701
3702 static struct bpf_iter_reg unix_reg_info = {
3703         .target                 = "unix",
3704         .ctx_arg_info_size      = 1,
3705         .ctx_arg_info           = {
3706                 { offsetof(struct bpf_iter__unix, unix_sk),
3707                   PTR_TO_BTF_ID_OR_NULL },
3708         },
3709         .get_func_proto         = bpf_iter_unix_get_func_proto,
3710         .seq_info               = &unix_seq_info,
3711 };
3712
3713 static void __init bpf_iter_register(void)
3714 {
3715         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3716         if (bpf_iter_reg_target(&unix_reg_info))
3717                 pr_warn("Warning: could not register bpf iterator unix\n");
3718 }
3719 #endif
3720
3721 static int __init af_unix_init(void)
3722 {
3723         int i, rc = -1;
3724
3725         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3726
3727         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3728                 spin_lock_init(&bsd_socket_locks[i]);
3729                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3730         }
3731
3732         rc = proto_register(&unix_dgram_proto, 1);
3733         if (rc != 0) {
3734                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3735                 goto out;
3736         }
3737
3738         rc = proto_register(&unix_stream_proto, 1);
3739         if (rc != 0) {
3740                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3741                 goto out;
3742         }
3743
3744         sock_register(&unix_family_ops);
3745         register_pernet_subsys(&unix_net_ops);
3746         unix_bpf_build_proto();
3747
3748 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3749         bpf_iter_register();
3750 #endif
3751
3752 out:
3753         return rc;
3754 }
3755
3756 static void __exit af_unix_exit(void)
3757 {
3758         sock_unregister(PF_UNIX);
3759         proto_unregister(&unix_dgram_proto);
3760         proto_unregister(&unix_stream_proto);
3761         unregister_pernet_subsys(&unix_net_ops);
3762 }
3763
3764 /* Earlier than device_initcall() so that other drivers invoking
3765    request_module() don't end up in a loop when modprobe tries
3766    to use a UNIX socket. But later than subsys_initcall() because
3767    we depend on stuff initialised there */
3768 fs_initcall(af_unix_init);
3769 module_exit(af_unix_exit);
3770
3771 MODULE_LICENSE("GPL");
3772 MODULE_ALIAS_NETPROTO(PF_UNIX);