net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <[email protected]>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
   28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
   29  *                                      by above two patches.
   30  *           Andrea Arcangeli   :       If possible we block in connect(2)
   31  *                                      if the max backlog of the listen socket
   32  *                                      has been reached. This won't break
   33  *                                      old apps and it will avoid a huge number
   34  *                                      of socks hashed (this is for unix_gc()
   35  *                                      performance reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/termios.h>
  93 #include <linux/sockios.h>
  94 #include <linux/net.h>
  95 #include <linux/in.h>
  96 #include <linux/fs.h>
  97 #include <linux/slab.h>
  98 #include <linux/uaccess.h>
  99 #include <linux/skbuff.h>
 100 #include <linux/netdevice.h>
 101 #include <net/net_namespace.h>
 102 #include <net/sock.h>
 103 #include <net/tcp_states.h>
 104 #include <net/af_unix.h>
 105 #include <linux/proc_fs.h>
 106 #include <linux/seq_file.h>
 107 #include <net/scm.h>
 108 #include <linux/init.h>
 109 #include <linux/poll.h>
 110 #include <linux/rtnetlink.h>
 111 #include <linux/mount.h>
 112 #include <net/checksum.h>
 113 #include <linux/security.h>
 114 #include <linux/freezer.h>
 115 #include <linux/file.h>
 116
 117 #include "scm.h"
 118
 119 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 120 EXPORT_SYMBOL_GPL(unix_socket_table);
 121 DEFINE_SPINLOCK(unix_table_lock);
 122 EXPORT_SYMBOL_GPL(unix_table_lock);
 123 static atomic_long_t unix_nr_socks;
 124
 125
 126 static struct hlist_head *unix_sockets_unbound(void *addr)
 127 {
 128         unsigned long hash = (unsigned long)addr;
 129
 130         hash ^= hash >> 16;
 131         hash ^= hash >> 8;
 132         hash %= UNIX_HASH_SIZE;
 133         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 134 }
 135
 136 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 137
 138 #ifdef CONFIG_SECURITY_NETWORK
 139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 140 {
 141         UNIXCB(skb).secid = scm->secid;
 142 }
 143
 144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 145 {
 146         scm->secid = UNIXCB(skb).secid;
 147 }
 148
 149 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 150 {
 151         return (scm->secid == UNIXCB(skb).secid);
 152 }
 153 #else
 154 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 155 { }
 156
 157 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 158 { }
 159
 160 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 161 {
 162         return true;
 163 }
 164 #endif /* CONFIG_SECURITY_NETWORK */
 165
 166 /*
 167  *  SMP locking strategy:
 168  *    hash table is protected with spinlock unix_table_lock
 169  *    each socket state is protected by separate spin lock.
 170  */
 171
 172 static inline unsigned int unix_hash_fold(__wsum n)
 173 {
 174         unsigned int hash = (__force unsigned int)csum_fold(n);
 175
 176         hash ^= hash>>8;
 177         return hash&(UNIX_HASH_SIZE-1);
 178 }
 179
 180 #define unix_peer(sk) (unix_sk(sk)->peer)
 181
 182 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 183 {
 184         return unix_peer(osk) == sk;
 185 }
 186
 187 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 188 {
 189         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 190 }
 191
 192 static inline int unix_recvq_full(const struct sock *sk)
 193 {
 194         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 195 }
 196
 197 static inline int unix_recvq_full_lockless(const struct sock *sk)
 198 {
 199         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 200                 READ_ONCE(sk->sk_max_ack_backlog);
 201 }
 202
 203 struct sock *unix_peer_get(struct sock *s)
 204 {
 205         struct sock *peer;
 206
 207         unix_state_lock(s);
 208         peer = unix_peer(s);
 209         if (peer)
 210                 sock_hold(peer);
 211         unix_state_unlock(s);
 212         return peer;
 213 }
 214 EXPORT_SYMBOL_GPL(unix_peer_get);
 215
 216 static inline void unix_release_addr(struct unix_address *addr)
 217 {
 218         if (refcount_dec_and_test(&addr->refcnt))
 219                 kfree(addr);
 220 }
 221
 222 /*
 223  *      Check unix socket name:
 224  *              - should be not zero length.
 225  *              - if started by not zero, should be NULL terminated (FS object)
 226  *              - if started by zero, it is abstract name.
 227  */
 228
 229 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 230 {
 231         *hashp = 0;
 232
 233         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 234                 return -EINVAL;
 235         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 236                 return -EINVAL;
 237         if (sunaddr->sun_path[0]) {
 238                 /*
 239                  * This may look like an off by one error but it is a bit more
 240                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 241                  * sun_path[108] doesn't as such exist.  However in kernel space
 242                  * we are guaranteed that it is a valid memory location in our
 243                  * kernel address buffer.
 244                  */
 245                 ((char *)sunaddr)[len] = 0;
 246                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 247                 return len;
 248         }
 249
 250         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 251         return len;
 252 }
 253
 254 static void __unix_remove_socket(struct sock *sk)
 255 {
 256         sk_del_node_init(sk);
 257 }
 258
 259 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 260 {
 261         WARN_ON(!sk_unhashed(sk));
 262         sk_add_node(sk, list);
 263 }
 264
 265 static void __unix_set_addr(struct sock *sk, struct unix_address *addr,
 266                             unsigned hash)
 267 {
 268         __unix_remove_socket(sk);
 269         smp_store_release(&unix_sk(sk)->addr, addr);
 270         __unix_insert_socket(&unix_socket_table[hash], sk);
 271 }
 272
 273 static inline void unix_remove_socket(struct sock *sk)
 274 {
 275         spin_lock(&unix_table_lock);
 276         __unix_remove_socket(sk);
 277         spin_unlock(&unix_table_lock);
 278 }
 279
 280 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 281 {
 282         spin_lock(&unix_table_lock);
 283         __unix_insert_socket(list, sk);
 284         spin_unlock(&unix_table_lock);
 285 }
 286
 287 static struct sock *__unix_find_socket_byname(struct net *net,
 288                                               struct sockaddr_un *sunname,
 289                                               int len, unsigned int hash)
 290 {
 291         struct sock *s;
 292
 293         sk_for_each(s, &unix_socket_table[hash]) {
 294                 struct unix_sock *u = unix_sk(s);
 295
 296                 if (!net_eq(sock_net(s), net))
 297                         continue;
 298
 299                 if (u->addr->len == len &&
 300                     !memcmp(u->addr->name, sunname, len))
 301                         return s;
 302         }
 303         return NULL;
 304 }
 305
 306 static inline struct sock *unix_find_socket_byname(struct net *net,
 307                                                    struct sockaddr_un *sunname,
 308                                                    int len, unsigned int hash)
 309 {
 310         struct sock *s;
 311
 312         spin_lock(&unix_table_lock);
 313         s = __unix_find_socket_byname(net, sunname, len, hash);
 314         if (s)
 315                 sock_hold(s);
 316         spin_unlock(&unix_table_lock);
 317         return s;
 318 }
 319
 320 static struct sock *unix_find_socket_byinode(struct inode *i)
 321 {
 322         struct sock *s;
 323
 324         spin_lock(&unix_table_lock);
 325         sk_for_each(s,
 326                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 327                 struct dentry *dentry = unix_sk(s)->path.dentry;
 328
 329                 if (dentry && d_backing_inode(dentry) == i) {
 330                         sock_hold(s);
 331                         goto found;
 332                 }
 333         }
 334         s = NULL;
 335 found:
 336         spin_unlock(&unix_table_lock);
 337         return s;
 338 }
 339
 340 /* Support code for asymmetrically connected dgram sockets
 341  *
 342  * If a datagram socket is connected to a socket not itself connected
 343  * to the first socket (eg, /dev/log), clients may only enqueue more
 344  * messages if the present receive queue of the server socket is not
 345  * "too large". This means there's a second writeability condition
 346  * poll and sendmsg need to test. The dgram recv code will do a wake
 347  * up on the peer_wait wait queue of a socket upon reception of a
 348  * datagram which needs to be propagated to sleeping would-be writers
 349  * since these might not have sent anything so far. This can't be
 350  * accomplished via poll_wait because the lifetime of the server
 351  * socket might be less than that of its clients if these break their
 352  * association with it or if the server socket is closed while clients
 353  * are still connected to it and there's no way to inform "a polling
 354  * implementation" that it should let go of a certain wait queue
 355  *
 356  * In order to propagate a wake up, a wait_queue_entry_t of the client
 357  * socket is enqueued on the peer_wait queue of the server socket
 358  * whose wake function does a wake_up on the ordinary client socket
 359  * wait queue. This connection is established whenever a write (or
 360  * poll for write) hit the flow control condition and broken when the
 361  * association to the server socket is dissolved or after a wake up
 362  * was relayed.
 363  */
 364
 365 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 366                                       void *key)
 367 {
 368         struct unix_sock *u;
 369         wait_queue_head_t *u_sleep;
 370
 371         u = container_of(q, struct unix_sock, peer_wake);
 372
 373         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 374                             q);
 375         u->peer_wake.private = NULL;
 376
 377         /* relaying can only happen while the wq still exists */
 378         u_sleep = sk_sleep(&u->sk);
 379         if (u_sleep)
 380                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 381
 382         return 0;
 383 }
 384
 385 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 386 {
 387         struct unix_sock *u, *u_other;
 388         int rc;
 389
 390         u = unix_sk(sk);
 391         u_other = unix_sk(other);
 392         rc = 0;
 393         spin_lock(&u_other->peer_wait.lock);
 394
 395         if (!u->peer_wake.private) {
 396                 u->peer_wake.private = other;
 397                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 398
 399                 rc = 1;
 400         }
 401
 402         spin_unlock(&u_other->peer_wait.lock);
 403         return rc;
 404 }
 405
 406 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 407                                             struct sock *other)
 408 {
 409         struct unix_sock *u, *u_other;
 410
 411         u = unix_sk(sk);
 412         u_other = unix_sk(other);
 413         spin_lock(&u_other->peer_wait.lock);
 414
 415         if (u->peer_wake.private == other) {
 416                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 417                 u->peer_wake.private = NULL;
 418         }
 419
 420         spin_unlock(&u_other->peer_wait.lock);
 421 }
 422
 423 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 424                                                    struct sock *other)
 425 {
 426         unix_dgram_peer_wake_disconnect(sk, other);
 427         wake_up_interruptible_poll(sk_sleep(sk),
 428                                    EPOLLOUT |
 429                                    EPOLLWRNORM |
 430                                    EPOLLWRBAND);
 431 }
 432
 433 /* preconditions:
 434  *      - unix_peer(sk) == other
 435  *      - association is stable
 436  */
 437 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 438 {
 439         int connected;
 440
 441         connected = unix_dgram_peer_wake_connect(sk, other);
 442
 443         /* If other is SOCK_DEAD, we want to make sure we signal
 444          * POLLOUT, such that a subsequent write() can get a
 445          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 446          * to other and its full, we will hang waiting for POLLOUT.
 447          */
 448         if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
 449                 return 1;
 450
 451         if (connected)
 452                 unix_dgram_peer_wake_disconnect(sk, other);
 453
 454         return 0;
 455 }
 456
 457 static int unix_writable(const struct sock *sk)
 458 {
 459         return sk->sk_state != TCP_LISTEN &&
 460                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 461 }
 462
 463 static void unix_write_space(struct sock *sk)
 464 {
 465         struct socket_wq *wq;
 466
 467         rcu_read_lock();
 468         if (unix_writable(sk)) {
 469                 wq = rcu_dereference(sk->sk_wq);
 470                 if (skwq_has_sleeper(wq))
 471                         wake_up_interruptible_sync_poll(&wq->wait,
 472                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 473                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 474         }
 475         rcu_read_unlock();
 476 }
 477
 478 /* When dgram socket disconnects (or changes its peer), we clear its receive
 479  * queue of packets arrived from previous peer. First, it allows to do
 480  * flow control based only on wmem_alloc; second, sk connected to peer
 481  * may receive messages only from that peer. */
 482 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 483 {
 484         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 485                 skb_queue_purge(&sk->sk_receive_queue);
 486                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 487
 488                 /* If one link of bidirectional dgram pipe is disconnected,
 489                  * we signal error. Messages are lost. Do not make this,
 490                  * when peer was not connected to us.
 491                  */
 492                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 493                         other->sk_err = ECONNRESET;
 494                         sk_error_report(other);
 495                 }
 496         }
 497         sk->sk_state = other->sk_state = TCP_CLOSE;
 498 }
 499
 500 static void unix_sock_destructor(struct sock *sk)
 501 {
 502         struct unix_sock *u = unix_sk(sk);
 503
 504         skb_queue_purge(&sk->sk_receive_queue);
 505
 506 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 507         if (u->oob_skb) {
 508                 kfree_skb(u->oob_skb);
 509                 u->oob_skb = NULL;
 510         }
 511 #endif
 512         WARN_ON(refcount_read(&sk->sk_wmem_alloc));
 513         WARN_ON(!sk_unhashed(sk));
 514         WARN_ON(sk->sk_socket);
 515         if (!sock_flag(sk, SOCK_DEAD)) {
 516                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 517                 return;
 518         }
 519
 520         if (u->addr)
 521                 unix_release_addr(u->addr);
 522
 523         atomic_long_dec(&unix_nr_socks);
 524         local_bh_disable();
 525         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 526         local_bh_enable();
 527 #ifdef UNIX_REFCNT_DEBUG
 528         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 529                 atomic_long_read(&unix_nr_socks));
 530 #endif
 531 }
 532
 533 static void unix_release_sock(struct sock *sk, int embrion)
 534 {
 535         struct unix_sock *u = unix_sk(sk);
 536         struct path path;
 537         struct sock *skpair;
 538         struct sk_buff *skb;
 539         int state;
 540
 541         unix_remove_socket(sk);
 542
 543         /* Clear state */
 544         unix_state_lock(sk);
 545         sock_orphan(sk);
 546         sk->sk_shutdown = SHUTDOWN_MASK;
 547         path         = u->path;
 548         u->path.dentry = NULL;
 549         u->path.mnt = NULL;
 550         state = sk->sk_state;
 551         sk->sk_state = TCP_CLOSE;
 552
 553         skpair = unix_peer(sk);
 554         unix_peer(sk) = NULL;
 555
 556         unix_state_unlock(sk);
 557
 558         wake_up_interruptible_all(&u->peer_wait);
 559
 560         if (skpair != NULL) {
 561                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 562                         unix_state_lock(skpair);
 563                         /* No more writes */
 564                         skpair->sk_shutdown = SHUTDOWN_MASK;
 565                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 566                                 skpair->sk_err = ECONNRESET;
 567                         unix_state_unlock(skpair);
 568                         skpair->sk_state_change(skpair);
 569                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 570                 }
 571
 572                 unix_dgram_peer_wake_disconnect(sk, skpair);
 573                 sock_put(skpair); /* It may now die */
 574         }
 575
 576         /* Try to flush out this socket. Throw out buffers at least */
 577
 578         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 579                 if (state == TCP_LISTEN)
 580                         unix_release_sock(skb->sk, 1);
 581                 /* passed fds are erased in the kfree_skb hook        */
 582                 UNIXCB(skb).consumed = skb->len;
 583                 kfree_skb(skb);
 584         }
 585
 586         if (path.dentry)
 587                 path_put(&path);
 588
 589         sock_put(sk);
 590
 591         /* ---- Socket is dead now and most probably destroyed ---- */
 592
 593         /*
 594          * Fixme: BSD difference: In BSD all sockets connected to us get
 595          *        ECONNRESET and we die on the spot. In Linux we behave
 596          *        like files and pipes do and wait for the last
 597          *        dereference.
 598          *
 599          * Can't we simply set sock->err?
 600          *
 601          *        What the above comment does talk about? --ANK(980817)
 602          */
 603
 604         if (unix_tot_inflight)
 605                 unix_gc();              /* Garbage collect fds */
 606 }
 607
 608 static void init_peercred(struct sock *sk)
 609 {
 610         put_pid(sk->sk_peer_pid);
 611         if (sk->sk_peer_cred)
 612                 put_cred(sk->sk_peer_cred);
 613         sk->sk_peer_pid  = get_pid(task_tgid(current));
 614         sk->sk_peer_cred = get_current_cred();
 615 }
 616
 617 static void copy_peercred(struct sock *sk, struct sock *peersk)
 618 {
 619         put_pid(sk->sk_peer_pid);
 620         if (sk->sk_peer_cred)
 621                 put_cred(sk->sk_peer_cred);
 622         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 623         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 624 }
 625
 626 static int unix_listen(struct socket *sock, int backlog)
 627 {
 628         int err;
 629         struct sock *sk = sock->sk;
 630         struct unix_sock *u = unix_sk(sk);
 631
 632         err = -EOPNOTSUPP;
 633         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 634                 goto out;       /* Only stream/seqpacket sockets accept */
 635         err = -EINVAL;
 636         if (!u->addr)
 637                 goto out;       /* No listens on an unbound socket */
 638         unix_state_lock(sk);
 639         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 640                 goto out_unlock;
 641         if (backlog > sk->sk_max_ack_backlog)
 642                 wake_up_interruptible_all(&u->peer_wait);
 643         sk->sk_max_ack_backlog  = backlog;
 644         sk->sk_state            = TCP_LISTEN;
 645         /* set credentials so connect can copy them */
 646         init_peercred(sk);
 647         err = 0;
 648
 649 out_unlock:
 650         unix_state_unlock(sk);
 651 out:
 652         return err;
 653 }
 654
 655 static int unix_release(struct socket *);
 656 static int unix_bind(struct socket *, struct sockaddr *, int);
 657 static int unix_stream_connect(struct socket *, struct sockaddr *,
 658                                int addr_len, int flags);
 659 static int unix_socketpair(struct socket *, struct socket *);
 660 static int unix_accept(struct socket *, struct socket *, int, bool);
 661 static int unix_getname(struct socket *, struct sockaddr *, int);
 662 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 663 static __poll_t unix_dgram_poll(struct file *, struct socket *,
 664                                     poll_table *);
 665 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 666 #ifdef CONFIG_COMPAT
 667 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 668 #endif
 669 static int unix_shutdown(struct socket *, int);
 670 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 671 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 672 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
 673                                     size_t size, int flags);
 674 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 675                                        struct pipe_inode_info *, size_t size,
 676                                        unsigned int flags);
 677 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 678 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 679 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
 680                           sk_read_actor_t recv_actor);
 681 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 682                               int, int);
 683 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 684 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 685                                   int);
 686
 687 static int unix_set_peek_off(struct sock *sk, int val)
 688 {
 689         struct unix_sock *u = unix_sk(sk);
 690
 691         if (mutex_lock_interruptible(&u->iolock))
 692                 return -EINTR;
 693
 694         sk->sk_peek_off = val;
 695         mutex_unlock(&u->iolock);
 696
 697         return 0;
 698 }
 699
 700 #ifdef CONFIG_PROC_FS
 701 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 702 {
 703         struct sock *sk = sock->sk;
 704         struct unix_sock *u;
 705
 706         if (sk) {
 707                 u = unix_sk(sock->sk);
 708                 seq_printf(m, "scm_fds: %u\n",
 709                            atomic_read(&u->scm_stat.nr_fds));
 710         }
 711 }
 712 #else
 713 #define unix_show_fdinfo NULL
 714 #endif
 715
 716 static const struct proto_ops unix_stream_ops = {
 717         .family =       PF_UNIX,
 718         .owner =        THIS_MODULE,
 719         .release =      unix_release,
 720         .bind =         unix_bind,
 721         .connect =      unix_stream_connect,
 722         .socketpair =   unix_socketpair,
 723         .accept =       unix_accept,
 724         .getname =      unix_getname,
 725         .poll =         unix_poll,
 726         .ioctl =        unix_ioctl,
 727 #ifdef CONFIG_COMPAT
 728         .compat_ioctl = unix_compat_ioctl,
 729 #endif
 730         .listen =       unix_listen,
 731         .shutdown =     unix_shutdown,
 732         .sendmsg =      unix_stream_sendmsg,
 733         .recvmsg =      unix_stream_recvmsg,
 734         .mmap =         sock_no_mmap,
 735         .sendpage =     unix_stream_sendpage,
 736         .splice_read =  unix_stream_splice_read,
 737         .set_peek_off = unix_set_peek_off,
 738         .show_fdinfo =  unix_show_fdinfo,
 739 };
 740
 741 static const struct proto_ops unix_dgram_ops = {
 742         .family =       PF_UNIX,
 743         .owner =        THIS_MODULE,
 744         .release =      unix_release,
 745         .bind =         unix_bind,
 746         .connect =      unix_dgram_connect,
 747         .socketpair =   unix_socketpair,
 748         .accept =       sock_no_accept,
 749         .getname =      unix_getname,
 750         .poll =         unix_dgram_poll,
 751         .ioctl =        unix_ioctl,
 752 #ifdef CONFIG_COMPAT
 753         .compat_ioctl = unix_compat_ioctl,
 754 #endif
 755         .listen =       sock_no_listen,
 756         .shutdown =     unix_shutdown,
 757         .sendmsg =      unix_dgram_sendmsg,
 758         .read_sock =    unix_read_sock,
 759         .recvmsg =      unix_dgram_recvmsg,
 760         .mmap =         sock_no_mmap,
 761         .sendpage =     sock_no_sendpage,
 762         .set_peek_off = unix_set_peek_off,
 763         .show_fdinfo =  unix_show_fdinfo,
 764 };
 765
 766 static const struct proto_ops unix_seqpacket_ops = {
 767         .family =       PF_UNIX,
 768         .owner =        THIS_MODULE,
 769         .release =      unix_release,
 770         .bind =         unix_bind,
 771         .connect =      unix_stream_connect,
 772         .socketpair =   unix_socketpair,
 773         .accept =       unix_accept,
 774         .getname =      unix_getname,
 775         .poll =         unix_dgram_poll,
 776         .ioctl =        unix_ioctl,
 777 #ifdef CONFIG_COMPAT
 778         .compat_ioctl = unix_compat_ioctl,
 779 #endif
 780         .listen =       unix_listen,
 781         .shutdown =     unix_shutdown,
 782         .sendmsg =      unix_seqpacket_sendmsg,
 783         .recvmsg =      unix_seqpacket_recvmsg,
 784         .mmap =         sock_no_mmap,
 785         .sendpage =     sock_no_sendpage,
 786         .set_peek_off = unix_set_peek_off,
 787         .show_fdinfo =  unix_show_fdinfo,
 788 };
 789
 790 static void unix_close(struct sock *sk, long timeout)
 791 {
 792         /* Nothing to do here, unix socket does not need a ->close().
 793          * This is merely for sockmap.
 794          */
 795 }
 796
 797 struct proto unix_proto = {
 798         .name                   = "UNIX",
 799         .owner                  = THIS_MODULE,
 800         .obj_size               = sizeof(struct unix_sock),
 801         .close                  = unix_close,
 802 #ifdef CONFIG_BPF_SYSCALL
 803         .psock_update_sk_prot   = unix_bpf_update_proto,
 804 #endif
 805 };
 806
 807 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 808 {
 809         struct sock *sk = NULL;
 810         struct unix_sock *u;
 811
 812         atomic_long_inc(&unix_nr_socks);
 813         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 814                 goto out;
 815
 816         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 817         if (!sk)
 818                 goto out;
 819
 820         sock_init_data(sock, sk);
 821
 822         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 823         sk->sk_write_space      = unix_write_space;
 824         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 825         sk->sk_destruct         = unix_sock_destructor;
 826         u         = unix_sk(sk);
 827         u->path.dentry = NULL;
 828         u->path.mnt = NULL;
 829         spin_lock_init(&u->lock);
 830         atomic_long_set(&u->inflight, 0);
 831         INIT_LIST_HEAD(&u->link);
 832         mutex_init(&u->iolock); /* single task reading lock */
 833         mutex_init(&u->bindlock); /* single task binding lock */
 834         init_waitqueue_head(&u->peer_wait);
 835         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 836         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
 837         unix_insert_socket(unix_sockets_unbound(sk), sk);
 838 out:
 839         if (sk == NULL)
 840                 atomic_long_dec(&unix_nr_socks);
 841         else {
 842                 local_bh_disable();
 843                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 844                 local_bh_enable();
 845         }
 846         return sk;
 847 }
 848
 849 static int unix_create(struct net *net, struct socket *sock, int protocol,
 850                        int kern)
 851 {
 852         if (protocol && protocol != PF_UNIX)
 853                 return -EPROTONOSUPPORT;
 854
 855         sock->state = SS_UNCONNECTED;
 856
 857         switch (sock->type) {
 858         case SOCK_STREAM:
 859                 sock->ops = &unix_stream_ops;
 860                 break;
 861                 /*
 862                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 863                  *      nothing uses it.
 864                  */
 865         case SOCK_RAW:
 866                 sock->type = SOCK_DGRAM;
 867                 fallthrough;
 868         case SOCK_DGRAM:
 869                 sock->ops = &unix_dgram_ops;
 870                 break;
 871         case SOCK_SEQPACKET:
 872                 sock->ops = &unix_seqpacket_ops;
 873                 break;
 874         default:
 875                 return -ESOCKTNOSUPPORT;
 876         }
 877
 878         return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 879 }
 880
 881 static int unix_release(struct socket *sock)
 882 {
 883         struct sock *sk = sock->sk;
 884
 885         if (!sk)
 886                 return 0;
 887
 888         sk->sk_prot->close(sk, 0);
 889         unix_release_sock(sk, 0);
 890         sock->sk = NULL;
 891
 892         return 0;
 893 }
 894
 895 static int unix_autobind(struct socket *sock)
 896 {
 897         struct sock *sk = sock->sk;
 898         struct net *net = sock_net(sk);
 899         struct unix_sock *u = unix_sk(sk);
 900         static u32 ordernum = 1;
 901         struct unix_address *addr;
 902         int err;
 903         unsigned int retries = 0;
 904
 905         err = mutex_lock_interruptible(&u->bindlock);
 906         if (err)
 907                 return err;
 908
 909         if (u->addr)
 910                 goto out;
 911
 912         err = -ENOMEM;
 913         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 914         if (!addr)
 915                 goto out;
 916
 917         addr->name->sun_family = AF_UNIX;
 918         refcount_set(&addr->refcnt, 1);
 919
 920 retry:
 921         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 922         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 923         addr->hash ^= sk->sk_type;
 924
 925         spin_lock(&unix_table_lock);
 926         ordernum = (ordernum+1)&0xFFFFF;
 927
 928         if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) {
 929                 spin_unlock(&unix_table_lock);
 930                 /*
 931                  * __unix_find_socket_byname() may take long time if many names
 932                  * are already in use.
 933                  */
 934                 cond_resched();
 935                 /* Give up if all names seems to be in use. */
 936                 if (retries++ == 0xFFFFF) {
 937                         err = -ENOSPC;
 938                         kfree(addr);
 939                         goto out;
 940                 }
 941                 goto retry;
 942         }
 943
 944         __unix_set_addr(sk, addr, addr->hash);
 945         spin_unlock(&unix_table_lock);
 946         err = 0;
 947
 948 out:    mutex_unlock(&u->bindlock);
 949         return err;
 950 }
 951
 952 static struct sock *unix_find_other(struct net *net,
 953                                     struct sockaddr_un *sunname, int len,
 954                                     int type, unsigned int hash, int *error)
 955 {
 956         struct sock *u;
 957         struct path path;
 958         int err = 0;
 959
 960         if (sunname->sun_path[0]) {
 961                 struct inode *inode;
 962                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 963                 if (err)
 964                         goto fail;
 965                 inode = d_backing_inode(path.dentry);
 966                 err = path_permission(&path, MAY_WRITE);
 967                 if (err)
 968                         goto put_fail;
 969
 970                 err = -ECONNREFUSED;
 971                 if (!S_ISSOCK(inode->i_mode))
 972                         goto put_fail;
 973                 u = unix_find_socket_byinode(inode);
 974                 if (!u)
 975                         goto put_fail;
 976
 977                 if (u->sk_type == type)
 978                         touch_atime(&path);
 979
 980                 path_put(&path);
 981
 982                 err = -EPROTOTYPE;
 983                 if (u->sk_type != type) {
 984                         sock_put(u);
 985                         goto fail;
 986                 }
 987         } else {
 988                 err = -ECONNREFUSED;
 989                 u = unix_find_socket_byname(net, sunname, len, type ^ hash);
 990                 if (u) {
 991                         struct dentry *dentry;
 992                         dentry = unix_sk(u)->path.dentry;
 993                         if (dentry)
 994                                 touch_atime(&unix_sk(u)->path);
 995                 } else
 996                         goto fail;
 997         }
 998         return u;
 999
1000 put_fail:
1001         path_put(&path);
1002 fail:
1003         *error = err;
1004         return NULL;
1005 }
1006
1007 static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
1008 {
1009         struct unix_sock *u = unix_sk(sk);
1010         umode_t mode = S_IFSOCK |
1011                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1012         struct user_namespace *ns; // barf...
1013         struct path parent;
1014         struct dentry *dentry;
1015         unsigned int hash;
1016         int err;
1017
1018         /*
1019          * Get the parent directory, calculate the hash for last
1020          * component.
1021          */
1022         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1023         if (IS_ERR(dentry))
1024                 return PTR_ERR(dentry);
1025         ns = mnt_user_ns(parent.mnt);
1026
1027         /*
1028          * All right, let's create it.
1029          */
1030         err = security_path_mknod(&parent, dentry, mode, 0);
1031         if (!err)
1032                 err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
1033         if (err)
1034                 goto out;
1035         err = mutex_lock_interruptible(&u->bindlock);
1036         if (err)
1037                 goto out_unlink;
1038         if (u->addr)
1039                 goto out_unlock;
1040
1041         addr->hash = UNIX_HASH_SIZE;
1042         hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1043         spin_lock(&unix_table_lock);
1044         u->path.mnt = mntget(parent.mnt);
1045         u->path.dentry = dget(dentry);
1046         __unix_set_addr(sk, addr, hash);
1047         spin_unlock(&unix_table_lock);
1048         mutex_unlock(&u->bindlock);
1049         done_path_create(&parent, dentry);
1050         return 0;
1051
1052 out_unlock:
1053         mutex_unlock(&u->bindlock);
1054         err = -EINVAL;
1055 out_unlink:
1056         /* failed after successful mknod?  unlink what we'd created... */
1057         vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
1058 out:
1059         done_path_create(&parent, dentry);
1060         return err;
1061 }
1062
1063 static int unix_bind_abstract(struct sock *sk, struct unix_address *addr)
1064 {
1065         struct unix_sock *u = unix_sk(sk);
1066         int err;
1067
1068         err = mutex_lock_interruptible(&u->bindlock);
1069         if (err)
1070                 return err;
1071
1072         if (u->addr) {
1073                 mutex_unlock(&u->bindlock);
1074                 return -EINVAL;
1075         }
1076
1077         spin_lock(&unix_table_lock);
1078         if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
1079                                       addr->hash)) {
1080                 spin_unlock(&unix_table_lock);
1081                 mutex_unlock(&u->bindlock);
1082                 return -EADDRINUSE;
1083         }
1084         __unix_set_addr(sk, addr, addr->hash);
1085         spin_unlock(&unix_table_lock);
1086         mutex_unlock(&u->bindlock);
1087         return 0;
1088 }
1089
1090 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1091 {
1092         struct sock *sk = sock->sk;
1093         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1094         char *sun_path = sunaddr->sun_path;
1095         int err;
1096         unsigned int hash;
1097         struct unix_address *addr;
1098
1099         if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
1100             sunaddr->sun_family != AF_UNIX)
1101                 return -EINVAL;
1102
1103         if (addr_len == sizeof(short))
1104                 return unix_autobind(sock);
1105
1106         err = unix_mkname(sunaddr, addr_len, &hash);
1107         if (err < 0)
1108                 return err;
1109         addr_len = err;
1110         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1111         if (!addr)
1112                 return -ENOMEM;
1113
1114         memcpy(addr->name, sunaddr, addr_len);
1115         addr->len = addr_len;
1116         addr->hash = hash ^ sk->sk_type;
1117         refcount_set(&addr->refcnt, 1);
1118
1119         if (sun_path[0])
1120                 err = unix_bind_bsd(sk, addr);
1121         else
1122                 err = unix_bind_abstract(sk, addr);
1123         if (err)
1124                 unix_release_addr(addr);
1125         return err == -EEXIST ? -EADDRINUSE : err;
1126 }
1127
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * Only @sk1 is locked when @sk2 is NULL or both arguments are the same
 * sock.  Otherwise the sock at the lower address is locked first and
 * the other one with the nested variant, giving a fixed lock order.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1142
/*
 * Counterpart of unix_state_double_lock(): release @sk1's state lock
 * and, when @sk2 is a distinct non-NULL sock, @sk2's as well.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (unlikely(sk1 == sk2) || !sk2)
		return;

	unix_state_unlock(sk2);
}
1152
1153 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1154                               int alen, int flags)
1155 {
1156         struct sock *sk = sock->sk;
1157         struct net *net = sock_net(sk);
1158         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1159         struct sock *other;
1160         unsigned int hash;
1161         int err;
1162
1163         err = -EINVAL;
1164         if (alen < offsetofend(struct sockaddr, sa_family))
1165                 goto out;
1166
1167         if (addr->sa_family != AF_UNSPEC) {
1168                 err = unix_mkname(sunaddr, alen, &hash);
1169                 if (err < 0)
1170                         goto out;
1171                 alen = err;
1172
1173                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1174                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1175                         goto out;
1176
1177 restart:
1178                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1179                 if (!other)
1180                         goto out;
1181
1182                 unix_state_double_lock(sk, other);
1183
1184                 /* Apparently VFS overslept socket death. Retry. */
1185                 if (sock_flag(other, SOCK_DEAD)) {
1186                         unix_state_double_unlock(sk, other);
1187                         sock_put(other);
1188                         goto restart;
1189                 }
1190
1191                 err = -EPERM;
1192                 if (!unix_may_send(sk, other))
1193                         goto out_unlock;
1194
1195                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1196                 if (err)
1197                         goto out_unlock;
1198
1199         } else {
1200                 /*
1201                  *      1003.1g breaking connected state with AF_UNSPEC
1202                  */
1203                 other = NULL;
1204                 unix_state_double_lock(sk, other);
1205         }
1206
1207         /*
1208          * If it was connected, reconnect.
1209          */
1210         if (unix_peer(sk)) {
1211                 struct sock *old_peer = unix_peer(sk);
1212                 unix_peer(sk) = other;
1213                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1214
1215                 unix_state_double_unlock(sk, other);
1216
1217                 if (other != old_peer)
1218                         unix_dgram_disconnected(sk, old_peer);
1219                 sock_put(old_peer);
1220         } else {
1221                 unix_peer(sk) = other;
1222                 unix_state_double_unlock(sk, other);
1223         }
1224
1225         if (unix_peer(sk))
1226                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1227         return 0;
1228
1229 out_unlock:
1230         unix_state_double_unlock(sk, other);
1231         sock_put(other);
1232 out:
1233         return err;
1234 }
1235
1236 static long unix_wait_for_peer(struct sock *other, long timeo)
1237         __releases(&unix_sk(other)->lock)
1238 {
1239         struct unix_sock *u = unix_sk(other);
1240         int sched;
1241         DEFINE_WAIT(wait);
1242
1243         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1244
1245         sched = !sock_flag(other, SOCK_DEAD) &&
1246                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1247                 unix_recvq_full(other);
1248
1249         unix_state_unlock(other);
1250
1251         if (sched)
1252                 timeo = schedule_timeout(timeo);
1253
1254         finish_wait(&u->peer_wait, &wait);
1255         return timeo;
1256 }
1257
/*
 * ->connect() handler shared by the stream and seqpacket ops tables.
 *
 * A new sock (the future server-side end) and a one-byte skb are
 * allocated up front.  The listener is then looked up and locked, the
 * connecting sock is locked nested inside it, the two ends are wired
 * together, and the skb (owned by the new sock) is queued on the
 * listener's receive queue for unix_accept() to pick up.
 *
 * Returns 0 on success.  Errors set in this function: -ENOMEM
 * (allocation), -ECONNREFUSED (peer not listening or shut down for
 * receive), -EAGAIN (backlog full and no timeout left), -EISCONN
 * (already connected), -EINVAL (socket in any other state), the value
 * of sock_intr_errno() when a signal interrupts the wait, plus
 * whatever unix_mkname(), unix_autobind(), unix_find_other() or the
 * security hook report.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	long timeo;
	int err;
	int st;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sock);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/*
	 * Allocate everything before any state lock is taken.  Doing it
	 * with the state locked would force us to recheck everything
	 * afterwards anyway.
	 */
	err = -ENOMEM;

	/* The sock that will become the accepted end of the connection. */
	newsk = unix_create1(sock_net(sk), NULL, 0);
	if (!newsk)
		goto out;

	/* The skb that announces the connection to the listener. */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb)
		goto out;

restart:
	/* Locate the listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Freeze the peer's state while we inspect it. */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN ||
	    (other->sk_shutdown & RCV_SHUTDOWN))
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops other's state lock before sleeping. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/*
	 * Now our own state.
	 *
	 * This is the delicate part: our state lock has to be taken
	 * while the peer's is still held, which could deadlock.
	 * Connecting to ourselves and two sockets connecting to each
	 * other at the same time are ruled out by looking at the state
	 * first: other is TCP_LISTEN, and if sk were TCP_LISTEN too we
	 * bail out below before trying to take its lock.
	 *
	 * The state is sampled unlocked here and rechecked once the
	 * lock is held.
	 */
	st = sk->sk_state;
	if (st == TCP_ESTABLISHED) {
		/* Socket is already connected. */
		err = -EISCONN;
		goto out_unlock;
	}
	if (st != TCP_CLOSE) {
		err = -EINVAL;
		goto out_unlock;
	}
	/* st == TCP_CLOSE: carry on with the connect. */

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		/* State changed under us: start over. */
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* Nothing can fail from here on: fill in the new sock. */
	sock_hold(sk);
	unix_peer(newsk) = sk;
	newsk->sk_state = TCP_ESTABLISHED;
	newsk->sk_type = sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/*
	 * Give the new sock the listener's address and path.
	 *
	 * *(otheru->addr) and otheru->path are fully set up by the time
	 * we get here: otheru was found in the hash under
	 * unix_table_lock, and its insertion there happened in an
	 * earlier critical section under the same lock - the one that
	 * also stored the address contents, otheru->path and
	 * otheru->addr itself.
	 *
	 * Publishing newu->addr with smp_store_release() makes those
	 * stores, and the store to newu->path, visible to anyone who
	 * reads newu->addr with smp_load_acquire().  That is the same
	 * guarantee given for socks bound via unix_bind() or
	 * unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Our peer credentials are those of the listener. */
	copy_peercred(sk, other);

	sock->state = SS_CONNECTED;
	sk->sk_state = TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk) = newsk;

	unix_state_unlock(sk);

	/* Hand the announcement skb to the listener and wake it up. */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1446
1447 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1448 {
1449         struct sock *ska = socka->sk, *skb = sockb->sk;
1450
1451         /* Join our sockets back to back */
1452         sock_hold(ska);
1453         sock_hold(skb);
1454         unix_peer(ska) = skb;
1455         unix_peer(skb) = ska;
1456         init_peercred(ska);
1457         init_peercred(skb);
1458
1459         ska->sk_state = TCP_ESTABLISHED;
1460         skb->sk_state = TCP_ESTABLISHED;
1461         socka->state  = SS_CONNECTED;
1462         sockb->state  = SS_CONNECTED;
1463         return 0;
1464 }
1465
1466 static void unix_sock_inherit_flags(const struct socket *old,
1467                                     struct socket *new)
1468 {
1469         if (test_bit(SOCK_PASSCRED, &old->flags))
1470                 set_bit(SOCK_PASSCRED, &new->flags);
1471         if (test_bit(SOCK_PASSSEC, &old->flags))
1472                 set_bit(SOCK_PASSSEC, &new->flags);
1473 }
1474
1475 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1476                        bool kern)
1477 {
1478         struct sock *sk = sock->sk;
1479         struct sock *tsk;
1480         struct sk_buff *skb;
1481         int err;
1482
1483         err = -EOPNOTSUPP;
1484         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1485                 goto out;
1486
1487         err = -EINVAL;
1488         if (sk->sk_state != TCP_LISTEN)
1489                 goto out;
1490
1491         /* If socket state is TCP_LISTEN it cannot change (for now...),
1492          * so that no locks are necessary.
1493          */
1494
1495         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1496         if (!skb) {
1497                 /* This means receive shutdown. */
1498                 if (err == 0)
1499                         err = -EINVAL;
1500                 goto out;
1501         }
1502
1503         tsk = skb->sk;
1504         skb_free_datagram(sk, skb);
1505         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1506
1507         /* attach accepted sock to socket */
1508         unix_state_lock(tsk);
1509         newsock->state = SS_CONNECTED;
1510         unix_sock_inherit_flags(sock, newsock);
1511         sock_graft(tsk, newsock);
1512         unix_state_unlock(tsk);
1513         return 0;
1514
1515 out:
1516         return err;
1517 }
1518
1519
1520 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1521 {
1522         struct sock *sk = sock->sk;
1523         struct unix_address *addr;
1524         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1525         int err = 0;
1526
1527         if (peer) {
1528                 sk = unix_peer_get(sk);
1529
1530                 err = -ENOTCONN;
1531                 if (!sk)
1532                         goto out;
1533                 err = 0;
1534         } else {
1535                 sock_hold(sk);
1536         }
1537
1538         addr = smp_load_acquire(&unix_sk(sk)->addr);
1539         if (!addr) {
1540                 sunaddr->sun_family = AF_UNIX;
1541                 sunaddr->sun_path[0] = 0;
1542                 err = sizeof(short);
1543         } else {
1544                 err = addr->len;
1545                 memcpy(sunaddr, addr->name, addr->len);
1546         }
1547         sock_put(sk);
1548 out:
1549         return err;
1550 }
1551
1552 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1553 {
1554         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1555
1556         /*
1557          * Garbage collection of unix sockets starts by selecting a set of
1558          * candidate sockets which have reference only from being in flight
1559          * (total_refs == inflight_refs).  This condition is checked once during
1560          * the candidate collection phase, and candidates are marked as such, so
1561          * that non-candidates can later be ignored.  While inflight_refs is
1562          * protected by unix_gc_lock, total_refs (file count) is not, hence this
1563          * is an instantaneous decision.
1564          *
1565          * Once a candidate, however, the socket must not be reinstalled into a
1566          * file descriptor while the garbage collection is in progress.
1567          *
1568          * If the above conditions are met, then the directed graph of
1569          * candidates (*) does not change while unix_gc_lock is held.
1570          *
1571          * Any operations that changes the file count through file descriptors
1572          * (dup, close, sendmsg) does not change the graph since candidates are
1573          * not installed in fds.
1574          *
1575          * Dequeing a candidate via recvmsg would install it into an fd, but
1576          * that takes unix_gc_lock to decrement the inflight count, so it's
1577          * serialized with garbage collection.
1578          *
1579          * MSG_PEEK is special in that it does not change the inflight count,
1580          * yet does install the socket into an fd.  The following lock/unlock
1581          * pair is to ensure serialization with garbage collection.  It must be
1582          * done between incrementing the file count and installing the file into
1583          * an fd.
1584          *
1585          * If garbage collection starts after the barrier provided by the
1586          * lock/unlock, then it will see the elevated refcount and not mark this
1587          * as a candidate.  If a garbage collection is already in progress
1588          * before the file count was incremented, then the lock/unlock pair will
1589          * ensure that garbage collection is finished before progressing to
1590          * installing the fd.
1591          *
1592          * (*) A -> B where B is on the queue of A or B is on the queue of C
1593          * which is on the queue of listening socket A.
1594          */
1595         spin_lock(&unix_gc_lock);
1596         spin_unlock(&unix_gc_lock);
1597 }
1598
1599 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1600 {
1601         int err = 0;
1602
1603         UNIXCB(skb).pid  = get_pid(scm->pid);
1604         UNIXCB(skb).uid = scm->creds.uid;
1605         UNIXCB(skb).gid = scm->creds.gid;
1606         UNIXCB(skb).fp = NULL;
1607         unix_get_secdata(scm, skb);
1608         if (scm->fp && send_fds)
1609                 err = unix_attach_fds(scm, skb);
1610
1611         skb->destructor = unix_destruct_scm;
1612         return err;
1613 }
1614
1615 static bool unix_passcred_enabled(const struct socket *sock,
1616                                   const struct sock *other)
1617 {
1618         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1619                !other->sk_socket ||
1620                test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1621 }
1622
1623 /*
1624  * Some apps rely on write() giving SCM_CREDENTIALS
1625  * We include credentials if source or destination socket
1626  * asserted SOCK_PASSCRED.
1627  */
1628 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1629                             const struct sock *other)
1630 {
1631         if (UNIXCB(skb).pid)
1632                 return;
1633         if (unix_passcred_enabled(sock, other)) {
1634                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1635                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1636         }
1637 }
1638
1639 static int maybe_init_creds(struct scm_cookie *scm,
1640                             struct socket *socket,
1641                             const struct sock *other)
1642 {
1643         int err;
1644         struct msghdr msg = { .msg_controllen = 0 };
1645
1646         err = scm_send(socket, &msg, scm, false);
1647         if (err)
1648                 return err;
1649
1650         if (unix_passcred_enabled(socket, other)) {
1651                 scm->pid = get_pid(task_tgid(current));
1652                 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1653         }
1654         return err;
1655 }
1656
1657 static bool unix_skb_scm_eq(struct sk_buff *skb,
1658                             struct scm_cookie *scm)
1659 {
1660         const struct unix_skb_parms *u = &UNIXCB(skb);
1661
1662         return u->pid == scm->pid &&
1663                uid_eq(u->uid, scm->creds.uid) &&
1664                gid_eq(u->gid, scm->creds.gid) &&
1665                unix_secdata_eq(scm, skb);
1666 }
1667
1668 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1669 {
1670         struct scm_fp_list *fp = UNIXCB(skb).fp;
1671         struct unix_sock *u = unix_sk(sk);
1672
1673         if (unlikely(fp && fp->count))
1674                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1675 }
1676
1677 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1678 {
1679         struct scm_fp_list *fp = UNIXCB(skb).fp;
1680         struct unix_sock *u = unix_sk(sk);
1681
1682         if (unlikely(fp && fp->count))
1683                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1684 }
1685
/*
 *      Send AF_UNIX data.
 *
 *      Datagram send path (also reached from unix_seqpacket_sendmsg()).
 *      Builds one skb holding the whole message and appends it to the
 *      receive queue of the destination socket, which is either looked up
 *      from msg->msg_name or taken from the connected peer.
 *
 *      Returns @len on success (including the case where the receiver's
 *      socket filter silently discards the packet) or a negative errno.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
                              size_t len)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct unix_sock *u = unix_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
        struct sock *other = NULL;
        int namelen = 0; /* fake GCC */
        int err;
        unsigned int hash;
        struct sk_buff *skb;
        long timeo;
        struct scm_cookie scm;
        int data_len = 0;
        int sk_locked;

        wait_for_unix_gc();
        /* Parse the control message into scm; released by scm_destroy() on every exit below. */
        err = scm_send(sock, msg, &scm, false);
        if (err < 0)
                return err;

        /* Out-of-band data is not supported on this path. */
        err = -EOPNOTSUPP;
        if (msg->msg_flags&MSG_OOB)
                goto out;

        if (msg->msg_namelen) {
                /* Explicit destination: validate it; the lookup happens at "restart". */
                err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
                if (err < 0)
                        goto out;
                namelen = err;
        } else {
                /* No address given: use the connected peer (reference dropped via sock_put() at "out"). */
                sunaddr = NULL;
                err = -ENOTCONN;
                other = unix_peer_get(sk);
                if (!other)
                        goto out;
        }

        /* A sender with SOCK_PASSCRED set and no bound address is autobound first. */
        if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
            && (err = unix_autobind(sock)) != 0)
                goto out;

        err = -EMSGSIZE;
        if (len > sk->sk_sndbuf - 32)
                goto out;

        /* Anything beyond SKB_MAX_ALLOC goes into page fragments, page-aligned. */
        if (len > SKB_MAX_ALLOC) {
                data_len = min_t(size_t,
                                 len - SKB_MAX_ALLOC,
                                 MAX_SKB_FRAGS * PAGE_SIZE);
                data_len = PAGE_ALIGN(data_len);

                BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
        }

        skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
                                   msg->msg_flags & MSG_DONTWAIT, &err,
                                   PAGE_ALLOC_COSTLY_ORDER);
        if (skb == NULL)
                goto out;

        err = unix_scm_to_skb(&scm, skb, true);
        if (err < 0)
                goto out_free;

        /* Linear part is len - data_len bytes; the rest lives in the paged area. */
        skb_put(skb, len - data_len);
        skb->data_len = data_len;
        skb->len = len;
        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
        if (err)
                goto out_free;

        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
        if (!other) {
                /* No destination left: only an explicit address can be looked up (again). */
                err = -ECONNRESET;
                if (sunaddr == NULL)
                        goto out_free;

                other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
                                        hash, &err);
                if (other == NULL)
                        goto out_free;
        }

        if (sk_filter(other, skb) < 0) {
                /* Toss the packet but do not return any error to the sender */
                err = len;
                goto out_free;
        }

        /* sk_locked tracks whether sk's state lock is held in addition to other's. */
        sk_locked = 0;
        unix_state_lock(other);
restart_locked:
        err = -EPERM;
        if (!unix_may_send(sk, other))
                goto out_unlock;

        if (unlikely(sock_flag(other, SOCK_DEAD))) {
                /*
                 *      Check with 1003.1g - what should
                 *      datagram error
                 */
                unix_state_unlock(other);
                sock_put(other);

                if (!sk_locked)
                        unix_state_lock(sk);

                err = 0;
                if (unix_peer(sk) == other) {
                        /* The dead socket was our connected peer: disconnect and report it. */
                        unix_peer(sk) = NULL;
                        unix_dgram_peer_wake_disconnect_wakeup(sk, other);

                        unix_state_unlock(sk);

                        unix_dgram_disconnected(sk, other);
                        sock_put(other);
                        err = -ECONNREFUSED;
                } else {
                        unix_state_unlock(sk);
                }

                /* Not our peer: retry the lookup by address from "restart". */
                other = NULL;
                if (err)
                        goto out_free;
                goto restart;
        }

        err = -EPIPE;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (sk->sk_type != SOCK_SEQPACKET) {
                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;
        }

        /* other == sk && unix_peer(other) != sk if
         * - unix_peer(sk) == NULL, destination address bound to sk
         * - unix_peer(sk) == sk by time of get but disconnected before lock
         */
        if (other != sk &&
            unlikely(unix_peer(other) != sk &&
            unix_recvq_full_lockless(other))) {
                if (timeo) {
                        /* NOTE(review): unix_wait_for_peer() presumably returns with
                         * other's state lock released (both exits below skip the
                         * unlock) - confirm against its definition.
                         */
                        timeo = unix_wait_for_peer(other, timeo);

                        err = sock_intr_errno(timeo);
                        if (signal_pending(current))
                                goto out_free;

                        goto restart;
                }

                /* Non-blocking: take both state locks before re-checking the peer. */
                if (!sk_locked) {
                        unix_state_unlock(other);
                        unix_state_double_lock(sk, other);
                }

                if (unix_peer(sk) != other ||
                    unix_dgram_peer_wake_me(sk, other)) {
                        err = -EAGAIN;
                        sk_locked = 1;
                        goto out_unlock;
                }

                /* Locks were dropped and re-taken above, so redo the checks once with both held. */
                if (!sk_locked) {
                        sk_locked = 1;
                        goto restart_locked;
                }
        }

        if (unlikely(sk_locked))
                unix_state_unlock(sk);

        /* Deliver: timestamp if requested, attach credentials, account fds, queue, wake the receiver. */
        if (sock_flag(other, SOCK_RCVTSTAMP))
                __net_timestamp(skb);
        maybe_add_creds(skb, sock, other);
        scm_stat_add(other, skb);
        skb_queue_tail(&other->sk_receive_queue, skb);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        scm_destroy(&scm);
        return len;

out_unlock:
        if (sk_locked)
                unix_state_unlock(sk);
        unix_state_unlock(other);
out_free:
        kfree_skb(skb);
out:
        if (other)
                sock_put(other);
        scm_destroy(&scm);
        return err;
}
1892
/* Stream sockets use paged skbs. UNIX_SKB_FRAGS_SZ is the cap applied to
 * the paged part of one skb: 32768 bytes, expressed as PAGE_SIZE shifted by
 * get_order(32768), i.e. a whole number of pages and never less than one
 * full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1897
1898 #if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
1899 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
1900 {
1901         struct unix_sock *ousk = unix_sk(other);
1902         struct sk_buff *skb;
1903         int err = 0;
1904
1905         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
1906
1907         if (!skb)
1908                 return err;
1909
1910         skb_put(skb, 1);
1911         skb->len = 1;
1912         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
1913
1914         if (err) {
1915                 kfree_skb(skb);
1916                 return err;
1917         }
1918
1919         unix_state_lock(other);
1920         maybe_add_creds(skb, sock, other);
1921         skb_get(skb);
1922
1923         if (ousk->oob_skb)
1924                 kfree_skb(ousk->oob_skb);
1925
1926         ousk->oob_skb = skb;
1927
1928         scm_stat_add(other, skb);
1929         skb_queue_tail(&other->sk_receive_queue, skb);
1930         sk_send_sigurg(other);
1931         unix_state_unlock(other);
1932         other->sk_data_ready(other);
1933
1934         return err;
1935 }
1936 #endif
1937
/*
 * Stream send path.
 *
 * Splits the payload into skbs and appends each to the connected peer's
 * receive queue. File descriptors from the control message are attached to
 * the first skb only. With MSG_OOB (and CONFIG_AF_UNIX_OOB enabled) the
 * last byte is held back from the in-band loop and handed to queue_oob().
 *
 * Returns the number of bytes sent if any were sent, otherwise a negative
 * errno. SIGPIPE is raised on a broken pipe only when nothing was sent and
 * MSG_NOSIGNAL is not set.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
                               size_t len)
{
        struct sock *sk = sock->sk;
        struct sock *other = NULL;
        int err, size;
        struct sk_buff *skb;
        int sent = 0;
        struct scm_cookie scm;
        bool fds_sent = false;
        int data_len;

        wait_for_unix_gc();
        /* Parse the control message into scm; released by scm_destroy() on every exit below. */
        err = scm_send(sock, msg, &scm, false);
        if (err < 0)
                return err;

        err = -EOPNOTSUPP;
        if (msg->msg_flags & MSG_OOB) {
                /* With OOB support built in, reserve the final byte for queue_oob();
                 * an empty OOB send, or OOB without support, fails with -EOPNOTSUPP.
                 */
#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
                if (len)
                        len--;
                else
#endif
                        goto out_err;
        }

        if (msg->msg_namelen) {
                /* A destination address is never accepted on a stream socket. */
                err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
                goto out_err;
        } else {
                err = -ENOTCONN;
                other = unix_peer(sk);
                if (!other)
                        goto out_err;
        }

        if (sk->sk_shutdown & SEND_SHUTDOWN)
                goto pipe_err;

        while (sent < len) {
                size = len - sent;

                /* Keep two messages in the pipe so it schedules better */
                size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

                /* allow fallback to order-0 allocations */
                size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

                /* Bytes beyond the linear head go to page frags, rounded up to
                 * whole pages but never more than the chunk itself.
                 */
                data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

                data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

                skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
                                           msg->msg_flags & MSG_DONTWAIT, &err,
                                           get_order(UNIX_SKB_FRAGS_SZ));
                if (!skb)
                        goto out_err;

                /* Only send the fds in the first buffer */
                err = unix_scm_to_skb(&scm, skb, !fds_sent);
                if (err < 0) {
                        kfree_skb(skb);
                        goto out_err;
                }
                fds_sent = true;

                skb_put(skb, size - data_len);
                skb->data_len = data_len;
                skb->len = size;
                err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
                if (err) {
                        kfree_skb(skb);
                        goto out_err;
                }

                unix_state_lock(other);

                /* Peer gone or no longer receiving: broken pipe. */
                if (sock_flag(other, SOCK_DEAD) ||
                    (other->sk_shutdown & RCV_SHUTDOWN))
                        goto pipe_err_free;

                maybe_add_creds(skb, sock, other);
                scm_stat_add(other, skb);
                skb_queue_tail(&other->sk_receive_queue, skb);
                unix_state_unlock(other);
                other->sk_data_ready(other);
                sent += size;
        }

#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
        if (msg->msg_flags & MSG_OOB) {
                /* Send the byte held back above as out-of-band data. */
                err = queue_oob(sock, msg, other);
                if (err)
                        goto out_err;
                sent++;
        }
#endif

        scm_destroy(&scm);

        return sent;

pipe_err_free:
        unix_state_unlock(other);
        kfree_skb(skb);
pipe_err:
        if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
                send_sig(SIGPIPE, current, 0);
        err = -EPIPE;
out_err:
        scm_destroy(&scm);
        /* A partial send reports the byte count rather than the error. */
        return sent ? : err;
}
2052
2053 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2054                                     int offset, size_t size, int flags)
2055 {
2056         int err;
2057         bool send_sigpipe = false;
2058         bool init_scm = true;
2059         struct scm_cookie scm;
2060         struct sock *other, *sk = socket->sk;
2061         struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2062
2063         if (flags & MSG_OOB)
2064                 return -EOPNOTSUPP;
2065
2066         other = unix_peer(sk);
2067         if (!other || sk->sk_state != TCP_ESTABLISHED)
2068                 return -ENOTCONN;
2069
2070         if (false) {
2071 alloc_skb:
2072                 unix_state_unlock(other);
2073                 mutex_unlock(&unix_sk(other)->iolock);
2074                 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2075                                               &err, 0);
2076                 if (!newskb)
2077                         goto err;
2078         }
2079
2080         /* we must acquire iolock as we modify already present
2081          * skbs in the sk_receive_queue and mess with skb->len
2082          */
2083         err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2084         if (err) {
2085                 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2086                 goto err;
2087         }
2088
2089         if (sk->sk_shutdown & SEND_SHUTDOWN) {
2090                 err = -EPIPE;
2091                 send_sigpipe = true;
2092                 goto err_unlock;
2093         }
2094
2095         unix_state_lock(other);
2096
2097         if (sock_flag(other, SOCK_DEAD) ||
2098             other->sk_shutdown & RCV_SHUTDOWN) {
2099                 err = -EPIPE;
2100                 send_sigpipe = true;
2101                 goto err_state_unlock;
2102         }
2103
2104         if (init_scm) {
2105                 err = maybe_init_creds(&scm, socket, other);
2106                 if (err)
2107                         goto err_state_unlock;
2108                 init_scm = false;
2109         }
2110
2111         skb = skb_peek_tail(&other->sk_receive_queue);
2112         if (tail && tail == skb) {
2113                 skb = newskb;
2114         } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2115                 if (newskb) {
2116                         skb = newskb;
2117                 } else {
2118                         tail = skb;
2119                         goto alloc_skb;
2120                 }
2121         } else if (newskb) {
2122                 /* this is fast path, we don't necessarily need to
2123                  * call to kfree_skb even though with newskb == NULL
2124                  * this - does no harm
2125                  */
2126                 consume_skb(newskb);
2127                 newskb = NULL;
2128         }
2129
2130         if (skb_append_pagefrags(skb, page, offset, size)) {
2131                 tail = skb;
2132                 goto alloc_skb;
2133         }
2134
2135         skb->len += size;
2136         skb->data_len += size;
2137         skb->truesize += size;
2138         refcount_add(size, &sk->sk_wmem_alloc);
2139
2140         if (newskb) {
2141                 err = unix_scm_to_skb(&scm, skb, false);
2142                 if (err)
2143                         goto err_state_unlock;
2144                 spin_lock(&other->sk_receive_queue.lock);
2145                 __skb_queue_tail(&other->sk_receive_queue, newskb);
2146                 spin_unlock(&other->sk_receive_queue.lock);
2147         }
2148
2149         unix_state_unlock(other);
2150         mutex_unlock(&unix_sk(other)->iolock);
2151
2152         other->sk_data_ready(other);
2153         scm_destroy(&scm);
2154         return size;
2155
2156 err_state_unlock:
2157         unix_state_unlock(other);
2158 err_unlock:
2159         mutex_unlock(&unix_sk(other)->iolock);
2160 err:
2161         kfree_skb(newskb);
2162         if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2163                 send_sig(SIGPIPE, current, 0);
2164         if (!init_scm)
2165                 scm_destroy(&scm);
2166         return err;
2167 }
2168
2169 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2170                                   size_t len)
2171 {
2172         int err;
2173         struct sock *sk = sock->sk;
2174
2175         err = sock_error(sk);
2176         if (err)
2177                 return err;
2178
2179         if (sk->sk_state != TCP_ESTABLISHED)
2180                 return -ENOTCONN;
2181
2182         if (msg->msg_namelen)
2183                 msg->msg_namelen = 0;
2184
2185         return unix_dgram_sendmsg(sock, msg, len);
2186 }
2187
2188 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2189                                   size_t size, int flags)
2190 {
2191         struct sock *sk = sock->sk;
2192
2193         if (sk->sk_state != TCP_ESTABLISHED)
2194                 return -ENOTCONN;
2195
2196         return unix_dgram_recvmsg(sock, msg, size, flags);
2197 }
2198
2199 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2200 {
2201         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2202
2203         if (addr) {
2204                 msg->msg_namelen = addr->len;
2205                 memcpy(msg->msg_name, addr->name, addr->len);
2206         }
2207 }
2208
/*
 * Receive one datagram from @sk into @msg.
 *
 * Dequeues (or, with MSG_PEEK, peeks at) one skb under the socket's iolock,
 * copies up to @size bytes starting at the current peek offset, and hands
 * the sender's credentials, security data and any passed file descriptors
 * to scm_recv().
 *
 * Returns the number of bytes copied, or the full remaining datagram length
 * when MSG_TRUNC is set in @flags; a negative errno on failure; 0 for a
 * SEQPACKET socket whose receive side is shut down and whose queue is empty.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
                         int flags)
{
        struct scm_cookie scm;
        struct socket *sock = sk->sk_socket;
        struct unix_sock *u = unix_sk(sk);
        struct sk_buff *skb, *last;
        long timeo;
        int skip;
        int err;

        err = -EOPNOTSUPP;
        if (flags&MSG_OOB)
                goto out;

        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        /* Try to get a datagram; on -EAGAIN wait for more packets while the
         * timeout lasts. Leaving the loop through the first break keeps
         * iolock held (skb != NULL); every other exit has released it.
         */
        do {
                mutex_lock(&u->iolock);

                skip = sk_peek_offset(sk, flags);
                skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
                                              &skip, &err, &last);
                if (skb) {
                        /* A real dequeue removes the skb's fds from the in-flight count. */
                        if (!(flags & MSG_PEEK))
                                scm_stat_del(sk, skb);
                        break;
                }

                mutex_unlock(&u->iolock);

                if (err != -EAGAIN)
                        break;
        } while (timeo &&
                 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
                                              &err, &timeo, last));

        if (!skb) { /* implies iolock unlocked */
                unix_state_lock(sk);
                /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
                if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
                    (sk->sk_shutdown & RCV_SHUTDOWN))
                        err = 0;
                unix_state_unlock(sk);
                goto out;
        }

        /* Wake tasks sleeping on this socket's peer_wait queue. */
        if (wq_has_sleeper(&u->peer_wait))
                wake_up_interruptible_sync_poll(&u->peer_wait,
                                                EPOLLOUT | EPOLLWRNORM |
                                                EPOLLWRBAND);

        /* Report the sender's address if the caller supplied a name buffer. */
        if (msg->msg_name)
                unix_copy_addr(msg, skb->sk);

        /* Clamp to what is left of the datagram; flag truncation if the buffer is smaller. */
        if (size > skb->len - skip)
                size = skb->len - skip;
        else if (size < skb->len - skip)
                msg->msg_flags |= MSG_TRUNC;

        err = skb_copy_datagram_msg(skb, skip, msg, size);
        if (err)
                goto out_free;

        if (sock_flag(sk, SOCK_RCVTSTAMP))
                __sock_recv_timestamp(msg, sk, skb);

        memset(&scm, 0, sizeof(scm));

        scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
        unix_set_secdata(&scm, skb);

        if (!(flags & MSG_PEEK)) {
                /* Real receive: move the passed fds out of the skb into scm. */
                if (UNIXCB(skb).fp)
                        unix_detach_fds(&scm, skb);

                sk_peek_offset_bwd(sk, skb->len);
        } else {
                /* It is questionable: on PEEK we could:
                   - do not return fds - good, but too simple 8)
                   - return fds, and do not return them on read (old strategy,
                     apparently wrong)
                   - clone fds (I chose it for now, it is the most universal
                     solution)

                   POSIX 1003.1g does not actually define this clearly
                   at all. POSIX 1003.1g doesn't define a lot of things
                   clearly however!

                */

                sk_peek_offset_fwd(sk, size);

                if (UNIXCB(skb).fp)
                        unix_peek_fds(&scm, skb);
        }
        /* With MSG_TRUNC the caller gets the remaining datagram length, not the copied size. */
        err = (flags & MSG_TRUNC) ? skb->len - skip : size;

        scm_recv(sock, msg, &scm, flags);

out_free:
        skb_free_datagram(sk, skb);
        mutex_unlock(&u->iolock);
out:
        return err;
}
2315
2316 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2317                               int flags)
2318 {
2319         struct sock *sk = sock->sk;
2320
2321 #ifdef CONFIG_BPF_SYSCALL
2322         if (sk->sk_prot != &unix_proto)
2323                 return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2324                                             flags & ~MSG_DONTWAIT, NULL);
2325 #endif
2326         return __unix_dgram_recvmsg(sk, msg, size, flags);
2327 }
2328
2329 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
2330                           sk_read_actor_t recv_actor)
2331 {
2332         int copied = 0;
2333
2334         while (1) {
2335                 struct unix_sock *u = unix_sk(sk);
2336                 struct sk_buff *skb;
2337                 int used, err;
2338
2339                 mutex_lock(&u->iolock);
2340                 skb = skb_recv_datagram(sk, 0, 1, &err);
2341                 mutex_unlock(&u->iolock);
2342                 if (!skb)
2343                         return err;
2344
2345                 used = recv_actor(desc, skb, 0, skb->len);
2346                 if (used <= 0) {
2347                         if (!copied)
2348                                 copied = used;
2349                         kfree_skb(skb);
2350                         break;
2351                 } else if (used <= skb->len) {
2352                         copied += used;
2353                 }
2354
2355                 kfree_skb(skb);
2356                 if (!desc->count)
2357                         break;
2358         }
2359
2360         return copied;
2361 }
2362
/*
 *      Sleep until more data has arrived. But check for races..
 *
 *      @last and @last_len describe the skb (and its length) the caller last
 *      saw in the receive queue. Under the state lock the wait ends as soon
 *      as the queue tail differs from @last or its length from @last_len, an
 *      error is pending, the receive side is shut down, a signal is pending
 *      or the timeout has run out. @freezable selects the freezable variant
 *      of the timed sleep.
 *
 *      Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
                                  struct sk_buff *last, unsigned int last_len,
                                  bool freezable)
{
        struct sk_buff *tail;
        DEFINE_WAIT(wait);

        unix_state_lock(sk);

        for (;;) {
                /* Register on the wait queue before testing the conditions. */
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

                tail = skb_peek_tail(&sk->sk_receive_queue);
                if (tail != last ||
                    (tail && tail->len != last_len) ||
                    sk->sk_err ||
                    (sk->sk_shutdown & RCV_SHUTDOWN) ||
                    signal_pending(current) ||
                    !timeo)
                        break;

                /* Sleep with the state lock dropped and the async wait-data bit set. */
                sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
                unix_state_unlock(sk);
                if (freezable)
                        timeo = freezable_schedule_timeout(timeo);
                else
                        timeo = schedule_timeout(timeo);
                unix_state_lock(sk);

                /* On a dead socket bail out without clearing the bit. */
                if (sock_flag(sk, SOCK_DEAD))
                        break;

                sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        }

        finish_wait(sk_sleep(sk), &wait);
        unix_state_unlock(sk);
        return timeo;
}
2405
2406 static unsigned int unix_skb_len(const struct sk_buff *skb)
2407 {
2408         return skb->len - UNIXCB(skb).consumed;
2409 }
2410
/*
 * Parameters of one stream read, shared between the generic stream reader
 * and the actor that copies the data out.
 */
struct unix_stream_read_state {
        /* Copy callback, invoked as recv_actor(skb, skip, chunk, state);
         * a negative return is treated as failure by the callers.
         */
        int (*recv_actor)(struct sk_buff *, int, int,
                          struct unix_stream_read_state *);
        struct socket *socket;          /* socket being read from */
        struct msghdr *msg;             /* destination message; may be NULL (callers test it before use) */
        struct pipe_inode_info *pipe;   /* NOTE(review): presumably the splice target - confirm at the users */
        size_t size;                    /* number of bytes requested */
        int flags;                      /* MSG_* receive flags */
        unsigned int splice_flags;      /* NOTE(review): presumably SPLICE_F_* flags - confirm at the users */
};
2421
2422 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2423 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2424 {
2425         struct socket *sock = state->socket;
2426         struct sock *sk = sock->sk;
2427         struct unix_sock *u = unix_sk(sk);
2428         int chunk = 1;
2429
2430         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb)
2431                 return -EINVAL;
2432
2433         chunk = state->recv_actor(u->oob_skb, 0, chunk, state);
2434         if (chunk < 0)
2435                 return -EFAULT;
2436
2437         if (!(state->flags & MSG_PEEK)) {
2438                 UNIXCB(u->oob_skb).consumed += 1;
2439                 kfree_skb(u->oob_skb);
2440                 u->oob_skb = NULL;
2441         }
2442         state->msg->msg_flags |= MSG_OOB;
2443         return 1;
2444 }
2445
/*
 * Decide what the stream reader should do with @skb, the skb at the head
 * of @sk's receive queue, with respect to out-of-band data.
 *
 * @flags are the receive flags and @copied the number of bytes the current
 * read has already copied. Returns the skb to read from next, or NULL when
 * the caller must stop or look at the queue again.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
                                  int flags, int copied)
{
        struct unix_sock *u = unix_sk(sk);

        if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
                /* Fully consumed skb and not peeking: drop it from the queue. */
                skb_unlink(skb, &sk->sk_receive_queue);
                consume_skb(skb);
                skb = NULL;
        } else {
                if (skb == u->oob_skb) {
                        if (copied) {
                                /* Data was already copied: stop in front of the OOB skb. */
                                skb = NULL;
                        } else if (sock_flag(sk, SOCK_URGINLINE)) {
                                /* Inline delivery: the skb is read like normal data;
                                 * unless peeking, forget the OOB marker and drop its
                                 * reference.
                                 */
                                if (!(flags & MSG_PEEK)) {
                                        u->oob_skb = NULL;
                                        consume_skb(skb);
                                }
                        } else if (!(flags & MSG_PEEK)) {
                                /* Not inline: take the OOB skb off the queue and go on
                                 * with whatever is now at the head. u->oob_skb is left
                                 * set here.
                                 */
                                skb_unlink(skb, &sk->sk_receive_queue);
                                consume_skb(skb);
                                skb = skb_peek(&sk->sk_receive_queue);
                        }
                }
        }
        return skb;
}
2473 #endif
2474
2475 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2476                                     bool freezable)
2477 {
2478         struct scm_cookie scm;
2479         struct socket *sock = state->socket;
2480         struct sock *sk = sock->sk;
2481         struct unix_sock *u = unix_sk(sk);
2482         int copied = 0;
2483         int flags = state->flags;
2484         int noblock = flags & MSG_DONTWAIT;
2485         bool check_creds = false;
2486         int target;
2487         int err = 0;
2488         long timeo;
2489         int skip;
2490         size_t size = state->size;
2491         unsigned int last_len;
2492
2493         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2494                 err = -EINVAL;
2495                 goto out;
2496         }
2497
2498         if (unlikely(flags & MSG_OOB)) {
2499                 err = -EOPNOTSUPP;
2500 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2501                 mutex_lock(&u->iolock);
2502                 unix_state_lock(sk);
2503
2504                 err = unix_stream_recv_urg(state);
2505
2506                 unix_state_unlock(sk);
2507                 mutex_unlock(&u->iolock);
2508 #endif
2509                 goto out;
2510         }
2511
2512         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2513         timeo = sock_rcvtimeo(sk, noblock);
2514
2515         memset(&scm, 0, sizeof(scm));
2516
2517         /* Lock the socket to prevent queue disordering
2518          * while sleeps in memcpy_tomsg
2519          */
2520         mutex_lock(&u->iolock);
2521
2522         skip = max(sk_peek_offset(sk, flags), 0);
2523
2524         do {
2525                 int chunk;
2526                 bool drop_skb;
2527                 struct sk_buff *skb, *last;
2528
2529 redo:
2530                 unix_state_lock(sk);
2531                 if (sock_flag(sk, SOCK_DEAD)) {
2532                         err = -ECONNRESET;
2533                         goto unlock;
2534                 }
2535                 last = skb = skb_peek(&sk->sk_receive_queue);
2536                 last_len = last ? last->len : 0;
2537
2538 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2539                 if (skb) {
2540                         skb = manage_oob(skb, sk, flags, copied);
2541                         if (!skb) {
2542                                 unix_state_unlock(sk);
2543                                 if (copied)
2544                                         break;
2545                                 goto redo;
2546                         }
2547                 }
2548 #endif
2549 again:
2550                 if (skb == NULL) {
2551                         if (copied >= target)
2552                                 goto unlock;
2553
2554                         /*
2555                          *      POSIX 1003.1g mandates this order.
2556                          */
2557
2558                         err = sock_error(sk);
2559                         if (err)
2560                                 goto unlock;
2561                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2562                                 goto unlock;
2563
2564                         unix_state_unlock(sk);
2565                         if (!timeo) {
2566                                 err = -EAGAIN;
2567                                 break;
2568                         }
2569
2570                         mutex_unlock(&u->iolock);
2571
2572                         timeo = unix_stream_data_wait(sk, timeo, last,
2573                                                       last_len, freezable);
2574
2575                         if (signal_pending(current)) {
2576                                 err = sock_intr_errno(timeo);
2577                                 scm_destroy(&scm);
2578                                 goto out;
2579                         }
2580
2581                         mutex_lock(&u->iolock);
2582                         goto redo;
2583 unlock:
2584                         unix_state_unlock(sk);
2585                         break;
2586                 }
2587
2588                 while (skip >= unix_skb_len(skb)) {
2589                         skip -= unix_skb_len(skb);
2590                         last = skb;
2591                         last_len = skb->len;
2592                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2593                         if (!skb)
2594                                 goto again;
2595                 }
2596
2597                 unix_state_unlock(sk);
2598
2599                 if (check_creds) {
2600                         /* Never glue messages from different writers */
2601                         if (!unix_skb_scm_eq(skb, &scm))
2602                                 break;
2603                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2604                         /* Copy credentials */
2605                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2606                         unix_set_secdata(&scm, skb);
2607                         check_creds = true;
2608                 }
2609
2610                 /* Copy address just once */
2611                 if (state->msg && state->msg->msg_name) {
2612                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2613                                          state->msg->msg_name);
2614                         unix_copy_addr(state->msg, skb->sk);
2615                         sunaddr = NULL;
2616                 }
2617
2618                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2619                 skb_get(skb);
2620                 chunk = state->recv_actor(skb, skip, chunk, state);
2621                 drop_skb = !unix_skb_len(skb);
2622                 /* skb is only safe to use if !drop_skb */
2623                 consume_skb(skb);
2624                 if (chunk < 0) {
2625                         if (copied == 0)
2626                                 copied = -EFAULT;
2627                         break;
2628                 }
2629                 copied += chunk;
2630                 size -= chunk;
2631
2632                 if (drop_skb) {
2633                         /* the skb was touched by a concurrent reader;
2634                          * we should not expect anything from this skb
2635                          * anymore and assume it invalid - we can be
2636                          * sure it was dropped from the socket queue
2637                          *
2638                          * let's report a short read
2639                          */
2640                         err = 0;
2641                         break;
2642                 }
2643
2644                 /* Mark read part of skb as used */
2645                 if (!(flags & MSG_PEEK)) {
2646                         UNIXCB(skb).consumed += chunk;
2647
2648                         sk_peek_offset_bwd(sk, chunk);
2649
2650                         if (UNIXCB(skb).fp) {
2651                                 scm_stat_del(sk, skb);
2652                                 unix_detach_fds(&scm, skb);
2653                         }
2654
2655                         if (unix_skb_len(skb))
2656                                 break;
2657
2658                         skb_unlink(skb, &sk->sk_receive_queue);
2659                         consume_skb(skb);
2660
2661                         if (scm.fp)
2662                                 break;
2663                 } else {
2664                         /* It is questionable, see note in unix_dgram_recvmsg.
2665                          */
2666                         if (UNIXCB(skb).fp)
2667                                 unix_peek_fds(&scm, skb);
2668
2669                         sk_peek_offset_fwd(sk, chunk);
2670
2671                         if (UNIXCB(skb).fp)
2672                                 break;
2673
2674                         skip = 0;
2675                         last = skb;
2676                         last_len = skb->len;
2677                         unix_state_lock(sk);
2678                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2679                         if (skb)
2680                                 goto again;
2681                         unix_state_unlock(sk);
2682                         break;
2683                 }
2684         } while (size);
2685
2686         mutex_unlock(&u->iolock);
2687         if (state->msg)
2688                 scm_recv(sock, state->msg, &scm, flags);
2689         else
2690                 scm_destroy(&scm);
2691 out:
2692         return copied ? : err;
2693 }
2694
2695 static int unix_stream_read_actor(struct sk_buff *skb,
2696                                   int skip, int chunk,
2697                                   struct unix_stream_read_state *state)
2698 {
2699         int ret;
2700
2701         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2702                                     state->msg, chunk);
2703         return ret ?: chunk;
2704 }
2705
2706 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2707                                size_t size, int flags)
2708 {
2709         struct unix_stream_read_state state = {
2710                 .recv_actor = unix_stream_read_actor,
2711                 .socket = sock,
2712                 .msg = msg,
2713                 .size = size,
2714                 .flags = flags
2715         };
2716
2717         return unix_stream_read_generic(&state, true);
2718 }
2719
2720 static int unix_stream_splice_actor(struct sk_buff *skb,
2721                                     int skip, int chunk,
2722                                     struct unix_stream_read_state *state)
2723 {
2724         return skb_splice_bits(skb, state->socket->sk,
2725                                UNIXCB(skb).consumed + skip,
2726                                state->pipe, chunk, state->splice_flags);
2727 }
2728
2729 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2730                                        struct pipe_inode_info *pipe,
2731                                        size_t size, unsigned int flags)
2732 {
2733         struct unix_stream_read_state state = {
2734                 .recv_actor = unix_stream_splice_actor,
2735                 .socket = sock,
2736                 .pipe = pipe,
2737                 .size = size,
2738                 .splice_flags = flags,
2739         };
2740
2741         if (unlikely(*ppos))
2742                 return -ESPIPE;
2743
2744         if (sock->file->f_flags & O_NONBLOCK ||
2745             flags & SPLICE_F_NONBLOCK)
2746                 state.flags = MSG_DONTWAIT;
2747
2748         return unix_stream_read_generic(&state, false);
2749 }
2750
2751 static int unix_shutdown(struct socket *sock, int mode)
2752 {
2753         struct sock *sk = sock->sk;
2754         struct sock *other;
2755
2756         if (mode < SHUT_RD || mode > SHUT_RDWR)
2757                 return -EINVAL;
2758         /* This maps:
2759          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2760          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2761          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2762          */
2763         ++mode;
2764
2765         unix_state_lock(sk);
2766         sk->sk_shutdown |= mode;
2767         other = unix_peer(sk);
2768         if (other)
2769                 sock_hold(other);
2770         unix_state_unlock(sk);
2771         sk->sk_state_change(sk);
2772
2773         if (other &&
2774                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2775
2776                 int peer_mode = 0;
2777
2778                 if (mode&RCV_SHUTDOWN)
2779                         peer_mode |= SEND_SHUTDOWN;
2780                 if (mode&SEND_SHUTDOWN)
2781                         peer_mode |= RCV_SHUTDOWN;
2782                 unix_state_lock(other);
2783                 other->sk_shutdown |= peer_mode;
2784                 unix_state_unlock(other);
2785                 other->sk_state_change(other);
2786                 if (peer_mode == SHUTDOWN_MASK)
2787                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2788                 else if (peer_mode & RCV_SHUTDOWN)
2789                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2790         }
2791         if (other)
2792                 sock_put(other);
2793
2794         return 0;
2795 }
2796
2797 long unix_inq_len(struct sock *sk)
2798 {
2799         struct sk_buff *skb;
2800         long amount = 0;
2801
2802         if (sk->sk_state == TCP_LISTEN)
2803                 return -EINVAL;
2804
2805         spin_lock(&sk->sk_receive_queue.lock);
2806         if (sk->sk_type == SOCK_STREAM ||
2807             sk->sk_type == SOCK_SEQPACKET) {
2808                 skb_queue_walk(&sk->sk_receive_queue, skb)
2809                         amount += unix_skb_len(skb);
2810         } else {
2811                 skb = skb_peek(&sk->sk_receive_queue);
2812                 if (skb)
2813                         amount = skb->len;
2814         }
2815         spin_unlock(&sk->sk_receive_queue.lock);
2816
2817         return amount;
2818 }
2819 EXPORT_SYMBOL_GPL(unix_inq_len);
2820
/*
 * Report the outgoing queue size of @sk, as given by sk_wmem_alloc_get().
 */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2826
2827 static int unix_open_file(struct sock *sk)
2828 {
2829         struct path path;
2830         struct file *f;
2831         int fd;
2832
2833         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2834                 return -EPERM;
2835
2836         if (!smp_load_acquire(&unix_sk(sk)->addr))
2837                 return -ENOENT;
2838
2839         path = unix_sk(sk)->path;
2840         if (!path.dentry)
2841                 return -ENOENT;
2842
2843         path_get(&path);
2844
2845         fd = get_unused_fd_flags(O_CLOEXEC);
2846         if (fd < 0)
2847                 goto out;
2848
2849         f = dentry_open(&path, O_PATH, current_cred());
2850         if (IS_ERR(f)) {
2851                 put_unused_fd(fd);
2852                 fd = PTR_ERR(f);
2853                 goto out;
2854         }
2855
2856         fd_install(fd, f);
2857 out:
2858         path_put(&path);
2859
2860         return fd;
2861 }
2862
2863 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2864 {
2865         struct sock *sk = sock->sk;
2866         long amount = 0;
2867         int err;
2868
2869         switch (cmd) {
2870         case SIOCOUTQ:
2871                 amount = unix_outq_len(sk);
2872                 err = put_user(amount, (int __user *)arg);
2873                 break;
2874         case SIOCINQ:
2875                 amount = unix_inq_len(sk);
2876                 if (amount < 0)
2877                         err = amount;
2878                 else
2879                         err = put_user(amount, (int __user *)arg);
2880                 break;
2881         case SIOCUNIXFILE:
2882                 err = unix_open_file(sk);
2883                 break;
2884 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2885         case SIOCATMARK:
2886                 {
2887                         struct sk_buff *skb;
2888                         struct unix_sock *u = unix_sk(sk);
2889                         int answ = 0;
2890
2891                         skb = skb_peek(&sk->sk_receive_queue);
2892                         if (skb && skb == u->oob_skb)
2893                                 answ = 1;
2894                         err = put_user(answ, (int __user *)arg);
2895                 }
2896                 break;
2897 #endif
2898         default:
2899                 err = -ENOIOCTLCMD;
2900                 break;
2901         }
2902         return err;
2903 }
2904
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl() entry point: convert @arg with compat_ptr() and forward
 * the request unchanged to unix_ioctl().
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
#endif
2911
2912 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2913 {
2914         struct sock *sk = sock->sk;
2915         __poll_t mask;
2916
2917         sock_poll_wait(file, sock, wait);
2918         mask = 0;
2919
2920         /* exceptional events? */
2921         if (sk->sk_err)
2922                 mask |= EPOLLERR;
2923         if (sk->sk_shutdown == SHUTDOWN_MASK)
2924                 mask |= EPOLLHUP;
2925         if (sk->sk_shutdown & RCV_SHUTDOWN)
2926                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2927
2928         /* readable? */
2929         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2930                 mask |= EPOLLIN | EPOLLRDNORM;
2931
2932         /* Connection-based need to check for termination and startup */
2933         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2934             sk->sk_state == TCP_CLOSE)
2935                 mask |= EPOLLHUP;
2936
2937         /*
2938          * we set writable also when the other side has shut down the
2939          * connection. This prevents stuck sockets.
2940          */
2941         if (unix_writable(sk))
2942                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2943
2944         return mask;
2945 }
2946
2947 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
2948                                     poll_table *wait)
2949 {
2950         struct sock *sk = sock->sk, *other;
2951         unsigned int writable;
2952         __poll_t mask;
2953
2954         sock_poll_wait(file, sock, wait);
2955         mask = 0;
2956
2957         /* exceptional events? */
2958         if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
2959                 mask |= EPOLLERR |
2960                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
2961
2962         if (sk->sk_shutdown & RCV_SHUTDOWN)
2963                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2964         if (sk->sk_shutdown == SHUTDOWN_MASK)
2965                 mask |= EPOLLHUP;
2966
2967         /* readable? */
2968         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2969                 mask |= EPOLLIN | EPOLLRDNORM;
2970
2971         /* Connection-based need to check for termination and startup */
2972         if (sk->sk_type == SOCK_SEQPACKET) {
2973                 if (sk->sk_state == TCP_CLOSE)
2974                         mask |= EPOLLHUP;
2975                 /* connection hasn't started yet? */
2976                 if (sk->sk_state == TCP_SYN_SENT)
2977                         return mask;
2978         }
2979
2980         /* No write status requested, avoid expensive OUT tests. */
2981         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
2982                 return mask;
2983
2984         writable = unix_writable(sk);
2985         if (writable) {
2986                 unix_state_lock(sk);
2987
2988                 other = unix_peer(sk);
2989                 if (other && unix_peer(other) != sk &&
2990                     unix_recvq_full(other) &&
2991                     unix_dgram_peer_wake_me(sk, other))
2992                         writable = 0;
2993
2994                 unix_state_unlock(sk);
2995         }
2996
2997         if (writable)
2998                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2999         else
3000                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3001
3002         return mask;
3003 }
3004
3005 #ifdef CONFIG_PROC_FS
3006
/*
 * The seq_file position used by the iterator below packs two values into
 * one number: the hash-table bucket index in the bits above BUCKET_SPACE and
 * an offset within that bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index from a packed position. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Extract the in-bucket offset from a packed position. */
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
/* Build a packed position from bucket index @b and in-bucket offset @o. */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3012
3013 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3014 {
3015         unsigned long offset = get_offset(*pos);
3016         unsigned long bucket = get_bucket(*pos);
3017         struct sock *sk;
3018         unsigned long count = 0;
3019
3020         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
3021                 if (sock_net(sk) != seq_file_net(seq))
3022                         continue;
3023                 if (++count == offset)
3024                         break;
3025         }
3026
3027         return sk;
3028 }
3029
/*
 * Return the next socket to show for @seq, or NULL when every bucket has
 * been visited.
 *
 * @sk is the previously returned element: a real socket, SEQ_START_TOKEN,
 * or NULL.  *@pos is the packed bucket/offset position and is advanced to
 * the first slot of the following bucket each time a bucket is exhausted.
 */
static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	/*
	 * Only entered for a real socket (anything above SEQ_START_TOKEN):
	 * keep following the current hash chain until a socket of our network
	 * namespace is found or the chain ends.
	 */
	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	/*
	 * No socket in hand (NULL or SEQ_START_TOKEN), or the chain ran out:
	 * resolve *pos through unix_from_bucket(), moving on to the next
	 * bucket while the lookup comes back empty.
	 */
	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		/* Restart at offset 1 of the following bucket. */
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}
3056
3057 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3058         __acquires(unix_table_lock)
3059 {
3060         spin_lock(&unix_table_lock);
3061
3062         if (!*pos)
3063                 return SEQ_START_TOKEN;
3064
3065         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
3066                 return NULL;
3067
3068         return unix_next_socket(seq, NULL, pos);
3069 }
3070
3071 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3072 {
3073         ++*pos;
3074         return unix_next_socket(seq, v, pos);
3075 }
3076
/*
 * seq_file ->stop: drop unix_table_lock, which unix_seq_start() acquired.
 * Neither @seq nor @v is used.
 */
static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}
3082
3083 static int unix_seq_show(struct seq_file *seq, void *v)
3084 {
3085
3086         if (v == SEQ_START_TOKEN)
3087                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3088                          "Inode Path\n");
3089         else {
3090                 struct sock *s = v;
3091                 struct unix_sock *u = unix_sk(s);
3092                 unix_state_lock(s);
3093
3094                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3095                         s,
3096                         refcount_read(&s->sk_refcnt),
3097                         0,
3098                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3099                         s->sk_type,
3100                         s->sk_socket ?
3101                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3102                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3103                         sock_i_ino(s));
3104
3105                 if (u->addr) {  // under unix_table_lock here
3106                         int i, len;
3107                         seq_putc(seq, ' ');
3108
3109                         i = 0;
3110                         len = u->addr->len - sizeof(short);
3111                         if (!UNIX_ABSTRACT(s))
3112                                 len--;
3113                         else {
3114                                 seq_putc(seq, '@');
3115                                 i++;
3116                         }
3117                         for ( ; i < len; i++)
3118                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3119                                          '@');
3120                 }
3121                 unix_state_unlock(s);
3122                 seq_putc(seq, '\n');
3123         }
3124
3125         return 0;
3126 }
3127
/*
 * Iterator callbacks for the "unix" proc file registered in unix_net_init().
 * ->start takes unix_table_lock and ->stop releases it.
 */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
3134 #endif
3135
/*
 * Protocol family descriptor for PF_UNIX; passed to sock_register() in
 * af_unix_init().  New sockets of this family are created by unix_create().
 */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner  = THIS_MODULE,
};
3141
3142
3143 static int __net_init unix_net_init(struct net *net)
3144 {
3145         int error = -ENOMEM;
3146
3147         net->unx.sysctl_max_dgram_qlen = 10;
3148         if (unix_sysctl_register(net))
3149                 goto out;
3150
3151 #ifdef CONFIG_PROC_FS
3152         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3153                         sizeof(struct seq_net_private))) {
3154                 unix_sysctl_unregister(net);
3155                 goto out;
3156         }
3157 #endif
3158         error = 0;
3159 out:
3160         return error;
3161 }
3162
/*
 * Per-network-namespace teardown: unregister the namespace's sysctls and
 * remove its "unix" proc entry.
 */
static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
3168
/*
 * Per-network-namespace hooks; registered with register_pernet_subsys() in
 * af_unix_init() and removed in af_unix_exit().
 */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
3173
3174 static int __init af_unix_init(void)
3175 {
3176         int rc = -1;
3177
3178         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3179
3180         rc = proto_register(&unix_proto, 1);
3181         if (rc != 0) {
3182                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3183                 goto out;
3184         }
3185
3186         sock_register(&unix_family_ops);
3187         register_pernet_subsys(&unix_net_ops);
3188         unix_bpf_build_proto();
3189 out:
3190         return rc;
3191 }
3192
/*
 * Module exit: undo the registrations made by af_unix_init() by
 * unregistering the PF_UNIX family, the protocol and the
 * per-network-namespace operations.
 */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
3199
3200 /* Earlier than device_initcall() so that other drivers invoking
3201    request_module() don't end up in a loop when modprobe tries
3202    to use a UNIX socket. But later than subsys_initcall() because
3203    we depend on stuff initialised there */
3204 fs_initcall(af_unix_init);
3205 module_exit(af_unix_exit);
3206
3207 MODULE_LICENSE("GPL");
3208 MODULE_ALIAS_NETPROTO(PF_UNIX);