1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
122 #include <linux/uaccess.h>
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
143 #include <trace/events/sock.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
149 #include <linux/ethtool.h>
153 static DEFINE_MUTEX(proto_list_mutex);
154 static LIST_HEAD(proto_list);
156 static void sock_def_write_space_wfree(struct sock *sk);
157 static void sock_def_write_space(struct sock *sk);
160 * sk_ns_capable - General socket capability test
161 * @sk: Socket to use a capability on or through
162 * @user_ns: The user namespace of the capability to use
163 * @cap: The capability to use
165 * Test to see if the opener of the socket had the capability @cap in the
166 * user namespace @user_ns when the socket was created, and whether the
167 * current process has it there as well.
169 bool sk_ns_capable(const struct sock *sk,
170 struct user_namespace *user_ns, int cap)
172 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173 ns_capable(user_ns, cap);
175 EXPORT_SYMBOL(sk_ns_capable);
178 * sk_capable - Socket global capability test
179 * @sk: Socket to use a capability on or through
180 * @cap: The global capability to use
182 * Test to see if the opener of the socket had the capability @cap when the
183 * socket was created, and whether the current process has it in all user namespaces.
186 bool sk_capable(const struct sock *sk, int cap)
188 return sk_ns_capable(sk, &init_user_ns, cap);
190 EXPORT_SYMBOL(sk_capable);
193 * sk_net_capable - Network namespace socket capability test
194 * @sk: Socket to use a capability on or through
195 * @cap: The capability to use
197 * Test to see if the opener of the socket had the capability @cap over the
198 * network namespace the socket is a member of when the socket was created,
199 * and whether the current process has it there as well.
201 bool sk_net_capable(const struct sock *sk, int cap)
203 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
205 EXPORT_SYMBOL(sk_net_capable);
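/* Illustrative use (hypothetical caller): a protocol's privileged option
 * handler might gate the request on both the socket opener's and the current
 * task's capabilities, e.g.:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 */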
208 * Each address family might have different locking rules, so we have
209 * one slock key per address family and separate keys for internal and
212 static struct lock_class_key af_family_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_keys[AF_MAX];
214 static struct lock_class_key af_family_slock_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218 * Make lock validator output more readable. (we pre-construct these
219 * strings build-time, so that runtime initialization of socket
223 #define _sock_locks(x) \
224 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
225 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
226 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
227 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
228 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
229 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
230 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
231 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
232 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
233 x "27" , x "28" , x "AF_CAN" , \
234 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
235 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
236 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
237 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
238 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
242 static const char *const af_family_key_strings[AF_MAX+1] = {
243 _sock_locks("sk_lock-")
245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246 _sock_locks("slock-")
248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249 _sock_locks("clock-")
252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253 _sock_locks("k-sk_lock-")
255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256 _sock_locks("k-slock-")
258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259 _sock_locks("k-clock-")
261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 _sock_locks("rlock-")
264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265 _sock_locks("wlock-")
267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268 _sock_locks("elock-")
272 * sk_callback_lock and sk queues locking rules are per-address-family,
273 * so split the lock classes by using a per-AF key:
275 static struct lock_class_key af_callback_keys[AF_MAX];
276 static struct lock_class_key af_rlock_keys[AF_MAX];
277 static struct lock_class_key af_wlock_keys[AF_MAX];
278 static struct lock_class_key af_elock_keys[AF_MAX];
279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
281 /* Run time adjustable parameters. */
282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283 EXPORT_SYMBOL(sysctl_wmem_max);
284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285 EXPORT_SYMBOL(sysctl_rmem_max);
286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
289 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
290 EXPORT_SYMBOL_GPL(memalloc_socks_key);
293 * sk_set_memalloc - sets %SOCK_MEMALLOC
294 * @sk: socket to set it on
296 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297 * It's the responsibility of the admin to adjust min_free_kbytes
298 * to meet the requirements
300 void sk_set_memalloc(struct sock *sk)
302 sock_set_flag(sk, SOCK_MEMALLOC);
303 sk->sk_allocation |= __GFP_MEMALLOC;
304 static_branch_inc(&memalloc_socks_key);
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
308 void sk_clear_memalloc(struct sock *sk)
310 sock_reset_flag(sk, SOCK_MEMALLOC);
311 sk->sk_allocation &= ~__GFP_MEMALLOC;
312 static_branch_dec(&memalloc_socks_key);
315 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 * it has rmem allocations due to the last swapfile being deactivated
318 * but there is a risk that the socket is unusable due to exceeding
319 * the rmem limits. Reclaim the reserves and obey rmem limits again.
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328 unsigned int noreclaim_flag;
330 /* these should have been dropped before queueing */
331 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
333 noreclaim_flag = memalloc_noreclaim_save();
334 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
338 memalloc_noreclaim_restore(noreclaim_flag);
342 EXPORT_SYMBOL(__sk_backlog_rcv);
344 void sk_error_report(struct sock *sk)
346 sk->sk_error_report(sk);
348 switch (sk->sk_family) {
352 trace_inet_sk_error_report(sk);
358 EXPORT_SYMBOL(sk_error_report);
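/* Convert a socket timeout in jiffies into the {sec, usec} representation
 * expected by getsockopt(), honouring the old/compat timeval layouts, and
 * write it to @optval. Returns the number of bytes written.
 */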
360 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
362 struct __kernel_sock_timeval tv;
364 if (timeo == MAX_SCHEDULE_TIMEOUT) {
368 tv.tv_sec = timeo / HZ;
369 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
373 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
374 *(struct old_timeval32 *)optval = tv32;
379 struct __kernel_old_timeval old_tv;
380 old_tv.tv_sec = tv.tv_sec;
381 old_tv.tv_usec = tv.tv_usec;
382 *(struct __kernel_old_timeval *)optval = old_tv;
383 return sizeof(old_tv);
386 *(struct __kernel_sock_timeval *)optval = tv;
389 EXPORT_SYMBOL(sock_get_timeout);
391 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
392 sockptr_t optval, int optlen, bool old_timeval)
394 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
395 struct old_timeval32 tv32;
397 if (optlen < sizeof(tv32))
400 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
402 tv->tv_sec = tv32.tv_sec;
403 tv->tv_usec = tv32.tv_usec;
404 } else if (old_timeval) {
405 struct __kernel_old_timeval old_tv;
407 if (optlen < sizeof(old_tv))
409 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
411 tv->tv_sec = old_tv.tv_sec;
412 tv->tv_usec = old_tv.tv_usec;
414 if (optlen < sizeof(*tv))
416 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
422 EXPORT_SYMBOL(sock_copy_user_timeval);
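/* Parse a user supplied SO_RCVTIMEO/SO_SNDTIMEO value and store the
 * equivalent timeout in jiffies in *timeo_p. A zero timeval means "wait
 * forever" (MAX_SCHEDULE_TIMEOUT); a negative tv_sec is treated as zero
 * and triggers a rate-limited warning.
 */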
424 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427 struct __kernel_sock_timeval tv;
428 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
434 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
438 static int warned __read_mostly;
440 WRITE_ONCE(*timeo_p, 0);
441 if (warned < 10 && net_ratelimit()) {
443 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 __func__, current->comm, task_pid_nr(current));
448 val = MAX_SCHEDULE_TIMEOUT;
449 if ((tv.tv_sec || tv.tv_usec) &&
450 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
451 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
453 WRITE_ONCE(*timeo_p, val);
457 static bool sock_needs_netstamp(const struct sock *sk)
459 switch (sk->sk_family) {
468 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
470 if (sk->sk_flags & flags) {
471 sk->sk_flags &= ~flags;
472 if (sock_needs_netstamp(sk) &&
473 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
474 net_disable_timestamp();
479 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
482 struct sk_buff_head *list = &sk->sk_receive_queue;
484 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
485 atomic_inc(&sk->sk_drops);
486 trace_sock_rcvqueue_full(sk, skb);
490 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
491 atomic_inc(&sk->sk_drops);
496 skb_set_owner_r(skb, sk);
498 /* we escape from the RCU protected region, make sure we don't leak
503 spin_lock_irqsave(&list->lock, flags);
504 sock_skb_set_dropcount(sk, skb);
505 __skb_queue_tail(list, skb);
506 spin_unlock_irqrestore(&list->lock, flags);
508 if (!sock_flag(sk, SOCK_DEAD))
509 sk->sk_data_ready(sk);
512 EXPORT_SYMBOL(__sock_queue_rcv_skb);
514 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
515 enum skb_drop_reason *reason)
517 enum skb_drop_reason drop_reason;
520 err = sk_filter(sk, skb);
522 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
525 err = __sock_queue_rcv_skb(sk, skb);
528 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
531 drop_reason = SKB_DROP_REASON_PROTO_MEM;
534 drop_reason = SKB_NOT_DROPPED_YET;
539 *reason = drop_reason;
542 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
544 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
545 const int nested, unsigned int trim_cap, bool refcounted)
547 int rc = NET_RX_SUCCESS;
549 if (sk_filter_trim_cap(sk, skb, trim_cap))
550 goto discard_and_relse;
554 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
555 atomic_inc(&sk->sk_drops);
556 goto discard_and_relse;
559 bh_lock_sock_nested(sk);
562 if (!sock_owned_by_user(sk)) {
564 * trylock + unlock semantics:
566 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
568 rc = sk_backlog_rcv(sk, skb);
570 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
571 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
573 atomic_inc(&sk->sk_drops);
574 goto discard_and_relse;
586 EXPORT_SYMBOL(__sk_receive_skb);
588 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
590 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
592 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
594 struct dst_entry *dst = __sk_dst_get(sk);
596 if (dst && dst->obsolete &&
597 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
598 dst, cookie) == NULL) {
599 sk_tx_queue_clear(sk);
600 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
601 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
608 EXPORT_SYMBOL(__sk_dst_check);
610 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
612 struct dst_entry *dst = sk_dst_get(sk);
614 if (dst && dst->obsolete &&
615 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
616 dst, cookie) == NULL) {
624 EXPORT_SYMBOL(sk_dst_check);
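/* Bind the socket to the interface with the given index (0 unbinds).
 * Changing an existing binding requires CAP_NET_RAW in the socket's network
 * namespace. Caller holds the socket lock; the protocol is rehashed so that
 * lookups observe the new binding.
 */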
626 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
628 int ret = -ENOPROTOOPT;
629 #ifdef CONFIG_NETDEVICES
630 struct net *net = sock_net(sk);
634 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
641 /* Paired with all READ_ONCE() done locklessly. */
642 WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
644 if (sk->sk_prot->rehash)
645 sk->sk_prot->rehash(sk);
656 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
662 ret = sock_bindtoindex_locked(sk, ifindex);
668 EXPORT_SYMBOL(sock_bindtoindex);
670 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
672 int ret = -ENOPROTOOPT;
673 #ifdef CONFIG_NETDEVICES
674 struct net *net = sock_net(sk);
675 char devname[IFNAMSIZ];
682 /* Bind this socket to a particular device like "eth0",
683 * as specified in the passed interface name. If the
684 * name is "" or the option length is zero the socket
687 if (optlen > IFNAMSIZ - 1)
688 optlen = IFNAMSIZ - 1;
689 memset(devname, 0, sizeof(devname));
692 if (copy_from_sockptr(devname, optval, optlen))
696 if (devname[0] != '\0') {
697 struct net_device *dev;
700 dev = dev_get_by_name_rcu(net, devname);
702 index = dev->ifindex;
709 sockopt_lock_sock(sk);
710 ret = sock_bindtoindex_locked(sk, index);
711 sockopt_release_sock(sk);
718 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
719 sockptr_t optlen, int len)
721 int ret = -ENOPROTOOPT;
722 #ifdef CONFIG_NETDEVICES
723 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
724 struct net *net = sock_net(sk);
725 char devname[IFNAMSIZ];
727 if (bound_dev_if == 0) {
736 ret = netdev_get_name(net, devname, bound_dev_if);
740 len = strlen(devname) + 1;
743 if (copy_to_sockptr(optval, devname, len))
748 if (copy_to_sockptr(optlen, &len, sizeof(int)))
759 bool sk_mc_loop(const struct sock *sk)
761 if (dev_recursion_level())
765 /* IPV6_ADDRFORM can change sk->sk_family under us. */
766 switch (READ_ONCE(sk->sk_family)) {
768 return inet_test_bit(MC_LOOP, sk);
769 #if IS_ENABLED(CONFIG_IPV6)
771 return inet6_test_bit(MC6_LOOP, sk);
777 EXPORT_SYMBOL(sk_mc_loop);
779 void sock_set_reuseaddr(struct sock *sk)
782 sk->sk_reuse = SK_CAN_REUSE;
785 EXPORT_SYMBOL(sock_set_reuseaddr);
787 void sock_set_reuseport(struct sock *sk)
790 sk->sk_reuseport = true;
793 EXPORT_SYMBOL(sock_set_reuseport);
795 void sock_no_linger(struct sock *sk)
798 WRITE_ONCE(sk->sk_lingertime, 0);
799 sock_set_flag(sk, SOCK_LINGER);
802 EXPORT_SYMBOL(sock_no_linger);
804 void sock_set_priority(struct sock *sk, u32 priority)
806 WRITE_ONCE(sk->sk_priority, priority);
808 EXPORT_SYMBOL(sock_set_priority);
810 void sock_set_sndtimeo(struct sock *sk, s64 secs)
813 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
814 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
816 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
819 EXPORT_SYMBOL(sock_set_sndtimeo);
821 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
823 sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
824 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
826 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
827 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
831 void sock_enable_timestamps(struct sock *sk)
834 __sock_set_timestamps(sk, true, false, true);
837 EXPORT_SYMBOL(sock_enable_timestamps);
839 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
842 case SO_TIMESTAMP_OLD:
843 __sock_set_timestamps(sk, valbool, false, false);
845 case SO_TIMESTAMP_NEW:
846 __sock_set_timestamps(sk, valbool, true, false);
848 case SO_TIMESTAMPNS_OLD:
849 __sock_set_timestamps(sk, valbool, false, true);
851 case SO_TIMESTAMPNS_NEW:
852 __sock_set_timestamps(sk, valbool, true, true);
857 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
859 struct net *net = sock_net(sk);
860 struct net_device *dev = NULL;
865 if (sk->sk_bound_dev_if)
866 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
869 pr_err("%s: socket not bound to a device\n", __func__);
873 num = ethtool_get_phc_vclocks(dev, &vclock_index);
876 for (i = 0; i < num; i++) {
877 if (*(vclock_index + i) == phc_index) {
889 WRITE_ONCE(sk->sk_bind_phc, phc_index);
894 int sock_set_timestamping(struct sock *sk, int optname,
895 struct so_timestamping timestamping)
897 int val = timestamping.flags;
900 if (val & ~SOF_TIMESTAMPING_MASK)
903 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
904 !(val & SOF_TIMESTAMPING_OPT_ID))
907 if (val & SOF_TIMESTAMPING_OPT_ID &&
908 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
910 if ((1 << sk->sk_state) &
911 (TCPF_CLOSE | TCPF_LISTEN))
913 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
914 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
916 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
918 atomic_set(&sk->sk_tskey, 0);
922 if (val & SOF_TIMESTAMPING_OPT_STATS &&
923 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
926 if (val & SOF_TIMESTAMPING_BIND_PHC) {
927 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
932 WRITE_ONCE(sk->sk_tsflags, val);
933 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
935 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
936 sock_enable_timestamp(sk,
937 SOCK_TIMESTAMPING_RX_SOFTWARE);
939 sock_disable_timestamp(sk,
940 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
944 void sock_set_keepalive(struct sock *sk)
947 if (sk->sk_prot->keepalive)
948 sk->sk_prot->keepalive(sk, true);
949 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
952 EXPORT_SYMBOL(sock_set_keepalive);
954 static void __sock_set_rcvbuf(struct sock *sk, int val)
956 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
957 * as a negative value.
959 val = min_t(int, val, INT_MAX / 2);
960 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
962 /* We double it on the way in to account for "struct sk_buff" etc.
963 * overhead. Applications assume that the SO_RCVBUF setting they make
964 * will allow that much actual data to be received on that socket.
966 * Applications are unaware that "struct sk_buff" and other overheads
967 * allocate from the receive buffer during socket buffer allocation.
969 * And after considering the possible alternatives, returning the value
970 * we actually used in getsockopt is the most desirable behavior.
972 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
975 void sock_set_rcvbuf(struct sock *sk, int val)
978 __sock_set_rcvbuf(sk, val);
981 EXPORT_SYMBOL(sock_set_rcvbuf);
983 static void __sock_set_mark(struct sock *sk, u32 val)
985 if (val != sk->sk_mark) {
986 WRITE_ONCE(sk->sk_mark, val);
991 void sock_set_mark(struct sock *sk, u32 val)
994 __sock_set_mark(sk, val);
997 EXPORT_SYMBOL(sock_set_mark);
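/* Shrink the socket's SO_RESERVE_MEM reservation; the amount released is
 * rounded down to a whole number of pages.
 */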
999 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1001 /* Round down bytes to multiple of pages */
1002 bytes = round_down(bytes, PAGE_SIZE);
1004 WARN_ON(bytes > sk->sk_reserved_mem);
1005 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1009 static int sock_reserve_memory(struct sock *sk, int bytes)
1015 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1021 pages = sk_mem_pages(bytes);
1023 /* pre-charge to memcg */
1024 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1025 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1029 /* pre-charge to forward_alloc */
1030 sk_memory_allocated_add(sk, pages);
1031 allocated = sk_memory_allocated(sk);
1032 /* If the system goes into memory pressure with this
1033 * precharge, give up and return error.
1035 if (allocated > sk_prot_mem_limits(sk, 1)) {
1036 sk_memory_allocated_sub(sk, pages);
1037 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1040 sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1042 WRITE_ONCE(sk->sk_reserved_mem,
1043 sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1048 #ifdef CONFIG_PAGE_POOL
1050 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1051 * in one syscall. The limit exists to bound the amount of memory the kernel
1052 * allocates to copy these tokens, and to prevent looping over the frags for
1055 #define MAX_DONTNEED_TOKENS 128
1056 #define MAX_DONTNEED_FRAGS 1024
1058 static noinline_for_stack int
1059 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1061 unsigned int num_tokens, i, j, k, netmem_num = 0;
1062 struct dmabuf_token *tokens;
1063 int ret = 0, num_frags = 0;
1064 netmem_ref netmems[16];
1069 if (optlen % sizeof(*tokens) ||
1070 optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1073 num_tokens = optlen / sizeof(*tokens);
1074 tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1078 if (copy_from_sockptr(tokens, optval, optlen)) {
1083 xa_lock_bh(&sk->sk_user_frags);
1084 for (i = 0; i < num_tokens; i++) {
1085 for (j = 0; j < tokens[i].token_count; j++) {
1086 if (++num_frags > MAX_DONTNEED_FRAGS)
1087 goto frag_limit_reached;
1089 netmem_ref netmem = (__force netmem_ref)__xa_erase(
1090 &sk->sk_user_frags, tokens[i].token_start + j);
1092 if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1095 netmems[netmem_num++] = netmem;
1096 if (netmem_num == ARRAY_SIZE(netmems)) {
1097 xa_unlock_bh(&sk->sk_user_frags);
1098 for (k = 0; k < netmem_num; k++)
1099 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1101 xa_lock_bh(&sk->sk_user_frags);
1108 xa_unlock_bh(&sk->sk_user_frags);
1109 for (k = 0; k < netmem_num; k++)
1110 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1117 void sockopt_lock_sock(struct sock *sk)
1119 /* When current->bpf_ctx is set, the setsockopt is called from
1120 * a bpf prog. bpf has ensured the sk lock has been
1121 * acquired before calling setsockopt().
1123 if (has_current_bpf_ctx())
1128 EXPORT_SYMBOL(sockopt_lock_sock);
1130 void sockopt_release_sock(struct sock *sk)
1132 if (has_current_bpf_ctx())
1137 EXPORT_SYMBOL(sockopt_release_sock);
1139 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1141 return has_current_bpf_ctx() || ns_capable(ns, cap);
1143 EXPORT_SYMBOL(sockopt_ns_capable);
1145 bool sockopt_capable(int cap)
1147 return has_current_bpf_ctx() || capable(cap);
1149 EXPORT_SYMBOL(sockopt_capable);
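/* SO_TXTIME only accepts a small set of well-known clock ids. */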
1151 static int sockopt_validate_clockid(__kernel_clockid_t value)
1154 case CLOCK_REALTIME:
1155 case CLOCK_MONOTONIC:
1163 * This is meant for all protocols to use and covers goings on
1164 * at the socket level. Everything here is generic.
1167 int sk_setsockopt(struct sock *sk, int level, int optname,
1168 sockptr_t optval, unsigned int optlen)
1170 struct so_timestamping timestamping;
1171 struct socket *sock = sk->sk_socket;
1172 struct sock_txtime sk_txtime;
1179 * Options without arguments
1182 if (optname == SO_BINDTODEVICE)
1183 return sock_setbindtodevice(sk, optval, optlen);
1185 if (optlen < sizeof(int))
1188 if (copy_from_sockptr(&val, optval, sizeof(val)))
1191 valbool = val ? 1 : 0;
1193 /* handle options which do not require locking the socket. */
1196 if ((val >= 0 && val <= 6) ||
1197 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1198 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1199 sock_set_priority(sk, val);
1204 assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1207 assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1210 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1216 return -ENOPROTOOPT;
1217 #ifdef CONFIG_NET_RX_BUSY_POLL
1221 WRITE_ONCE(sk->sk_ll_usec, val);
1223 case SO_PREFER_BUSY_POLL:
1224 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1226 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1228 case SO_BUSY_POLL_BUDGET:
1229 if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1230 !sockopt_capable(CAP_NET_ADMIN))
1232 if (val < 0 || val > U16_MAX)
1234 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1237 case SO_MAX_PACING_RATE:
1239 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1240 unsigned long pacing_rate;
1242 if (sizeof(ulval) != sizeof(val) &&
1243 optlen >= sizeof(ulval) &&
1244 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1248 cmpxchg(&sk->sk_pacing_status,
1251 /* Pairs with READ_ONCE() from sk_getsockopt() */
1252 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1253 pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1254 if (ulval < pacing_rate)
1255 WRITE_ONCE(sk->sk_pacing_rate, ulval);
1259 if (val < -1 || val > 1)
1261 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1262 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1263 /* Paired with READ_ONCE() in tcp_rtx_synack()
1264 * and sk_getsockopt().
1266 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1270 int (*set_peek_off)(struct sock *sk, int val);
1272 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1274 ret = set_peek_off(sk, val);
1279 #ifdef CONFIG_PAGE_POOL
1280 case SO_DEVMEM_DONTNEED:
1281 return sock_devmem_dontneed(sk, optval, optlen);
1285 sockopt_lock_sock(sk);
1289 if (val && !sockopt_capable(CAP_NET_ADMIN))
1292 sock_valbool_flag(sk, SOCK_DBG, valbool);
1295 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1298 if (valbool && !sk_is_inet(sk))
1301 sk->sk_reuseport = valbool;
1304 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1308 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1311 /* Don't error on this; BSD doesn't, and if you think
1312 * about it, this is right. Otherwise apps have to
1313 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1314 * are treated in BSD as hints.
1316 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1318 /* Ensure val * 2 fits into an int, to prevent max_t()
1319 * from treating it as a negative value.
1321 val = min_t(int, val, INT_MAX / 2);
1322 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1323 WRITE_ONCE(sk->sk_sndbuf,
1324 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1325 /* Wake up sending tasks if we upped the value. */
1326 sk->sk_write_space(sk);
1329 case SO_SNDBUFFORCE:
1330 if (!sockopt_capable(CAP_NET_ADMIN)) {
1335 /* No negative values (to prevent underflow, as val will be
1343 /* Don't error on this; BSD doesn't, and if you think
1344 * about it, this is right. Otherwise apps have to
1345 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1346 * are treated in BSD as hints.
1348 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1351 case SO_RCVBUFFORCE:
1352 if (!sockopt_capable(CAP_NET_ADMIN)) {
1357 /* No negative values (to prevent underflow, as val will be
1360 __sock_set_rcvbuf(sk, max(val, 0));
1364 if (sk->sk_prot->keepalive)
1365 sk->sk_prot->keepalive(sk, valbool);
1366 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1370 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1374 sk->sk_no_check_tx = valbool;
1378 if (optlen < sizeof(ling)) {
1379 ret = -EINVAL; /* 1003.1g */
1382 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1386 if (!ling.l_onoff) {
1387 sock_reset_flag(sk, SOCK_LINGER);
1389 unsigned long t_sec = ling.l_linger;
1391 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1392 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1394 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1395 sock_set_flag(sk, SOCK_LINGER);
1402 case SO_TIMESTAMP_OLD:
1403 case SO_TIMESTAMP_NEW:
1404 case SO_TIMESTAMPNS_OLD:
1405 case SO_TIMESTAMPNS_NEW:
1406 sock_set_timestamp(sk, optname, valbool);
1409 case SO_TIMESTAMPING_NEW:
1410 case SO_TIMESTAMPING_OLD:
1411 if (optlen == sizeof(timestamping)) {
1412 if (copy_from_sockptr(&timestamping, optval,
1413 sizeof(timestamping))) {
1418 memset(&timestamping, 0, sizeof(timestamping));
1419 timestamping.flags = val;
1421 ret = sock_set_timestamping(sk, optname, timestamping);
1426 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1431 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1433 ret = set_rcvlowat(sk, val);
1435 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1438 case SO_RCVTIMEO_OLD:
1439 case SO_RCVTIMEO_NEW:
1440 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1441 optlen, optname == SO_RCVTIMEO_OLD);
1444 case SO_SNDTIMEO_OLD:
1445 case SO_SNDTIMEO_NEW:
1446 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1447 optlen, optname == SO_SNDTIMEO_OLD);
1450 case SO_ATTACH_FILTER: {
1451 struct sock_fprog fprog;
1453 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1455 ret = sk_attach_filter(&fprog, sk);
1460 if (optlen == sizeof(u32)) {
1464 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1467 ret = sk_attach_bpf(ufd, sk);
1471 case SO_ATTACH_REUSEPORT_CBPF: {
1472 struct sock_fprog fprog;
1474 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1476 ret = sk_reuseport_attach_filter(&fprog, sk);
1479 case SO_ATTACH_REUSEPORT_EBPF:
1481 if (optlen == sizeof(u32)) {
1485 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1488 ret = sk_reuseport_attach_bpf(ufd, sk);
1492 case SO_DETACH_REUSEPORT_BPF:
1493 ret = reuseport_detach_prog(sk);
1496 case SO_DETACH_FILTER:
1497 ret = sk_detach_filter(sk);
1500 case SO_LOCK_FILTER:
1501 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1504 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1508 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1509 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1514 __sock_set_mark(sk, val);
1517 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1521 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1524 case SO_WIFI_STATUS:
1525 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1529 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1532 case SO_SELECT_ERR_QUEUE:
1533 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1537 case SO_INCOMING_CPU:
1538 reuseport_update_incoming_cpu(sk, val);
1543 dst_negative_advice(sk);
1547 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1548 if (!(sk_is_tcp(sk) ||
1549 (sk->sk_type == SOCK_DGRAM &&
1550 sk->sk_protocol == IPPROTO_UDP)))
1552 } else if (sk->sk_family != PF_RDS) {
1556 if (val < 0 || val > 1)
1559 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1564 if (optlen != sizeof(struct sock_txtime)) {
1567 } else if (copy_from_sockptr(&sk_txtime, optval,
1568 sizeof(struct sock_txtime))) {
1571 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1575 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1576 * scheduler has enough safeguards.
1578 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1579 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1584 ret = sockopt_validate_clockid(sk_txtime.clockid);
1588 sock_valbool_flag(sk, SOCK_TXTIME, true);
1589 sk->sk_clockid = sk_txtime.clockid;
1590 sk->sk_txtime_deadline_mode =
1591 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1592 sk->sk_txtime_report_errors =
1593 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1596 case SO_BINDTOIFINDEX:
1597 ret = sock_bindtoindex_locked(sk, val);
1601 if (val & ~SOCK_BUF_LOCK_MASK) {
1605 sk->sk_userlocks = val | (sk->sk_userlocks &
1606 ~SOCK_BUF_LOCK_MASK);
1609 case SO_RESERVE_MEM:
1618 delta = val - sk->sk_reserved_mem;
1620 sock_release_reserved_memory(sk, -delta);
1622 ret = sock_reserve_memory(sk, delta);
1630 sockopt_release_sock(sk);
1634 int sock_setsockopt(struct socket *sock, int level, int optname,
1635 sockptr_t optval, unsigned int optlen)
1637 return sk_setsockopt(sock->sk, level, optname,
1640 EXPORT_SYMBOL(sock_setsockopt);
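/* Take a reference on the peer's credentials under sk_peer_lock so they can
 * be examined (e.g. for SO_PEERGROUPS) after the lock is dropped.
 */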
1642 static const struct cred *sk_get_peer_cred(struct sock *sk)
1644 const struct cred *cred;
1646 spin_lock(&sk->sk_peer_lock);
1647 cred = get_cred(sk->sk_peer_cred);
1648 spin_unlock(&sk->sk_peer_lock);
1653 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1654 struct ucred *ucred)
1656 ucred->pid = pid_vnr(pid);
1657 ucred->uid = ucred->gid = -1;
1659 struct user_namespace *current_ns = current_user_ns();
1661 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1662 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1666 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1668 struct user_namespace *user_ns = current_user_ns();
1671 for (i = 0; i < src->ngroups; i++) {
1672 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1674 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
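/* Socket level option retrieval for getsockopt(); the generic counterpart
 * of sk_setsockopt() above.
 */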
1681 int sk_getsockopt(struct sock *sk, int level, int optname,
1682 sockptr_t optval, sockptr_t optlen)
1684 struct socket *sock = sk->sk_socket;
1689 unsigned long ulval;
1691 struct old_timeval32 tm32;
1692 struct __kernel_old_timeval tm;
1693 struct __kernel_sock_timeval stm;
1694 struct sock_txtime txtime;
1695 struct so_timestamping timestamping;
1698 int lv = sizeof(int);
1701 if (copy_from_sockptr(&len, optlen, sizeof(int)))
1706 memset(&v, 0, sizeof(v));
1710 v.val = sock_flag(sk, SOCK_DBG);
1714 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1718 v.val = sock_flag(sk, SOCK_BROADCAST);
1722 v.val = READ_ONCE(sk->sk_sndbuf);
1726 v.val = READ_ONCE(sk->sk_rcvbuf);
1730 v.val = sk->sk_reuse;
1734 v.val = sk->sk_reuseport;
1738 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1742 v.val = sk->sk_type;
1746 v.val = sk->sk_protocol;
1750 v.val = sk->sk_family;
1754 v.val = -sock_error(sk);
1756 v.val = xchg(&sk->sk_err_soft, 0);
1760 v.val = sock_flag(sk, SOCK_URGINLINE);
1764 v.val = sk->sk_no_check_tx;
1768 v.val = READ_ONCE(sk->sk_priority);
1772 lv = sizeof(v.ling);
1773 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1774 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1780 case SO_TIMESTAMP_OLD:
1781 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1782 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1783 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1786 case SO_TIMESTAMPNS_OLD:
1787 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1790 case SO_TIMESTAMP_NEW:
1791 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1794 case SO_TIMESTAMPNS_NEW:
1795 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1798 case SO_TIMESTAMPING_OLD:
1799 case SO_TIMESTAMPING_NEW:
1800 lv = sizeof(v.timestamping);
1801 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1802 * returning the flags when they were set through the same option.
1803 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1805 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1806 v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1807 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1811 case SO_RCVTIMEO_OLD:
1812 case SO_RCVTIMEO_NEW:
1813 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1814 SO_RCVTIMEO_OLD == optname);
1817 case SO_SNDTIMEO_OLD:
1818 case SO_SNDTIMEO_NEW:
1819 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1820 SO_SNDTIMEO_OLD == optname);
1824 v.val = READ_ONCE(sk->sk_rcvlowat);
1832 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1836 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1841 struct ucred peercred;
1842 if (len > sizeof(peercred))
1843 len = sizeof(peercred);
1845 spin_lock(&sk->sk_peer_lock);
1846 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1847 spin_unlock(&sk->sk_peer_lock);
1849 if (copy_to_sockptr(optval, &peercred, len))
1856 struct pid *peer_pid;
1857 struct file *pidfd_file = NULL;
1860 if (len > sizeof(pidfd))
1861 len = sizeof(pidfd);
1863 spin_lock(&sk->sk_peer_lock);
1864 peer_pid = get_pid(sk->sk_peer_pid);
1865 spin_unlock(&sk->sk_peer_lock);
1870 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1875 if (copy_to_sockptr(optval, &pidfd, len) ||
1876 copy_to_sockptr(optlen, &len, sizeof(int))) {
1877 put_unused_fd(pidfd);
1883 fd_install(pidfd, pidfd_file);
1889 const struct cred *cred;
1892 cred = sk_get_peer_cred(sk);
1896 n = cred->group_info->ngroups;
1897 if (len < n * sizeof(gid_t)) {
1898 len = n * sizeof(gid_t);
1900 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1902 len = n * sizeof(gid_t);
1904 ret = groups_to_user(optval, cred->group_info);
1913 struct sockaddr_storage address;
1915 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1920 if (copy_to_sockptr(optval, &address, len))
1925 /* Dubious BSD thing... Probably nobody even uses it, but
1926 * the UNIX standard wants it for whatever reason... -DaveM
1929 v.val = sk->sk_state == TCP_LISTEN;
1933 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1937 return security_socket_getpeersec_stream(sock,
1938 optval, optlen, len);
1941 v.val = READ_ONCE(sk->sk_mark);
1945 v.val = sock_flag(sk, SOCK_RCVMARK);
1949 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1952 case SO_WIFI_STATUS:
1953 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1957 if (!READ_ONCE(sock->ops)->set_peek_off)
1960 v.val = READ_ONCE(sk->sk_peek_off);
1963 v.val = sock_flag(sk, SOCK_NOFCS);
1966 case SO_BINDTODEVICE:
1967 return sock_getbindtodevice(sk, optval, optlen, len);
1970 len = sk_get_filter(sk, optval, len);
1976 case SO_LOCK_FILTER:
1977 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1980 case SO_BPF_EXTENSIONS:
1981 v.val = bpf_tell_extensions();
1984 case SO_SELECT_ERR_QUEUE:
1985 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1988 #ifdef CONFIG_NET_RX_BUSY_POLL
1990 v.val = READ_ONCE(sk->sk_ll_usec);
1992 case SO_PREFER_BUSY_POLL:
1993 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1997 case SO_MAX_PACING_RATE:
1998 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1999 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2000 lv = sizeof(v.ulval);
2001 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2004 v.val = min_t(unsigned long, ~0U,
2005 READ_ONCE(sk->sk_max_pacing_rate));
2009 case SO_INCOMING_CPU:
2010 v.val = READ_ONCE(sk->sk_incoming_cpu);
2015 u32 meminfo[SK_MEMINFO_VARS];
2017 sk_get_meminfo(sk, meminfo);
2019 len = min_t(unsigned int, len, sizeof(meminfo));
2020 if (copy_to_sockptr(optval, &meminfo, len))
2026 #ifdef CONFIG_NET_RX_BUSY_POLL
2027 case SO_INCOMING_NAPI_ID:
2028 v.val = READ_ONCE(sk->sk_napi_id);
2030 /* aggregate non-NAPI IDs down to 0 */
2031 if (v.val < MIN_NAPI_ID)
2041 v.val64 = sock_gen_cookie(sk);
2045 v.val = sock_flag(sk, SOCK_ZEROCOPY);
2049 lv = sizeof(v.txtime);
2050 v.txtime.clockid = sk->sk_clockid;
2051 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2052 SOF_TXTIME_DEADLINE_MODE : 0;
2053 v.txtime.flags |= sk->sk_txtime_report_errors ?
2054 SOF_TXTIME_REPORT_ERRORS : 0;
2057 case SO_BINDTOIFINDEX:
2058 v.val = READ_ONCE(sk->sk_bound_dev_if);
2061 case SO_NETNS_COOKIE:
2065 v.val64 = sock_net(sk)->net_cookie;
2069 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2072 case SO_RESERVE_MEM:
2073 v.val = READ_ONCE(sk->sk_reserved_mem);
2077 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2078 v.val = READ_ONCE(sk->sk_txrehash);
2082 /* We implement the SO_SNDLOWAT etc to not be settable
2085 return -ENOPROTOOPT;
2090 if (copy_to_sockptr(optval, &v, len))
2093 if (copy_to_sockptr(optlen, &len, sizeof(int)))
2099 * Initialize an sk_lock.
2101 * (We also register the sk_lock with the lock validator.)
2103 static inline void sock_lock_init(struct sock *sk)
2105 if (sk->sk_kern_sock)
2106 sock_lock_init_class_and_name(
2108 af_family_kern_slock_key_strings[sk->sk_family],
2109 af_family_kern_slock_keys + sk->sk_family,
2110 af_family_kern_key_strings[sk->sk_family],
2111 af_family_kern_keys + sk->sk_family);
2113 sock_lock_init_class_and_name(
2115 af_family_slock_key_strings[sk->sk_family],
2116 af_family_slock_keys + sk->sk_family,
2117 af_family_key_strings[sk->sk_family],
2118 af_family_keys + sk->sk_family);
2122 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2123 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2124 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2126 static void sock_copy(struct sock *nsk, const struct sock *osk)
2128 const struct proto *prot = READ_ONCE(osk->sk_prot);
2129 #ifdef CONFIG_SECURITY_NETWORK
2130 void *sptr = nsk->sk_security;
2133 /* If we move sk_tx_queue_mapping out of the private section,
2134 * we must check if sk_tx_queue_clear() is called after
2135 * sock_copy() in sk_clone_lock().
2137 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2138 offsetof(struct sock, sk_dontcopy_begin) ||
2139 offsetof(struct sock, sk_tx_queue_mapping) >=
2140 offsetof(struct sock, sk_dontcopy_end));
2142 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2144 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2145 prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2146 /* alloc is larger than struct, see sk_prot_alloc() */);
2148 #ifdef CONFIG_SECURITY_NETWORK
2149 nsk->sk_security = sptr;
2150 security_sk_clone(osk, nsk);
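/* Allocate a struct sock, preferably from the protocol's dedicated slab
 * cache, falling back to kmalloc() when no cache is provided, and run the
 * LSM allocation hook.
 */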
2154 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2158 struct kmem_cache *slab;
2162 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2165 if (want_init_on_alloc(priority))
2166 sk_prot_clear_nulls(sk, prot->obj_size);
2168 sk = kmalloc(prot->obj_size, priority);
2171 if (security_sk_alloc(sk, family, priority))
2174 if (!try_module_get(prot->owner))
2181 security_sk_free(sk);
2184 kmem_cache_free(slab, sk);
2190 static void sk_prot_free(struct proto *prot, struct sock *sk)
2192 struct kmem_cache *slab;
2193 struct module *owner;
2195 owner = prot->owner;
2198 cgroup_sk_free(&sk->sk_cgrp_data);
2199 mem_cgroup_sk_free(sk);
2200 security_sk_free(sk);
2202 kmem_cache_free(slab, sk);
2209 * sk_alloc - All socket objects are allocated here
2210 * @net: the applicable net namespace
2211 * @family: protocol family
2212 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2213 * @prot: struct proto associated with this new sock instance
2214 * @kern: is this to be a kernel socket?
2216 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2217 struct proto *prot, int kern)
2221 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2223 sk->sk_family = family;
2225 * See comment in struct sock definition to understand
2226 * why we need sk_prot_creator -acme
2228 sk->sk_prot = sk->sk_prot_creator = prot;
2229 sk->sk_kern_sock = kern;
2231 sk->sk_net_refcnt = kern ? 0 : 1;
2232 if (likely(sk->sk_net_refcnt)) {
2233 get_net_track(net, &sk->ns_tracker, priority);
2234 sock_inuse_add(net, 1);
2236 __netns_tracker_alloc(net, &sk->ns_tracker,
2240 sock_net_set(sk, net);
2241 refcount_set(&sk->sk_wmem_alloc, 1);
2243 mem_cgroup_sk_alloc(sk);
2244 cgroup_sk_alloc(&sk->sk_cgrp_data);
2245 sock_update_classid(&sk->sk_cgrp_data);
2246 sock_update_netprioidx(&sk->sk_cgrp_data);
2247 sk_tx_queue_clear(sk);
2252 EXPORT_SYMBOL(sk_alloc);
2254 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2255 * grace period. This is the case for UDP sockets and TCP listeners.
2257 static void __sk_destruct(struct rcu_head *head)
2259 struct sock *sk = container_of(head, struct sock, sk_rcu);
2260 struct sk_filter *filter;
2262 if (sk->sk_destruct)
2263 sk->sk_destruct(sk);
2265 filter = rcu_dereference_check(sk->sk_filter,
2266 refcount_read(&sk->sk_wmem_alloc) == 0);
2268 sk_filter_uncharge(sk, filter);
2269 RCU_INIT_POINTER(sk->sk_filter, NULL);
2272 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2274 #ifdef CONFIG_BPF_SYSCALL
2275 bpf_sk_storage_free(sk);
2278 if (atomic_read(&sk->sk_omem_alloc))
2279 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2280 __func__, atomic_read(&sk->sk_omem_alloc));
2282 if (sk->sk_frag.page) {
2283 put_page(sk->sk_frag.page);
2284 sk->sk_frag.page = NULL;
2287 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2288 put_cred(sk->sk_peer_cred);
2289 put_pid(sk->sk_peer_pid);
2291 if (likely(sk->sk_net_refcnt))
2292 put_net_track(sock_net(sk), &sk->ns_tracker);
2294 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2296 sk_prot_free(sk->sk_prot_creator, sk);
2299 void sk_destruct(struct sock *sk)
2301 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2303 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2304 reuseport_detach_sock(sk);
2305 use_call_rcu = true;
2309 call_rcu(&sk->sk_rcu, __sk_destruct);
2311 __sk_destruct(&sk->sk_rcu);
2314 static void __sk_free(struct sock *sk)
2316 if (likely(sk->sk_net_refcnt))
2317 sock_inuse_add(sock_net(sk), -1);
2319 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2320 sock_diag_broadcast_destroy(sk);
2325 void sk_free(struct sock *sk)
2328 * We subtract one from sk_wmem_alloc and can know if
2329 * some packets are still in some tx queue.
2330 * If not null, sock_wfree() will call __sk_free(sk) later
2332 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2335 EXPORT_SYMBOL(sk_free);
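/* Initialize the queues and lockdep classes common to newly allocated and
 * cloned sockets.
 */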
2337 static void sk_init_common(struct sock *sk)
2339 skb_queue_head_init(&sk->sk_receive_queue);
2340 skb_queue_head_init(&sk->sk_write_queue);
2341 skb_queue_head_init(&sk->sk_error_queue);
2343 rwlock_init(&sk->sk_callback_lock);
2344 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2345 af_rlock_keys + sk->sk_family,
2346 af_family_rlock_key_strings[sk->sk_family]);
2347 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2348 af_wlock_keys + sk->sk_family,
2349 af_family_wlock_key_strings[sk->sk_family]);
2350 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2351 af_elock_keys + sk->sk_family,
2352 af_family_elock_key_strings[sk->sk_family]);
2353 if (sk->sk_kern_sock)
2354 lockdep_set_class_and_name(&sk->sk_callback_lock,
2355 af_kern_callback_keys + sk->sk_family,
2356 af_family_kern_clock_key_strings[sk->sk_family]);
2358 lockdep_set_class_and_name(&sk->sk_callback_lock,
2359 af_callback_keys + sk->sk_family,
2360 af_family_clock_key_strings[sk->sk_family]);
2364 * sk_clone_lock - clone a socket, and lock its clone
2365 * @sk: the socket to clone
2366 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2368 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2370 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2372 struct proto *prot = READ_ONCE(sk->sk_prot);
2373 struct sk_filter *filter;
2374 bool is_charged = true;
2377 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2381 sock_copy(newsk, sk);
2383 newsk->sk_prot_creator = prot;
2386 if (likely(newsk->sk_net_refcnt)) {
2387 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2388 sock_inuse_add(sock_net(newsk), 1);
2390 /* Kernel sockets are not elevating the struct net refcount.
2391 * Instead, use a tracker to more easily detect if a layer
2392 * is not properly dismantling its kernel sockets at netns
2395 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2398 sk_node_init(&newsk->sk_node);
2399 sock_lock_init(newsk);
2400 bh_lock_sock(newsk);
2401 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2402 newsk->sk_backlog.len = 0;
2404 atomic_set(&newsk->sk_rmem_alloc, 0);
2406 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2407 refcount_set(&newsk->sk_wmem_alloc, 1);
2409 atomic_set(&newsk->sk_omem_alloc, 0);
2410 sk_init_common(newsk);
2412 newsk->sk_dst_cache = NULL;
2413 newsk->sk_dst_pending_confirm = 0;
2414 newsk->sk_wmem_queued = 0;
2415 newsk->sk_forward_alloc = 0;
2416 newsk->sk_reserved_mem = 0;
2417 atomic_set(&newsk->sk_drops, 0);
2418 newsk->sk_send_head = NULL;
2419 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2420 atomic_set(&newsk->sk_zckey, 0);
2422 sock_reset_flag(newsk, SOCK_DONE);
2424 /* sk->sk_memcg will be populated at accept() time */
2425 newsk->sk_memcg = NULL;
2427 cgroup_sk_clone(&newsk->sk_cgrp_data);
2430 filter = rcu_dereference(sk->sk_filter);
2432 /* though it's an empty new sock, the charging may fail
2433 * if sysctl_optmem_max was changed between creation of
2434 * original socket and cloning
2436 is_charged = sk_filter_charge(newsk, filter);
2437 RCU_INIT_POINTER(newsk->sk_filter, filter);
2440 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2441 /* We need to make sure that we don't uncharge the new
2442 * socket if we couldn't charge it in the first place
2443 * as otherwise we uncharge the parent's filter.
2446 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2447 sk_free_unlock_clone(newsk);
2451 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2453 if (bpf_sk_storage_clone(sk, newsk)) {
2454 sk_free_unlock_clone(newsk);
2459 /* Clear sk_user_data if parent had the pointer tagged
2460 * as not suitable for copying when cloning.
2462 if (sk_user_data_is_nocopy(newsk))
2463 newsk->sk_user_data = NULL;
2466 newsk->sk_err_soft = 0;
2467 newsk->sk_priority = 0;
2468 newsk->sk_incoming_cpu = raw_smp_processor_id();
2470 /* Before updating sk_refcnt, we must commit prior changes to memory
2471 * (Documentation/RCU/rculist_nulls.rst for details)
2474 refcount_set(&newsk->sk_refcnt, 2);
2476 sk_set_socket(newsk, NULL);
2477 sk_tx_queue_clear(newsk);
2478 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2480 if (newsk->sk_prot->sockets_allocated)
2481 sk_sockets_allocated_inc(newsk);
2483 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2484 net_enable_timestamp();
2488 EXPORT_SYMBOL_GPL(sk_clone_lock);
2490 void sk_free_unlock_clone(struct sock *sk)
2492 /* It is still a raw copy of the parent, so invalidate
2493 * the destructor and do a plain sk_free() */
2494 sk->sk_destruct = NULL;
2498 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
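/* Derive the socket's GSO size limit from the output device, using the
 * IPv4 or IPv6 specific device limit and capping non-TCP sockets at the
 * legacy limit.
 */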
2500 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2502 bool is_ipv6 = false;
2505 #if IS_ENABLED(CONFIG_IPV6)
2506 is_ipv6 = (sk->sk_family == AF_INET6 &&
2507 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2509 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2510 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2511 READ_ONCE(dst->dev->gso_ipv4_max_size);
2512 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2513 max_size = GSO_LEGACY_MAX_SIZE;
2515 return max_size - (MAX_TCP_HEADER + 1);
2518 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2522 sk->sk_route_caps = dst->dev->features;
2524 sk->sk_route_caps |= NETIF_F_GSO;
2525 if (sk->sk_route_caps & NETIF_F_GSO)
2526 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2527 if (unlikely(sk->sk_gso_disabled))
2528 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2529 if (sk_can_gso(sk)) {
2530 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2531 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2533 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2534 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2535 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2536 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2539 sk->sk_gso_max_segs = max_segs;
2540 sk_dst_set(sk, dst);
2542 EXPORT_SYMBOL_GPL(sk_setup_caps);
2545 * Simple resource managers for sockets.
2550 * Write buffer destructor automatically called from kfree_skb.
2552 void sock_wfree(struct sk_buff *skb)
2554 struct sock *sk = skb->sk;
2555 unsigned int len = skb->truesize;
2558 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2559 if (sock_flag(sk, SOCK_RCU_FREE) &&
2560 sk->sk_write_space == sock_def_write_space) {
2562 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2563 sock_def_write_space_wfree(sk);
2571 * Keep a reference on sk_wmem_alloc, this will be released
2572 * after sk_write_space() call
2574 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2575 sk->sk_write_space(sk);
2579 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2580 * could not do because of in-flight packets
2582 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2585 EXPORT_SYMBOL(sock_wfree);
2587 /* This variant of sock_wfree() is used by TCP,
2588 * since it sets SOCK_USE_WRITE_QUEUE.
2590 void __sock_wfree(struct sk_buff *skb)
2592 struct sock *sk = skb->sk;
2594 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2598 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2602 if (unlikely(!sk_fullsock(sk)))
2603 return skb_set_owner_edemux(skb, sk);
2606 skb->destructor = sock_wfree;
2607 skb_set_hash_from_sk(skb, sk);
2609 * We used to take a refcount on sk, but the following operation
2610 * is enough to guarantee sk_free() won't free this sock until
2611 * all in-flight packets are completed.
2613 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2615 EXPORT_SYMBOL(skb_set_owner_w);
2617 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2619 /* Drivers depend on in-order delivery for crypto offload,
2620 * partial orphan breaks out-of-order-OK logic.
2622 if (skb_is_decrypted(skb))
2625 return (skb->destructor == sock_wfree ||
2626 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2629 /* This helper is used by netem, as it can hold packets in its
2630 * delay queue. We want to allow the owner socket to send more
2631 * packets, as if they were already TX completed by a typical driver.
2632 * But we also want to keep skb->sk set because some packet schedulers
2633 * rely on it (sch_fq for example).
2635 void skb_orphan_partial(struct sk_buff *skb)
2637 if (skb_is_tcp_pure_ack(skb))
2640 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2645 EXPORT_SYMBOL(skb_orphan_partial);
2648 * Read buffer destructor automatically called from kfree_skb.
2650 void sock_rfree(struct sk_buff *skb)
2652 struct sock *sk = skb->sk;
2653 unsigned int len = skb->truesize;
2655 atomic_sub(len, &sk->sk_rmem_alloc);
2656 sk_mem_uncharge(sk, len);
2658 EXPORT_SYMBOL(sock_rfree);
2661 * Buffer destructor for skbs that are not used directly in read or write
2662 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2664 void sock_efree(struct sk_buff *skb)
2668 EXPORT_SYMBOL(sock_efree);
2670 /* Buffer destructor for prefetch/receive path where reference count may
2671 * not be held, e.g. for listen sockets.
2674 void sock_pfree(struct sk_buff *skb)
2676 struct sock *sk = skb->sk;
2678 if (!sk_is_refcounted(sk))
2681 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2682 inet_reqsk(sk)->rsk_listener = NULL;
2683 reqsk_free(inet_reqsk(sk));
2689 EXPORT_SYMBOL(sock_pfree);
2690 #endif /* CONFIG_INET */
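/* Return the owning uid of the socket's inode, or GLOBAL_ROOT_UID for an
 * orphaned socket; sk_callback_lock protects the sk_socket pointer.
 */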
2692 kuid_t sock_i_uid(struct sock *sk)
2696 read_lock_bh(&sk->sk_callback_lock);
2697 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2698 read_unlock_bh(&sk->sk_callback_lock);
2701 EXPORT_SYMBOL(sock_i_uid);
2703 unsigned long __sock_i_ino(struct sock *sk)
2707 read_lock(&sk->sk_callback_lock);
2708 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2709 read_unlock(&sk->sk_callback_lock);
2712 EXPORT_SYMBOL(__sock_i_ino);
2714 unsigned long sock_i_ino(struct sock *sk)
2719 ino = __sock_i_ino(sk);
2723 EXPORT_SYMBOL(sock_i_ino);
2726 * Allocate an skb from the socket's send buffer.
2728 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2732 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2733 struct sk_buff *skb = alloc_skb(size, priority);
2736 skb_set_owner_w(skb, sk);
2742 EXPORT_SYMBOL(sock_wmalloc);
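/* Destructor for skbs allocated with sock_omalloc(): returns the truesize
 * charge to sk_omem_alloc.
 */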
2744 static void sock_ofree(struct sk_buff *skb)
2746 struct sock *sk = skb->sk;
2748 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2751 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2754 struct sk_buff *skb;
2756 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2757 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2758 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2761 skb = alloc_skb(size, priority);
2765 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2767 skb->destructor = sock_ofree;
2772 * Allocate a memory block from the socket's option memory buffer.
2774 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2776 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2778 if ((unsigned int)size <= optmem_max &&
2779 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2781 /* First do the add, to avoid the race if kmalloc might sleep. */
2784 atomic_add(size, &sk->sk_omem_alloc);
2785 mem = kmalloc(size, priority);
2788 atomic_sub(size, &sk->sk_omem_alloc);
2792 EXPORT_SYMBOL(sock_kmalloc);
2794 /* Free an option memory block. Note, we actually want the inline
2795 * here as this allows gcc to detect the nullify and fold away the
2796 * condition entirely.
2798 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2801 if (WARN_ON_ONCE(!mem))
2804 kfree_sensitive(mem);
2807 atomic_sub(size, &sk->sk_omem_alloc);
2810 void sock_kfree_s(struct sock *sk, void *mem, int size)
2812 __sock_kfree_s(sk, mem, size, false);
2814 EXPORT_SYMBOL(sock_kfree_s);
2816 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2818 __sock_kfree_s(sk, mem, size, true);
2820 EXPORT_SYMBOL(sock_kzfree_s);
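/* Illustrative sketch (not part of the original file): pairing sock_kmalloc()
 * with sock_kfree_s() so the allocation stays charged to sk_omem_alloc and
 * bounded by optmem_max.  example_store_option() is a hypothetical name.
 */
#if 0
static int example_store_option(struct sock *sk, const void *src, int len)
{
	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;
	memcpy(buf, src, len);
	/* ... use the buffer ... */
	sock_kfree_s(sk, buf, len);	/* sock_kzfree_s() if it held key material */
	return 0;
}
#endif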
2822 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2823 I think these locks should be removed for datagram sockets.
2825 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2829 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2833 if (signal_pending(current))
2835 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2836 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2837 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2839 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2841 if (READ_ONCE(sk->sk_err))
2843 timeo = schedule_timeout(timeo);
2845 finish_wait(sk_sleep(sk), &wait);
2851 * Generic send/receive buffer handlers
2854 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2855 unsigned long data_len, int noblock,
2856 int *errcode, int max_page_order)
2858 struct sk_buff *skb;
2862 timeo = sock_sndtimeo(sk, noblock);
2864 err = sock_error(sk);
2869 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2872 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2875 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2876 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2880 if (signal_pending(current))
2882 timeo = sock_wait_for_wmem(sk, timeo);
2884 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2885 errcode, sk->sk_allocation);
2887 skb_set_owner_w(skb, sk);
2891 err = sock_intr_errno(timeo);
2896 EXPORT_SYMBOL(sock_alloc_send_pskb);
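/* Parse a single SOL_SOCKET-level control message into the sockcm_cookie
 * used by the transmit path (packet mark, TX timestamping flags, transmit
 * time, timestamp key); SCM_RIGHTS/SCM_CREDENTIALS are accepted here but
 * handled by the SCM code.
 */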
2898 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2899 struct sockcm_cookie *sockc)
2903 BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
2905 switch (cmsg->cmsg_type) {
2907 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2908 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2910 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2912 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2914 case SO_TIMESTAMPING_OLD:
2915 case SO_TIMESTAMPING_NEW:
2916 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2919 tsflags = *(u32 *)CMSG_DATA(cmsg);
2920 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2923 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2924 sockc->tsflags |= tsflags;
2927 if (!sock_flag(sk, SOCK_TXTIME))
2929 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2931 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2936 tsflags = READ_ONCE(sk->sk_tsflags);
2937 if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
2939 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2941 sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
2942 sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
2944 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2946 case SCM_CREDENTIALS:
2953 EXPORT_SYMBOL(__sock_cmsg_send);
2955 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2956 struct sockcm_cookie *sockc)
2958 struct cmsghdr *cmsg;
2961 for_each_cmsghdr(cmsg, msg) {
2962 if (!CMSG_OK(msg, cmsg))
2964 if (cmsg->cmsg_level != SOL_SOCKET)
2966 ret = __sock_cmsg_send(sk, cmsg, sockc);
2972 EXPORT_SYMBOL(sock_cmsg_send);
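/* Illustrative sketch (not part of the original file): how a protocol's
 * sendmsg() typically consumes SOL_SOCKET control messages through
 * sock_cmsg_send().  example_sendmsg() is a hypothetical name.
 */
#if 0
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sockcm_cookie sockc = { .tsflags = READ_ONCE(sk->sk_tsflags) };
	int err;

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}
	/* ... use sockc.mark, sockc.tsflags, sockc.transmit_time when building skbs ... */
	return 0;
}
#endif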
2974 static void sk_enter_memory_pressure(struct sock *sk)
2976 if (!sk->sk_prot->enter_memory_pressure)
2979 sk->sk_prot->enter_memory_pressure(sk);
2982 static void sk_leave_memory_pressure(struct sock *sk)
2984 if (sk->sk_prot->leave_memory_pressure) {
2985 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2986 tcp_leave_memory_pressure, sk);
2988 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2990 if (memory_pressure && READ_ONCE(*memory_pressure))
2991 WRITE_ONCE(*memory_pressure, 0);
2995 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2998 * skb_page_frag_refill - check that a page_frag contains enough room
2999 * @sz: minimum size of the fragment we want to get
3000 * @pfrag: pointer to page_frag
3001 * @gfp: priority for memory allocation
3003 * Note: While this allocator tries to use high order pages, there is
3004 * no guarantee that allocations succeed. Therefore, @sz MUST be
3005 * less than or equal to PAGE_SIZE.
3007 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3010 if (page_ref_count(pfrag->page) == 1) {
3014 if (pfrag->offset + sz <= pfrag->size)
3016 put_page(pfrag->page);
3020 if (SKB_FRAG_PAGE_ORDER &&
3021 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3022 /* Avoid direct reclaim but allow kswapd to wake */
3023 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3024 __GFP_COMP | __GFP_NOWARN |
3026 SKB_FRAG_PAGE_ORDER);
3027 if (likely(pfrag->page)) {
3028 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3032 pfrag->page = alloc_page(gfp);
3033 if (likely(pfrag->page)) {
3034 pfrag->size = PAGE_SIZE;
3039 EXPORT_SYMBOL(skb_page_frag_refill);
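/* Socket-aware wrapper around skb_page_frag_refill(): on allocation failure
 * enter memory pressure and moderate the send buffer so the caller backs off.
 */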
3041 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3043 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3046 sk_enter_memory_pressure(sk);
3047 sk_stream_moderate_sndbuf(sk);
3050 EXPORT_SYMBOL(sk_page_frag_refill);
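/* Slow path of lock_sock(): sleep until the socket is no longer owned by a
 * user context, dropping sk_lock.slock while waiting.
 */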
3052 void __lock_sock(struct sock *sk)
3053 __releases(&sk->sk_lock.slock)
3054 __acquires(&sk->sk_lock.slock)
3059 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3060 TASK_UNINTERRUPTIBLE);
3061 spin_unlock_bh(&sk->sk_lock.slock);
3063 spin_lock_bh(&sk->sk_lock.slock);
3064 if (!sock_owned_by_user(sk))
3067 finish_wait(&sk->sk_lock.wq, &wait);
3070 void __release_sock(struct sock *sk)
3071 __releases(&sk->sk_lock.slock)
3072 __acquires(&sk->sk_lock.slock)
3074 struct sk_buff *skb, *next;
3076 while ((skb = sk->sk_backlog.head) != NULL) {
3077 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3079 spin_unlock_bh(&sk->sk_lock.slock);
3084 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3085 skb_mark_not_on_list(skb);
3086 sk_backlog_rcv(sk, skb);
3091 } while (skb != NULL);
3093 spin_lock_bh(&sk->sk_lock.slock);
3097 * Doing the zeroing here guarantees we cannot loop forever
3098 * while a wild producer attempts to flood us.
3100 sk->sk_backlog.len = 0;
3103 void __sk_flush_backlog(struct sock *sk)
3105 spin_lock_bh(&sk->sk_lock.slock);
3108 if (sk->sk_prot->release_cb)
3109 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3110 tcp_release_cb, sk);
3112 spin_unlock_bh(&sk->sk_lock.slock);
3114 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3117 * sk_wait_data - wait for data to arrive at sk_receive_queue
3118 * @sk: sock to wait on
3119 * @timeo: for how long
3120 * @skb: last skb seen on sk_receive_queue
3122 * Now the socket state, including sk->sk_err, is changed only under the lock,
3123 * hence we may omit checks after joining the wait queue.
3124 * We check the receive queue before schedule() only as an optimization;
3125 * it is very likely that release_sock() added new data.
3127 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3129 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3132 add_wait_queue(sk_sleep(sk), &wait);
3133 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3134 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3135 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3136 remove_wait_queue(sk_sleep(sk), &wait);
3139 EXPORT_SYMBOL(sk_wait_data);
3142 * __sk_mem_raise_allocated - increase memory_allocated
3144 * @size: memory size to allocate
3145 * @amt: pages to allocate
3146 * @kind: allocation type
3148 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3150 * Unlike the globally shared limits among the sockets under the same protocol,
3151 * consuming the budget of a memcg won't have a direct effect on other ones.
3152 * So be optimistic about the memcg's tolerance, and leave it to the callers to decide
3153 * whether or not to raise allocated through sk_under_memory_pressure() or its variants.
3156 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3158 struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3159 struct proto *prot = sk->sk_prot;
3160 bool charged = false;
3163 sk_memory_allocated_add(sk, amt);
3164 allocated = sk_memory_allocated(sk);
3167 if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3168 goto suppress_allocation;
3173 if (allocated <= sk_prot_mem_limits(sk, 0)) {
3174 sk_leave_memory_pressure(sk);
3178 /* Under pressure. */
3179 if (allocated > sk_prot_mem_limits(sk, 1))
3180 sk_enter_memory_pressure(sk);
3182 /* Over hard limit. */
3183 if (allocated > sk_prot_mem_limits(sk, 2))
3184 goto suppress_allocation;
3186 /* Guarantee minimum buffer size under pressure (either global
3187 * or memcg) to make sure features described in RFC 7323 (TCP
3188 * Extensions for High Performance) work properly.
3190 * This rule does NOT apply when usage exceeds the global or memcg hard
3191 * limit, or else a DoS attack could be mounted by spawning
3192 * lots of sockets whose usage is under the minimum buffer size.
3194 if (kind == SK_MEM_RECV) {
3195 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3198 } else { /* SK_MEM_SEND */
3199 int wmem0 = sk_get_wmem0(sk, prot);
3201 if (sk->sk_type == SOCK_STREAM) {
3202 if (sk->sk_wmem_queued < wmem0)
3204 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3209 if (sk_has_memory_pressure(sk)) {
3212 /* The following 'average' heuristic is within the
3213 * scope of global accounting, so it only makes
3214 * sense for global memory pressure.
3216 if (!sk_under_global_memory_pressure(sk))
3219 /* Try to be fair among all the sockets under global
3220 * pressure by allowing the ones that are below average usage to raise. */
3223 alloc = sk_sockets_allocated_read_positive(sk);
3224 if (sk_prot_mem_limits(sk, 2) > alloc *
3225 sk_mem_pages(sk->sk_wmem_queued +
3226 atomic_read(&sk->sk_rmem_alloc) +
3227 sk->sk_forward_alloc))
3231 suppress_allocation:
3233 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3234 sk_stream_moderate_sndbuf(sk);
3236 /* Fail only if the socket is _under_ its sndbuf.
3237 * In this case we cannot block, so we have to fail.
3239 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3240 /* Force charge with __GFP_NOFAIL */
3241 if (memcg && !charged) {
3242 mem_cgroup_charge_skmem(memcg, amt,
3243 gfp_memcg_charge() | __GFP_NOFAIL);
3249 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3250 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3252 sk_memory_allocated_sub(sk, amt);
3255 mem_cgroup_uncharge_skmem(memcg, amt);
3261 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3263 * @size: memory size to allocate
3264 * @kind: allocation type
3266 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3267 * rmem allocation. This function assumes that protocols which have
3268 * memory_pressure use sk_wmem_queued as write buffer accounting.
3270 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3272 int ret, amt = sk_mem_pages(size);
3274 sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3275 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3277 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3280 EXPORT_SYMBOL(__sk_mem_schedule);
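/* Worked example for __sk_mem_schedule() above (illustrative, assuming 4 KiB
 * pages): charging size == 1500 gives amt = sk_mem_pages(1500) == 1, so
 * sk_forward_alloc grows by one page (4096 bytes) and memory_allocated by a
 * single page-sized quantum.
 */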
3283 * __sk_mem_reduce_allocated - reclaim memory_allocated
3285 * @amount: number of quanta
3287 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3289 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3291 sk_memory_allocated_sub(sk, amount);
3293 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3294 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3296 if (sk_under_global_memory_pressure(sk) &&
3297 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3298 sk_leave_memory_pressure(sk);
3302 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3304 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3306 void __sk_mem_reclaim(struct sock *sk, int amount)
3308 amount >>= PAGE_SHIFT;
3309 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3310 __sk_mem_reduce_allocated(sk, amount);
3312 EXPORT_SYMBOL(__sk_mem_reclaim);
3314 int sk_set_peek_off(struct sock *sk, int val)
3316 WRITE_ONCE(sk->sk_peek_off, val);
3319 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3322 * Set of default routines for initialising struct proto_ops when
3323 * the protocol does not support a particular function. In certain
3324 * cases where it makes no sense for a protocol to have a "do nothing"
3325 * function, some default processing is provided.
3328 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3332 EXPORT_SYMBOL(sock_no_bind);
3334 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3339 EXPORT_SYMBOL(sock_no_connect);
3341 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3345 EXPORT_SYMBOL(sock_no_socketpair);
3347 int sock_no_accept(struct socket *sock, struct socket *newsock,
3348 struct proto_accept_arg *arg)
3352 EXPORT_SYMBOL(sock_no_accept);
3354 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3359 EXPORT_SYMBOL(sock_no_getname);
3361 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3365 EXPORT_SYMBOL(sock_no_ioctl);
3367 int sock_no_listen(struct socket *sock, int backlog)
3371 EXPORT_SYMBOL(sock_no_listen);
3373 int sock_no_shutdown(struct socket *sock, int how)
3377 EXPORT_SYMBOL(sock_no_shutdown);
3379 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3383 EXPORT_SYMBOL(sock_no_sendmsg);
3385 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3389 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3391 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3396 EXPORT_SYMBOL(sock_no_recvmsg);
3398 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3400 /* Mirror missing mmap method error code */
3403 EXPORT_SYMBOL(sock_no_mmap);
3406 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3407 * various sock-based usage counts.
3409 void __receive_sock(struct file *file)
3411 struct socket *sock;
3413 sock = sock_from_file(file);
3415 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3416 sock_update_classid(&sock->sk->sk_cgrp_data);
3421 * Default Socket Callbacks
3424 static void sock_def_wakeup(struct sock *sk)
3426 struct socket_wq *wq;
3429 wq = rcu_dereference(sk->sk_wq);
3430 if (skwq_has_sleeper(wq))
3431 wake_up_interruptible_all(&wq->wait);
3435 static void sock_def_error_report(struct sock *sk)
3437 struct socket_wq *wq;
3440 wq = rcu_dereference(sk->sk_wq);
3441 if (skwq_has_sleeper(wq))
3442 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3443 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3447 void sock_def_readable(struct sock *sk)
3449 struct socket_wq *wq;
3451 trace_sk_data_ready(sk);
3454 wq = rcu_dereference(sk->sk_wq);
3455 if (skwq_has_sleeper(wq))
3456 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3457 EPOLLRDNORM | EPOLLRDBAND);
3458 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3462 static void sock_def_write_space(struct sock *sk)
3464 struct socket_wq *wq;
3468 /* Do not wake up a writer until it can make "significant" progress. */
3471 if (sock_writeable(sk)) {
3472 wq = rcu_dereference(sk->sk_wq);
3473 if (skwq_has_sleeper(wq))
3474 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3475 EPOLLWRNORM | EPOLLWRBAND);
3477 /* Should agree with poll, otherwise some programs break */
3478 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3484 /* An optimised version of sock_def_write_space(), should only be called
3485 * for SOCK_RCU_FREE sockets under the RCU read section, after the sk_wmem_alloc reference has been put. */
3488 static void sock_def_write_space_wfree(struct sock *sk)
3490 /* Do not wake up a writer until it can make "significant" progress. */
3493 if (sock_writeable(sk)) {
3494 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3496 /* rely on refcount_sub from sock_wfree() */
3497 smp_mb__after_atomic();
3498 if (wq && waitqueue_active(&wq->wait))
3499 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3500 EPOLLWRNORM | EPOLLWRBAND);
3502 /* Should agree with poll, otherwise some programs break */
3503 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3507 static void sock_def_destruct(struct sock *sk)
3511 void sk_send_sigurg(struct sock *sk)
3513 if (sk->sk_socket && sk->sk_socket->file)
3514 if (send_sigurg(sk->sk_socket->file))
3515 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3517 EXPORT_SYMBOL(sk_send_sigurg);
3519 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3520 unsigned long expires)
3522 if (!mod_timer(timer, expires))
3525 EXPORT_SYMBOL(sk_reset_timer);
3527 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3529 if (del_timer(timer))
3532 EXPORT_SYMBOL(sk_stop_timer);
3534 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3536 if (del_timer_sync(timer))
3539 EXPORT_SYMBOL(sk_stop_timer_sync);
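/* Initialise the protocol-independent state of a freshly allocated socket:
 * queues, default callbacks, buffer sizes and timeouts, recording @uid as
 * the socket's owning user.
 */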
3541 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3544 sk->sk_send_head = NULL;
3546 timer_setup(&sk->sk_timer, NULL, 0);
3548 sk->sk_allocation = GFP_KERNEL;
3549 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3550 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3551 sk->sk_state = TCP_CLOSE;
3552 sk->sk_use_task_frag = true;
3553 sk_set_socket(sk, sock);
3555 sock_set_flag(sk, SOCK_ZAPPED);
3558 sk->sk_type = sock->type;
3559 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3562 RCU_INIT_POINTER(sk->sk_wq, NULL);
3566 sk->sk_state_change = sock_def_wakeup;
3567 sk->sk_data_ready = sock_def_readable;
3568 sk->sk_write_space = sock_def_write_space;
3569 sk->sk_error_report = sock_def_error_report;
3570 sk->sk_destruct = sock_def_destruct;
3572 sk->sk_frag.page = NULL;
3573 sk->sk_frag.offset = 0;
3574 sk->sk_peek_off = -1;
3576 sk->sk_peer_pid = NULL;
3577 sk->sk_peer_cred = NULL;
3578 spin_lock_init(&sk->sk_peer_lock);
3580 sk->sk_write_pending = 0;
3581 sk->sk_rcvlowat = 1;
3582 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3583 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3585 sk->sk_stamp = SK_DEFAULT_STAMP;
3586 #if BITS_PER_LONG==32
3587 seqlock_init(&sk->sk_stamp_seq);
3589 atomic_set(&sk->sk_zckey, 0);
3591 #ifdef CONFIG_NET_RX_BUSY_POLL
3593 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3596 sk->sk_max_pacing_rate = ~0UL;
3597 sk->sk_pacing_rate = ~0UL;
3598 WRITE_ONCE(sk->sk_pacing_shift, 10);
3599 sk->sk_incoming_cpu = -1;
3601 sk_rx_queue_clear(sk);
3603 * Before updating sk_refcnt, we must commit prior changes to memory
3604 * (Documentation/RCU/rculist_nulls.rst for details)
3607 refcount_set(&sk->sk_refcnt, 1);
3608 atomic_set(&sk->sk_drops, 0);
3610 EXPORT_SYMBOL(sock_init_data_uid);
3612 void sock_init_data(struct socket *sock, struct sock *sk)
3615 SOCK_INODE(sock)->i_uid :
3616 make_kuid(sock_net(sk)->user_ns, 0);
3618 sock_init_data_uid(sock, sk, uid);
3620 EXPORT_SYMBOL(sock_init_data);
3622 void lock_sock_nested(struct sock *sk, int subclass)
3624 /* The sk_lock has mutex_lock() semantics here. */
3625 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3628 spin_lock_bh(&sk->sk_lock.slock);
3629 if (sock_owned_by_user_nocheck(sk))
3631 sk->sk_lock.owned = 1;
3632 spin_unlock_bh(&sk->sk_lock.slock);
3634 EXPORT_SYMBOL(lock_sock_nested);
3636 void release_sock(struct sock *sk)
3638 spin_lock_bh(&sk->sk_lock.slock);
3639 if (sk->sk_backlog.tail)
3642 if (sk->sk_prot->release_cb)
3643 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3644 tcp_release_cb, sk);
3646 sock_release_ownership(sk);
3647 if (waitqueue_active(&sk->sk_lock.wq))
3648 wake_up(&sk->sk_lock.wq);
3649 spin_unlock_bh(&sk->sk_lock.slock);
3651 EXPORT_SYMBOL(release_sock);
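/* Illustrative sketch (not part of the original file): the canonical
 * lock_sock()/release_sock() pattern for process-context socket updates.
 * example_locked_update() is a hypothetical name.
 */
#if 0
static void example_locked_update(struct sock *sk)
{
	lock_sock(sk);		/* may sleep; packets arriving now go to the backlog */
	/* ... modify socket state owned by process context ... */
	release_sock(sk);	/* drains the backlog and wakes other lockers */
}
#endif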
3653 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3656 spin_lock_bh(&sk->sk_lock.slock);
3658 if (!sock_owned_by_user_nocheck(sk)) {
3660 * Fast path return with bottom halves disabled and
3661 * sock::sk_lock.slock held.
3663 * The 'mutex' is not contended and holding
3664 * sock::sk_lock.slock prevents all other lockers from
3665 * proceeding, so the corresponding unlock_sock_fast() can
3666 * avoid the slow path of release_sock() completely and
3667 * just release slock.
3669 * From a semantic point of view this is equivalent to 'acquiring'
3670 * the 'mutex', hence the corresponding lockdep
3671 * mutex_release() has to happen in the fast path of
3672 * unlock_sock_fast().
3678 sk->sk_lock.owned = 1;
3679 __acquire(&sk->sk_lock.slock);
3680 spin_unlock_bh(&sk->sk_lock.slock);
3683 EXPORT_SYMBOL(__lock_sock_fast);
3685 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3686 bool timeval, bool time32)
3688 struct sock *sk = sock->sk;
3689 struct timespec64 ts;
3691 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3692 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3693 if (ts.tv_sec == -1)
3695 if (ts.tv_sec == 0) {
3696 ktime_t kt = ktime_get_real();
3697 sock_write_timestamp(sk, kt);
3698 ts = ktime_to_timespec64(kt);
3704 #ifdef CONFIG_COMPAT_32BIT_TIME
3706 return put_old_timespec32(&ts, userstamp);
3708 #ifdef CONFIG_SPARC64
3709 /* beware of padding in sparc64 timeval */
3710 if (timeval && !in_compat_syscall()) {
3711 struct __kernel_old_timeval __user tv = {
3712 .tv_sec = ts.tv_sec,
3713 .tv_usec = ts.tv_nsec,
3715 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3720 return put_timespec64(&ts, userstamp);
3722 EXPORT_SYMBOL(sock_gettstamp);
3724 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3726 if (!sock_flag(sk, flag)) {
3727 unsigned long previous_flags = sk->sk_flags;
3729 sock_set_flag(sk, flag);
3731 * we just set one of the two flags which require net
3732 * time stamping, but time stamping might have been on
3733 * already because of the other one
3735 if (sock_needs_netstamp(sk) &&
3736 !(previous_flags & SK_FLAGS_TIMESTAMP))
3737 net_enable_timestamp();
3741 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3742 int level, int type)
3744 struct sock_exterr_skb *serr;
3745 struct sk_buff *skb;
3749 skb = sock_dequeue_err_skb(sk);
3755 msg->msg_flags |= MSG_TRUNC;
3758 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3762 sock_recv_timestamp(msg, sk, skb);
3764 serr = SKB_EXT_ERR(skb);
3765 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3767 msg->msg_flags |= MSG_ERRQUEUE;
3775 EXPORT_SYMBOL(sock_recv_errqueue);
3778 * Get a socket option on a socket.
3780 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3781 * asynchronous errors should be reported by getsockopt. We assume
3782 * this means if you specify SO_ERROR (otherwise what is the point of it).
3784 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3785 char __user *optval, int __user *optlen)
3787 struct sock *sk = sock->sk;
3789 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3790 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3792 EXPORT_SYMBOL(sock_common_getsockopt);
3794 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3797 struct sock *sk = sock->sk;
3801 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3803 msg->msg_namelen = addr_len;
3806 EXPORT_SYMBOL(sock_common_recvmsg);
3809 * Set socket options on an inet socket.
3811 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3812 sockptr_t optval, unsigned int optlen)
3814 struct sock *sk = sock->sk;
3816 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3817 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3819 EXPORT_SYMBOL(sock_common_setsockopt);
3821 void sk_common_release(struct sock *sk)
3823 if (sk->sk_prot->destroy)
3824 sk->sk_prot->destroy(sk);
3827 * Observation: when sk_common_release is called, processes have
3828 * no access to the socket, but the net still has.
3829 * Step one, detach it from networking:
3831 * A. Remove from hash tables.
3834 sk->sk_prot->unhash(sk);
3837 * At this point the socket cannot receive new packets, but it is possible
3838 * that some packets are in flight because some CPU runs the receiver and
3839 * did the hash table lookup before we unhashed the socket. They will reach
3840 * the receive queue and will be purged by the socket destructor.
3842 * Also we still have packets pending on the receive queue and probably
3843 * our own packets waiting in device queues. sock_destroy will drain the
3844 * receive queue, but transmitted packets will delay socket destruction
3845 * until the last reference is released.
3850 xfrm_sk_free_policy(sk);
3854 EXPORT_SYMBOL(sk_common_release);
3856 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3858 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3860 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3861 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3862 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3863 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3864 mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3865 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3866 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3867 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3868 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3871 #ifdef CONFIG_PROC_FS
3872 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3874 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3876 int cpu, idx = prot->inuse_idx;
3879 for_each_possible_cpu(cpu)
3880 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3882 return res >= 0 ? res : 0;
3884 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3886 int sock_inuse_get(struct net *net)
3890 for_each_possible_cpu(cpu)
3891 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3896 EXPORT_SYMBOL_GPL(sock_inuse_get);
3898 static int __net_init sock_inuse_init_net(struct net *net)
3900 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3901 if (net->core.prot_inuse == NULL)
3906 static void __net_exit sock_inuse_exit_net(struct net *net)
3908 free_percpu(net->core.prot_inuse);
3911 static struct pernet_operations net_inuse_ops = {
3912 .init = sock_inuse_init_net,
3913 .exit = sock_inuse_exit_net,
3916 static __init int net_inuse_init(void)
3918 if (register_pernet_subsys(&net_inuse_ops))
3919 panic("Cannot initialize net inuse counters");
3924 core_initcall(net_inuse_init);
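/* Reserve a slot in the global proto_inuse_idx bitmap so the protocol's
 * per-cpu socket counts can be reported through /proc/net/protocols.
 */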
3926 static int assign_proto_idx(struct proto *prot)
3928 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3930 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3931 pr_err("PROTO_INUSE_NR exhausted\n");
3935 set_bit(prot->inuse_idx, proto_inuse_idx);
3939 static void release_proto_idx(struct proto *prot)
3941 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3942 clear_bit(prot->inuse_idx, proto_inuse_idx);
3945 static inline int assign_proto_idx(struct proto *prot)
3950 static inline void release_proto_idx(struct proto *prot)
3956 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3960 kfree(twsk_prot->twsk_slab_name);
3961 twsk_prot->twsk_slab_name = NULL;
3962 kmem_cache_destroy(twsk_prot->twsk_slab);
3963 twsk_prot->twsk_slab = NULL;
3966 static int tw_prot_init(const struct proto *prot)
3968 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3973 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3975 if (!twsk_prot->twsk_slab_name)
3978 twsk_prot->twsk_slab =
3979 kmem_cache_create(twsk_prot->twsk_slab_name,
3980 twsk_prot->twsk_obj_size, 0,
3981 SLAB_ACCOUNT | prot->slab_flags,
3983 if (!twsk_prot->twsk_slab) {
3984 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3992 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3996 kfree(rsk_prot->slab_name);
3997 rsk_prot->slab_name = NULL;
3998 kmem_cache_destroy(rsk_prot->slab);
3999 rsk_prot->slab = NULL;
4002 static int req_prot_init(const struct proto *prot)
4004 struct request_sock_ops *rsk_prot = prot->rsk_prot;
4009 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4011 if (!rsk_prot->slab_name)
4014 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4015 rsk_prot->obj_size, 0,
4016 SLAB_ACCOUNT | prot->slab_flags,
4019 if (!rsk_prot->slab) {
4020 pr_crit("%s: Can't create request sock SLAB cache!\n",
4027 int proto_register(struct proto *prot, int alloc_slab)
4031 if (prot->memory_allocated && !prot->sysctl_mem) {
4032 pr_err("%s: missing sysctl_mem\n", prot->name);
4035 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4036 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4040 prot->slab = kmem_cache_create_usercopy(prot->name,
4042 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4044 prot->useroffset, prot->usersize,
4047 if (prot->slab == NULL) {
4048 pr_crit("%s: Can't create sock SLAB cache!\n",
4053 if (req_prot_init(prot))
4054 goto out_free_request_sock_slab;
4056 if (tw_prot_init(prot))
4057 goto out_free_timewait_sock_slab;
4060 mutex_lock(&proto_list_mutex);
4061 ret = assign_proto_idx(prot);
4063 mutex_unlock(&proto_list_mutex);
4064 goto out_free_timewait_sock_slab;
4066 list_add(&prot->node, &proto_list);
4067 mutex_unlock(&proto_list_mutex);
4070 out_free_timewait_sock_slab:
4072 tw_prot_cleanup(prot->twsk_prot);
4073 out_free_request_sock_slab:
4075 req_prot_cleanup(prot->rsk_prot);
4077 kmem_cache_destroy(prot->slab);
4083 EXPORT_SYMBOL(proto_register);
4085 void proto_unregister(struct proto *prot)
4087 mutex_lock(&proto_list_mutex);
4088 release_proto_idx(prot);
4089 list_del(&prot->node);
4090 mutex_unlock(&proto_list_mutex);
4092 kmem_cache_destroy(prot->slab);
4095 req_prot_cleanup(prot->rsk_prot);
4096 tw_prot_cleanup(prot->twsk_prot);
4098 EXPORT_SYMBOL(proto_unregister);
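/* Illustrative sketch (not part of the original file): minimal registration
 * of a protocol with proto_register()/proto_unregister().  example_proto and
 * the init/exit functions are hypothetical names.
 */
#if 0
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1);	/* 1: also create the slab cache */
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}
#endif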
4100 int sock_load_diag_module(int family, int protocol)
4103 if (!sock_is_registered(family))
4106 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4107 NETLINK_SOCK_DIAG, family);
4111 if (family == AF_INET &&
4112 protocol != IPPROTO_RAW &&
4113 protocol < MAX_INET_PROTOS &&
4114 !rcu_access_pointer(inet_protos[protocol]))
4118 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4119 NETLINK_SOCK_DIAG, family, protocol);
4121 EXPORT_SYMBOL(sock_load_diag_module);
4123 #ifdef CONFIG_PROC_FS
4124 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4125 __acquires(proto_list_mutex)
4127 mutex_lock(&proto_list_mutex);
4128 return seq_list_start_head(&proto_list, *pos);
4131 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4133 return seq_list_next(v, &proto_list, pos);
4136 static void proto_seq_stop(struct seq_file *seq, void *v)
4137 __releases(proto_list_mutex)
4139 mutex_unlock(&proto_list_mutex);
4142 static char proto_method_implemented(const void *method)
4144 return method == NULL ? 'n' : 'y';
4146 static long sock_prot_memory_allocated(struct proto *proto)
4148 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4151 static const char *sock_prot_memory_pressure(struct proto *proto)
4153 return proto->memory_pressure != NULL ?
4154 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4157 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4160 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4161 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4164 sock_prot_inuse_get(seq_file_net(seq), proto),
4165 sock_prot_memory_allocated(proto),
4166 sock_prot_memory_pressure(proto),
4168 proto->slab == NULL ? "no" : "yes",
4169 module_name(proto->owner),
4170 proto_method_implemented(proto->close),
4171 proto_method_implemented(proto->connect),
4172 proto_method_implemented(proto->disconnect),
4173 proto_method_implemented(proto->accept),
4174 proto_method_implemented(proto->ioctl),
4175 proto_method_implemented(proto->init),
4176 proto_method_implemented(proto->destroy),
4177 proto_method_implemented(proto->shutdown),
4178 proto_method_implemented(proto->setsockopt),
4179 proto_method_implemented(proto->getsockopt),
4180 proto_method_implemented(proto->sendmsg),
4181 proto_method_implemented(proto->recvmsg),
4182 proto_method_implemented(proto->bind),
4183 proto_method_implemented(proto->backlog_rcv),
4184 proto_method_implemented(proto->hash),
4185 proto_method_implemented(proto->unhash),
4186 proto_method_implemented(proto->get_port),
4187 proto_method_implemented(proto->enter_memory_pressure));
4190 static int proto_seq_show(struct seq_file *seq, void *v)
4192 if (v == &proto_list)
4193 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4202 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4204 proto_seq_printf(seq, list_entry(v, struct proto, node));
4208 static const struct seq_operations proto_seq_ops = {
4209 .start = proto_seq_start,
4210 .next = proto_seq_next,
4211 .stop = proto_seq_stop,
4212 .show = proto_seq_show,
4215 static __net_init int proto_init_net(struct net *net)
4217 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4218 sizeof(struct seq_net_private)))
4224 static __net_exit void proto_exit_net(struct net *net)
4226 remove_proc_entry("protocols", net->proc_net);
4230 static __net_initdata struct pernet_operations proto_net_ops = {
4231 .init = proto_init_net,
4232 .exit = proto_exit_net,
4235 static int __init proto_init(void)
4237 return register_pernet_subsys(&proto_net_ops);
4240 subsys_initcall(proto_init);
4242 #endif /* PROC_FS */
4244 #ifdef CONFIG_NET_RX_BUSY_POLL
4245 bool sk_busy_loop_end(void *p, unsigned long start_time)
4247 struct sock *sk = p;
4249 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4252 if (sk_is_udp(sk) &&
4253 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4256 return sk_busy_loop_timeout(sk, start_time);
4258 EXPORT_SYMBOL(sk_busy_loop_end);
4259 #endif /* CONFIG_NET_RX_BUSY_POLL */
4261 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4263 if (!sk->sk_prot->bind_add)
4265 return sk->sk_prot->bind_add(sk, addr, addr_len);
4267 EXPORT_SYMBOL(sock_bind_add);
4269 /* Copy 'size' bytes from userspace and return 'size' back to userspace */
4270 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4271 void __user *arg, void *karg, size_t size)
4275 if (copy_from_user(karg, arg, size))
4278 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4282 if (copy_to_user(arg, karg, size))
4287 EXPORT_SYMBOL(sock_ioctl_inout);
4289 /* This is the most common ioctl prep function, where the result (4 bytes) is
4290 * copied back to userspace if the ioctl() returns successfully. No input
4291 * argument is copied from userspace.
4293 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4297 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4301 return put_user(karg, (int __user *)arg);
4304 /* A wrapper around sock ioctls, which copies the data from userspace
4305 * (depending on the protocol/ioctl), and copies back the result to userspace.
4306 * The main motivation for this function is to pass kernel memory to the
4307 * protocol ioctl callbacks, instead of userspace memory.
4309 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4313 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4314 rc = ipmr_sk_ioctl(sk, cmd, arg);
4315 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4316 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4317 else if (sk_is_phonet(sk))
4318 rc = phonet_sk_ioctl(sk, cmd, arg);
4320 /* If the ioctl was processed, return its value */
4324 /* Otherwise call the default handler */
4325 return sock_ioctl_out(sk, cmd, arg);
4327 EXPORT_SYMBOL(sk_ioctl);
4329 static int __init sock_struct_check(void)
4331 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4332 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4333 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4334 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4335 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4337 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4338 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4339 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4340 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4341 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4342 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4343 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4344 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4345 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4347 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4348 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4349 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4351 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4352 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4353 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4354 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4356 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4358 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4359 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4360 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4361 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4362 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4363 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4364 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4365 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4366 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4367 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4368 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4369 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4370 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4371 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4373 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4374 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4375 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4376 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4377 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4378 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4379 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4380 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4381 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4382 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4383 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4384 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4385 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4389 core_initcall(sock_struct_check);