net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <[email protected]>
  12  *              Florian La Roche, <[email protected]>
  13  *              Alan Cox, <[email protected]>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117 #include <linux/mroute.h>
 118 #include <linux/mroute6.h>
 119 #include <linux/icmpv6.h>
 120
 121 #include <linux/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138 #include <net/bpf_sk_storage.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144 #include <net/phonet/phonet.h>
 145
 146 #include <linux/ethtool.h>
 147
 148 #include "dev.h"
 149
 150 static DEFINE_MUTEX(proto_list_mutex);
 151 static LIST_HEAD(proto_list);
 152
 153 static void sock_def_write_space_wfree(struct sock *sk);
 154 static void sock_def_write_space(struct sock *sk);
 155
 156 /**
 157  * sk_ns_capable - General socket capability test
 158  * @sk: Socket to use a capability on or through
 159  * @user_ns: The user namespace of the capability to use
 160  * @cap: The capability to use
 161  *
 162  * Test to see if the opener of the socket had when the socket was
 163  * created and the current process has the capability @cap in the user
 164  * namespace @user_ns.
 165  */
 166 bool sk_ns_capable(const struct sock *sk,
 167                    struct user_namespace *user_ns, int cap)
 168 {
 169         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 170                 ns_capable(user_ns, cap);
 171 }
 172 EXPORT_SYMBOL(sk_ns_capable);
 173
 174 /**
 175  * sk_capable - Socket global capability test
 176  * @sk: Socket to use a capability on or through
 177  * @cap: The global capability to use
 178  *
 179  * Test to see if the opener of the socket had when the socket was
 180  * created and the current process has the capability @cap in all user
 181  * namespaces.
 182  */
 183 bool sk_capable(const struct sock *sk, int cap)
 184 {
 185         return sk_ns_capable(sk, &init_user_ns, cap);
 186 }
 187 EXPORT_SYMBOL(sk_capable);
 188
 189 /**
 190  * sk_net_capable - Network namespace socket capability test
 191  * @sk: Socket to use a capability on or through
 192  * @cap: The capability to use
 193  *
 194  * Test to see if the opener of the socket had when the socket was created
 195  * and the current process has the capability @cap over the network namespace
 196  * the socket is a member of.
 197  */
 198 bool sk_net_capable(const struct sock *sk, int cap)
 199 {
 200         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 201 }
 202 EXPORT_SYMBOL(sk_net_capable);
 203
 204 /*
 205  * Each address family might have different locking rules, so we have
 206  * one slock key per address family and separate keys for internal and
 207  * userspace sockets.
 208  */
 209 static struct lock_class_key af_family_keys[AF_MAX];
 210 static struct lock_class_key af_family_kern_keys[AF_MAX];
 211 static struct lock_class_key af_family_slock_keys[AF_MAX];
 212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 213
 214 /*
 215  * Make lock validator output more readable. (we pre-construct these
 216  * strings build-time, so that runtime initialization of socket
 217  * locks is fast):
 218  */
 219
 220 #define _sock_locks(x)                                            \
 221   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 222   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 223   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 224   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 225   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 226   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 227   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 228   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 229   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 230   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 231   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 232   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 233   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 234   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 235   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 236   x "AF_MCTP"  , \
 237   x "AF_MAX"
 238
 239 static const char *const af_family_key_strings[AF_MAX+1] = {
 240         _sock_locks("sk_lock-")
 241 };
 242 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 243         _sock_locks("slock-")
 244 };
 245 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 246         _sock_locks("clock-")
 247 };
 248
 249 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 250         _sock_locks("k-sk_lock-")
 251 };
 252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 253         _sock_locks("k-slock-")
 254 };
 255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 256         _sock_locks("k-clock-")
 257 };
 258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 259         _sock_locks("rlock-")
 260 };
 261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 262         _sock_locks("wlock-")
 263 };
 264 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 265         _sock_locks("elock-")
 266 };
 267
 268 /*
 269  * sk_callback_lock and sk queues locking rules are per-address-family,
 270  * so split the lock classes by using a per-AF key:
 271  */
 272 static struct lock_class_key af_callback_keys[AF_MAX];
 273 static struct lock_class_key af_rlock_keys[AF_MAX];
 274 static struct lock_class_key af_wlock_keys[AF_MAX];
 275 static struct lock_class_key af_elock_keys[AF_MAX];
 276 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 277
 278 /* Run time adjustable parameters. */
 279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 280 EXPORT_SYMBOL(sysctl_wmem_max);
 281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 282 EXPORT_SYMBOL(sysctl_rmem_max);
 283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
 286 /* Maximal space eaten by iovec or ancillary data plus some space */
 287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 288 EXPORT_SYMBOL(sysctl_optmem_max);
 289
 290 int sysctl_tstamp_allow_data __read_mostly = 1;
 291
 292 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 293 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 294
 295 /**
 296  * sk_set_memalloc - sets %SOCK_MEMALLOC
 297  * @sk: socket to set it on
 298  *
 299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 300  * It's the responsibility of the admin to adjust min_free_kbytes
 301  * to meet the requirements
 302  */
 303 void sk_set_memalloc(struct sock *sk)
 304 {
 305         sock_set_flag(sk, SOCK_MEMALLOC);
 306         sk->sk_allocation |= __GFP_MEMALLOC;
 307         static_branch_inc(&memalloc_socks_key);
 308 }
 309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311 void sk_clear_memalloc(struct sock *sk)
 312 {
 313         sock_reset_flag(sk, SOCK_MEMALLOC);
 314         sk->sk_allocation &= ~__GFP_MEMALLOC;
 315         static_branch_dec(&memalloc_socks_key);
 316
 317         /*
 318          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319          * progress of swapping. SOCK_MEMALLOC may be cleared while
 320          * it has rmem allocations due to the last swapfile being deactivated
 321          * but there is a risk that the socket is unusable due to exceeding
 322          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323          */
 324         sk_mem_reclaim(sk);
 325 }
 326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 327
 328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329 {
 330         int ret;
 331         unsigned int noreclaim_flag;
 332
 333         /* these should have been dropped before queueing */
 334         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336         noreclaim_flag = memalloc_noreclaim_save();
 337         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 338                                  tcp_v6_do_rcv,
 339                                  tcp_v4_do_rcv,
 340                                  sk, skb);
 341         memalloc_noreclaim_restore(noreclaim_flag);
 342
 343         return ret;
 344 }
 345 EXPORT_SYMBOL(__sk_backlog_rcv);
 346
 347 void sk_error_report(struct sock *sk)
 348 {
 349         sk->sk_error_report(sk);
 350
 351         switch (sk->sk_family) {
 352         case AF_INET:
 353                 fallthrough;
 354         case AF_INET6:
 355                 trace_inet_sk_error_report(sk);
 356                 break;
 357         default:
 358                 break;
 359         }
 360 }
 361 EXPORT_SYMBOL(sk_error_report);
 362
 363 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 364 {
 365         struct __kernel_sock_timeval tv;
 366
 367         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 368                 tv.tv_sec = 0;
 369                 tv.tv_usec = 0;
 370         } else {
 371                 tv.tv_sec = timeo / HZ;
 372                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 373         }
 374
 375         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 376                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 377                 *(struct old_timeval32 *)optval = tv32;
 378                 return sizeof(tv32);
 379         }
 380
 381         if (old_timeval) {
 382                 struct __kernel_old_timeval old_tv;
 383                 old_tv.tv_sec = tv.tv_sec;
 384                 old_tv.tv_usec = tv.tv_usec;
 385                 *(struct __kernel_old_timeval *)optval = old_tv;
 386                 return sizeof(old_tv);
 387         }
 388
 389         *(struct __kernel_sock_timeval *)optval = tv;
 390         return sizeof(tv);
 391 }
 392 EXPORT_SYMBOL(sock_get_timeout);
 393
 394 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 395                            sockptr_t optval, int optlen, bool old_timeval)
 396 {
 397         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 398                 struct old_timeval32 tv32;
 399
 400                 if (optlen < sizeof(tv32))
 401                         return -EINVAL;
 402
 403                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 404                         return -EFAULT;
 405                 tv->tv_sec = tv32.tv_sec;
 406                 tv->tv_usec = tv32.tv_usec;
 407         } else if (old_timeval) {
 408                 struct __kernel_old_timeval old_tv;
 409
 410                 if (optlen < sizeof(old_tv))
 411                         return -EINVAL;
 412                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 413                         return -EFAULT;
 414                 tv->tv_sec = old_tv.tv_sec;
 415                 tv->tv_usec = old_tv.tv_usec;
 416         } else {
 417                 if (optlen < sizeof(*tv))
 418                         return -EINVAL;
 419                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 420                         return -EFAULT;
 421         }
 422
 423         return 0;
 424 }
 425 EXPORT_SYMBOL(sock_copy_user_timeval);
 426
 427 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 428                             bool old_timeval)
 429 {
 430         struct __kernel_sock_timeval tv;
 431         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 432         long val;
 433
 434         if (err)
 435                 return err;
 436
 437         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 438                 return -EDOM;
 439
 440         if (tv.tv_sec < 0) {
 441                 static int warned __read_mostly;
 442
 443                 WRITE_ONCE(*timeo_p, 0);
 444                 if (warned < 10 && net_ratelimit()) {
 445                         warned++;
 446                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 447                                 __func__, current->comm, task_pid_nr(current));
 448                 }
 449                 return 0;
 450         }
 451         val = MAX_SCHEDULE_TIMEOUT;
 452         if ((tv.tv_sec || tv.tv_usec) &&
 453             (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
 454                 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
 455                                                     USEC_PER_SEC / HZ);
 456         WRITE_ONCE(*timeo_p, val);
 457         return 0;
 458 }
 459
 460 static bool sock_needs_netstamp(const struct sock *sk)
 461 {
 462         switch (sk->sk_family) {
 463         case AF_UNSPEC:
 464         case AF_UNIX:
 465                 return false;
 466         default:
 467                 return true;
 468         }
 469 }
 470
 471 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 472 {
 473         if (sk->sk_flags & flags) {
 474                 sk->sk_flags &= ~flags;
 475                 if (sock_needs_netstamp(sk) &&
 476                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 477                         net_disable_timestamp();
 478         }
 479 }
 480
 481
 482 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 483 {
 484         unsigned long flags;
 485         struct sk_buff_head *list = &sk->sk_receive_queue;
 486
 487         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 488                 atomic_inc(&sk->sk_drops);
 489                 trace_sock_rcvqueue_full(sk, skb);
 490                 return -ENOMEM;
 491         }
 492
 493         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 494                 atomic_inc(&sk->sk_drops);
 495                 return -ENOBUFS;
 496         }
 497
 498         skb->dev = NULL;
 499         skb_set_owner_r(skb, sk);
 500
 501         /* we escape from rcu protected region, make sure we dont leak
 502          * a norefcounted dst
 503          */
 504         skb_dst_force(skb);
 505
 506         spin_lock_irqsave(&list->lock, flags);
 507         sock_skb_set_dropcount(sk, skb);
 508         __skb_queue_tail(list, skb);
 509         spin_unlock_irqrestore(&list->lock, flags);
 510
 511         if (!sock_flag(sk, SOCK_DEAD))
 512                 sk->sk_data_ready(sk);
 513         return 0;
 514 }
 515 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 516
 517 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 518                               enum skb_drop_reason *reason)
 519 {
 520         enum skb_drop_reason drop_reason;
 521         int err;
 522
 523         err = sk_filter(sk, skb);
 524         if (err) {
 525                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 526                 goto out;
 527         }
 528         err = __sock_queue_rcv_skb(sk, skb);
 529         switch (err) {
 530         case -ENOMEM:
 531                 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 532                 break;
 533         case -ENOBUFS:
 534                 drop_reason = SKB_DROP_REASON_PROTO_MEM;
 535                 break;
 536         default:
 537                 drop_reason = SKB_NOT_DROPPED_YET;
 538                 break;
 539         }
 540 out:
 541         if (reason)
 542                 *reason = drop_reason;
 543         return err;
 544 }
 545 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 546
 547 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 548                      const int nested, unsigned int trim_cap, bool refcounted)
 549 {
 550         int rc = NET_RX_SUCCESS;
 551
 552         if (sk_filter_trim_cap(sk, skb, trim_cap))
 553                 goto discard_and_relse;
 554
 555         skb->dev = NULL;
 556
 557         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 558                 atomic_inc(&sk->sk_drops);
 559                 goto discard_and_relse;
 560         }
 561         if (nested)
 562                 bh_lock_sock_nested(sk);
 563         else
 564                 bh_lock_sock(sk);
 565         if (!sock_owned_by_user(sk)) {
 566                 /*
 567                  * trylock + unlock semantics:
 568                  */
 569                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 570
 571                 rc = sk_backlog_rcv(sk, skb);
 572
 573                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 574         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 575                 bh_unlock_sock(sk);
 576                 atomic_inc(&sk->sk_drops);
 577                 goto discard_and_relse;
 578         }
 579
 580         bh_unlock_sock(sk);
 581 out:
 582         if (refcounted)
 583                 sock_put(sk);
 584         return rc;
 585 discard_and_relse:
 586         kfree_skb(skb);
 587         goto out;
 588 }
 589 EXPORT_SYMBOL(__sk_receive_skb);
 590
 591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 592                                                           u32));
 593 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 594                                                            u32));
 595 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 596 {
 597         struct dst_entry *dst = __sk_dst_get(sk);
 598
 599         if (dst && dst->obsolete &&
 600             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 601                                dst, cookie) == NULL) {
 602                 sk_tx_queue_clear(sk);
 603                 sk->sk_dst_pending_confirm = 0;
 604                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 605                 dst_release(dst);
 606                 return NULL;
 607         }
 608
 609         return dst;
 610 }
 611 EXPORT_SYMBOL(__sk_dst_check);
 612
 613 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 614 {
 615         struct dst_entry *dst = sk_dst_get(sk);
 616
 617         if (dst && dst->obsolete &&
 618             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 619                                dst, cookie) == NULL) {
 620                 sk_dst_reset(sk);
 621                 dst_release(dst);
 622                 return NULL;
 623         }
 624
 625         return dst;
 626 }
 627 EXPORT_SYMBOL(sk_dst_check);
 628
 629 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 630 {
 631         int ret = -ENOPROTOOPT;
 632 #ifdef CONFIG_NETDEVICES
 633         struct net *net = sock_net(sk);
 634
 635         /* Sorry... */
 636         ret = -EPERM;
 637         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 638                 goto out;
 639
 640         ret = -EINVAL;
 641         if (ifindex < 0)
 642                 goto out;
 643
 644         /* Paired with all READ_ONCE() done locklessly. */
 645         WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 646
 647         if (sk->sk_prot->rehash)
 648                 sk->sk_prot->rehash(sk);
 649         sk_dst_reset(sk);
 650
 651         ret = 0;
 652
 653 out:
 654 #endif
 655
 656         return ret;
 657 }
 658
 659 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 660 {
 661         int ret;
 662
 663         if (lock_sk)
 664                 lock_sock(sk);
 665         ret = sock_bindtoindex_locked(sk, ifindex);
 666         if (lock_sk)
 667                 release_sock(sk);
 668
 669         return ret;
 670 }
 671 EXPORT_SYMBOL(sock_bindtoindex);
 672
 673 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 674 {
 675         int ret = -ENOPROTOOPT;
 676 #ifdef CONFIG_NETDEVICES
 677         struct net *net = sock_net(sk);
 678         char devname[IFNAMSIZ];
 679         int index;
 680
 681         ret = -EINVAL;
 682         if (optlen < 0)
 683                 goto out;
 684
 685         /* Bind this socket to a particular device like "eth0",
 686          * as specified in the passed interface name. If the
 687          * name is "" or the option length is zero the socket
 688          * is not bound.
 689          */
 690         if (optlen > IFNAMSIZ - 1)
 691                 optlen = IFNAMSIZ - 1;
 692         memset(devname, 0, sizeof(devname));
 693
 694         ret = -EFAULT;
 695         if (copy_from_sockptr(devname, optval, optlen))
 696                 goto out;
 697
 698         index = 0;
 699         if (devname[0] != '\0') {
 700                 struct net_device *dev;
 701
 702                 rcu_read_lock();
 703                 dev = dev_get_by_name_rcu(net, devname);
 704                 if (dev)
 705                         index = dev->ifindex;
 706                 rcu_read_unlock();
 707                 ret = -ENODEV;
 708                 if (!dev)
 709                         goto out;
 710         }
 711
 712         sockopt_lock_sock(sk);
 713         ret = sock_bindtoindex_locked(sk, index);
 714         sockopt_release_sock(sk);
 715 out:
 716 #endif
 717
 718         return ret;
 719 }
 720
 721 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
 722                                 sockptr_t optlen, int len)
 723 {
 724         int ret = -ENOPROTOOPT;
 725 #ifdef CONFIG_NETDEVICES
 726         int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 727         struct net *net = sock_net(sk);
 728         char devname[IFNAMSIZ];
 729
 730         if (bound_dev_if == 0) {
 731                 len = 0;
 732                 goto zero;
 733         }
 734
 735         ret = -EINVAL;
 736         if (len < IFNAMSIZ)
 737                 goto out;
 738
 739         ret = netdev_get_name(net, devname, bound_dev_if);
 740         if (ret)
 741                 goto out;
 742
 743         len = strlen(devname) + 1;
 744
 745         ret = -EFAULT;
 746         if (copy_to_sockptr(optval, devname, len))
 747                 goto out;
 748
 749 zero:
 750         ret = -EFAULT;
 751         if (copy_to_sockptr(optlen, &len, sizeof(int)))
 752                 goto out;
 753
 754         ret = 0;
 755
 756 out:
 757 #endif
 758
 759         return ret;
 760 }
 761
 762 bool sk_mc_loop(struct sock *sk)
 763 {
 764         if (dev_recursion_level())
 765                 return false;
 766         if (!sk)
 767                 return true;
 768         switch (sk->sk_family) {
 769         case AF_INET:
 770                 return inet_test_bit(MC_LOOP, sk);
 771 #if IS_ENABLED(CONFIG_IPV6)
 772         case AF_INET6:
 773                 return inet6_sk(sk)->mc_loop;
 774 #endif
 775         }
 776         WARN_ON_ONCE(1);
 777         return true;
 778 }
 779 EXPORT_SYMBOL(sk_mc_loop);
 780
 781 void sock_set_reuseaddr(struct sock *sk)
 782 {
 783         lock_sock(sk);
 784         sk->sk_reuse = SK_CAN_REUSE;
 785         release_sock(sk);
 786 }
 787 EXPORT_SYMBOL(sock_set_reuseaddr);
 788
 789 void sock_set_reuseport(struct sock *sk)
 790 {
 791         lock_sock(sk);
 792         sk->sk_reuseport = true;
 793         release_sock(sk);
 794 }
 795 EXPORT_SYMBOL(sock_set_reuseport);
 796
 797 void sock_no_linger(struct sock *sk)
 798 {
 799         lock_sock(sk);
 800         WRITE_ONCE(sk->sk_lingertime, 0);
 801         sock_set_flag(sk, SOCK_LINGER);
 802         release_sock(sk);
 803 }
 804 EXPORT_SYMBOL(sock_no_linger);
 805
 806 void sock_set_priority(struct sock *sk, u32 priority)
 807 {
 808         lock_sock(sk);
 809         WRITE_ONCE(sk->sk_priority, priority);
 810         release_sock(sk);
 811 }
 812 EXPORT_SYMBOL(sock_set_priority);
 813
 814 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 815 {
 816         lock_sock(sk);
 817         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 818                 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
 819         else
 820                 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
 821         release_sock(sk);
 822 }
 823 EXPORT_SYMBOL(sock_set_sndtimeo);
 824
 825 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 826 {
 827         if (val)  {
 828                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 829                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 830                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 831                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 832         } else {
 833                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 834                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 835         }
 836 }
 837
 838 void sock_enable_timestamps(struct sock *sk)
 839 {
 840         lock_sock(sk);
 841         __sock_set_timestamps(sk, true, false, true);
 842         release_sock(sk);
 843 }
 844 EXPORT_SYMBOL(sock_enable_timestamps);
 845
 846 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 847 {
 848         switch (optname) {
 849         case SO_TIMESTAMP_OLD:
 850                 __sock_set_timestamps(sk, valbool, false, false);
 851                 break;
 852         case SO_TIMESTAMP_NEW:
 853                 __sock_set_timestamps(sk, valbool, true, false);
 854                 break;
 855         case SO_TIMESTAMPNS_OLD:
 856                 __sock_set_timestamps(sk, valbool, false, true);
 857                 break;
 858         case SO_TIMESTAMPNS_NEW:
 859                 __sock_set_timestamps(sk, valbool, true, true);
 860                 break;
 861         }
 862 }
 863
 864 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 865 {
 866         struct net *net = sock_net(sk);
 867         struct net_device *dev = NULL;
 868         bool match = false;
 869         int *vclock_index;
 870         int i, num;
 871
 872         if (sk->sk_bound_dev_if)
 873                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 874
 875         if (!dev) {
 876                 pr_err("%s: sock not bind to device\n", __func__);
 877                 return -EOPNOTSUPP;
 878         }
 879
 880         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 881         dev_put(dev);
 882
 883         for (i = 0; i < num; i++) {
 884                 if (*(vclock_index + i) == phc_index) {
 885                         match = true;
 886                         break;
 887                 }
 888         }
 889
 890         if (num > 0)
 891                 kfree(vclock_index);
 892
 893         if (!match)
 894                 return -EINVAL;
 895
 896         sk->sk_bind_phc = phc_index;
 897
 898         return 0;
 899 }
 900
 901 int sock_set_timestamping(struct sock *sk, int optname,
 902                           struct so_timestamping timestamping)
 903 {
 904         int val = timestamping.flags;
 905         int ret;
 906
 907         if (val & ~SOF_TIMESTAMPING_MASK)
 908                 return -EINVAL;
 909
 910         if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
 911             !(val & SOF_TIMESTAMPING_OPT_ID))
 912                 return -EINVAL;
 913
 914         if (val & SOF_TIMESTAMPING_OPT_ID &&
 915             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 916                 if (sk_is_tcp(sk)) {
 917                         if ((1 << sk->sk_state) &
 918                             (TCPF_CLOSE | TCPF_LISTEN))
 919                                 return -EINVAL;
 920                         if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
 921                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
 922                         else
 923                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 924                 } else {
 925                         atomic_set(&sk->sk_tskey, 0);
 926                 }
 927         }
 928
 929         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 930             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 931                 return -EINVAL;
 932
 933         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 934                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 935                 if (ret)
 936                         return ret;
 937         }
 938
 939         sk->sk_tsflags = val;
 940         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 941
 942         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 943                 sock_enable_timestamp(sk,
 944                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 945         else
 946                 sock_disable_timestamp(sk,
 947                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 948         return 0;
 949 }
 950
 951 void sock_set_keepalive(struct sock *sk)
 952 {
 953         lock_sock(sk);
 954         if (sk->sk_prot->keepalive)
 955                 sk->sk_prot->keepalive(sk, true);
 956         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 957         release_sock(sk);
 958 }
 959 EXPORT_SYMBOL(sock_set_keepalive);
 960
 961 static void __sock_set_rcvbuf(struct sock *sk, int val)
 962 {
 963         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 964          * as a negative value.
 965          */
 966         val = min_t(int, val, INT_MAX / 2);
 967         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 968
 969         /* We double it on the way in to account for "struct sk_buff" etc.
 970          * overhead.   Applications assume that the SO_RCVBUF setting they make
 971          * will allow that much actual data to be received on that socket.
 972          *
 973          * Applications are unaware that "struct sk_buff" and other overheads
 974          * allocate from the receive buffer during socket buffer allocation.
 975          *
 976          * And after considering the possible alternatives, returning the value
 977          * we actually used in getsockopt is the most desirable behavior.
 978          */
 979         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 980 }
 981
 982 void sock_set_rcvbuf(struct sock *sk, int val)
 983 {
 984         lock_sock(sk);
 985         __sock_set_rcvbuf(sk, val);
 986         release_sock(sk);
 987 }
 988 EXPORT_SYMBOL(sock_set_rcvbuf);
 989
 990 static void __sock_set_mark(struct sock *sk, u32 val)
 991 {
 992         if (val != sk->sk_mark) {
 993                 WRITE_ONCE(sk->sk_mark, val);
 994                 sk_dst_reset(sk);
 995         }
 996 }
 997
 998 void sock_set_mark(struct sock *sk, u32 val)
 999 {
1000         lock_sock(sk);
1001         __sock_set_mark(sk, val);
1002         release_sock(sk);
1003 }
1004 EXPORT_SYMBOL(sock_set_mark);
1005
1006 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1007 {
1008         /* Round down bytes to multiple of pages */
1009         bytes = round_down(bytes, PAGE_SIZE);
1010
1011         WARN_ON(bytes > sk->sk_reserved_mem);
1012         WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1013         sk_mem_reclaim(sk);
1014 }
1015
1016 static int sock_reserve_memory(struct sock *sk, int bytes)
1017 {
1018         long allocated;
1019         bool charged;
1020         int pages;
1021
1022         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1023                 return -EOPNOTSUPP;
1024
1025         if (!bytes)
1026                 return 0;
1027
1028         pages = sk_mem_pages(bytes);
1029
1030         /* pre-charge to memcg */
1031         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1032                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1033         if (!charged)
1034                 return -ENOMEM;
1035
1036         /* pre-charge to forward_alloc */
1037         sk_memory_allocated_add(sk, pages);
1038         allocated = sk_memory_allocated(sk);
1039         /* If the system goes into memory pressure with this
1040          * precharge, give up and return error.
1041          */
1042         if (allocated > sk_prot_mem_limits(sk, 1)) {
1043                 sk_memory_allocated_sub(sk, pages);
1044                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1045                 return -ENOMEM;
1046         }
1047         sk->sk_forward_alloc += pages << PAGE_SHIFT;
1048
1049         WRITE_ONCE(sk->sk_reserved_mem,
1050                    sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1051
1052         return 0;
1053 }
1054
/*
 * Take the socket lock for a setsockopt-style operation, unless the call
 * comes from a bpf program.
 *
 * When current->bpf_ctx is set, setsockopt() was invoked by a bpf prog and
 * bpf has already made sure the sk lock is held, so nothing is done.
 */
void sockopt_lock_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1067
/*
 * Counterpart of sockopt_lock_sock(): release the socket lock unless a bpf
 * context is current, in which case the lock was not taken by us.
 */
void sockopt_release_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1076
1077 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1078 {
1079         return has_current_bpf_ctx() || ns_capable(ns, cap);
1080 }
1081 EXPORT_SYMBOL(sockopt_ns_capable);
1082
1083 bool sockopt_capable(int cap)
1084 {
1085         return has_current_bpf_ctx() || capable(cap);
1086 }
1087 EXPORT_SYMBOL(sockopt_capable);
1088
1089 /*
1090  *      This is meant for all protocols to use and covers goings on
1091  *      at the socket level. Everything here is generic.
1092  */
1093
1094 int sk_setsockopt(struct sock *sk, int level, int optname,
1095                   sockptr_t optval, unsigned int optlen)
1096 {
1097         struct so_timestamping timestamping;
1098         struct socket *sock = sk->sk_socket;
1099         struct sock_txtime sk_txtime;
1100         int val;
1101         int valbool;
1102         struct linger ling;
1103         int ret = 0;
1104
1105         /*
1106          *      Options without arguments
1107          */
1108
1109         if (optname == SO_BINDTODEVICE)
1110                 return sock_setbindtodevice(sk, optval, optlen);
1111
1112         if (optlen < sizeof(int))
1113                 return -EINVAL;
1114
1115         if (copy_from_sockptr(&val, optval, sizeof(val)))
1116                 return -EFAULT;
1117
1118         valbool = val ? 1 : 0;
1119
1120         sockopt_lock_sock(sk);
1121
1122         switch (optname) {
1123         case SO_DEBUG:
1124                 if (val && !sockopt_capable(CAP_NET_ADMIN))
1125                         ret = -EACCES;
1126                 else
1127                         sock_valbool_flag(sk, SOCK_DBG, valbool);
1128                 break;
1129         case SO_REUSEADDR:
1130                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1131                 break;
1132         case SO_REUSEPORT:
1133                 sk->sk_reuseport = valbool;
1134                 break;
1135         case SO_TYPE:
1136         case SO_PROTOCOL:
1137         case SO_DOMAIN:
1138         case SO_ERROR:
1139                 ret = -ENOPROTOOPT;
1140                 break;
1141         case SO_DONTROUTE:
1142                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1143                 sk_dst_reset(sk);
1144                 break;
1145         case SO_BROADCAST:
1146                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1147                 break;
1148         case SO_SNDBUF:
1149                 /* Don't error on this BSD doesn't and if you think
1150                  * about it this is right. Otherwise apps have to
1151                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1152                  * are treated in BSD as hints
1153                  */
1154                 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1155 set_sndbuf:
1156                 /* Ensure val * 2 fits into an int, to prevent max_t()
1157                  * from treating it as a negative value.
1158                  */
1159                 val = min_t(int, val, INT_MAX / 2);
1160                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1161                 WRITE_ONCE(sk->sk_sndbuf,
1162                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1163                 /* Wake up sending tasks if we upped the value. */
1164                 sk->sk_write_space(sk);
1165                 break;
1166
1167         case SO_SNDBUFFORCE:
1168                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1169                         ret = -EPERM;
1170                         break;
1171                 }
1172
1173                 /* No negative values (to prevent underflow, as val will be
1174                  * multiplied by 2).
1175                  */
1176                 if (val < 0)
1177                         val = 0;
1178                 goto set_sndbuf;
1179
1180         case SO_RCVBUF:
1181                 /* Don't error on this BSD doesn't and if you think
1182                  * about it this is right. Otherwise apps have to
1183                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1184                  * are treated in BSD as hints
1185                  */
1186                 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1187                 break;
1188
1189         case SO_RCVBUFFORCE:
1190                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1191                         ret = -EPERM;
1192                         break;
1193                 }
1194
1195                 /* No negative values (to prevent underflow, as val will be
1196                  * multiplied by 2).
1197                  */
1198                 __sock_set_rcvbuf(sk, max(val, 0));
1199                 break;
1200
1201         case SO_KEEPALIVE:
1202                 if (sk->sk_prot->keepalive)
1203                         sk->sk_prot->keepalive(sk, valbool);
1204                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1205                 break;
1206
1207         case SO_OOBINLINE:
1208                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1209                 break;
1210
1211         case SO_NO_CHECK:
1212                 sk->sk_no_check_tx = valbool;
1213                 break;
1214
1215         case SO_PRIORITY:
1216                 if ((val >= 0 && val <= 6) ||
1217                     sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1218                     sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1219                         WRITE_ONCE(sk->sk_priority, val);
1220                 else
1221                         ret = -EPERM;
1222                 break;
1223
1224         case SO_LINGER:
1225                 if (optlen < sizeof(ling)) {
1226                         ret = -EINVAL;  /* 1003.1g */
1227                         break;
1228                 }
1229                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1230                         ret = -EFAULT;
1231                         break;
1232                 }
1233                 if (!ling.l_onoff) {
1234                         sock_reset_flag(sk, SOCK_LINGER);
1235                 } else {
1236                         unsigned long t_sec = ling.l_linger;
1237
1238                         if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1239                                 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1240                         else
1241                                 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1242                         sock_set_flag(sk, SOCK_LINGER);
1243                 }
1244                 break;
1245
1246         case SO_BSDCOMPAT:
1247                 break;
1248
1249         case SO_PASSCRED:
1250                 assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1251                 break;
1252
1253         case SO_PASSPIDFD:
1254                 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1255                 break;
1256
1257         case SO_TIMESTAMP_OLD:
1258         case SO_TIMESTAMP_NEW:
1259         case SO_TIMESTAMPNS_OLD:
1260         case SO_TIMESTAMPNS_NEW:
1261                 sock_set_timestamp(sk, optname, valbool);
1262                 break;
1263
1264         case SO_TIMESTAMPING_NEW:
1265         case SO_TIMESTAMPING_OLD:
1266                 if (optlen == sizeof(timestamping)) {
1267                         if (copy_from_sockptr(&timestamping, optval,
1268                                               sizeof(timestamping))) {
1269                                 ret = -EFAULT;
1270                                 break;
1271                         }
1272                 } else {
1273                         memset(&timestamping, 0, sizeof(timestamping));
1274                         timestamping.flags = val;
1275                 }
1276                 ret = sock_set_timestamping(sk, optname, timestamping);
1277                 break;
1278
1279         case SO_RCVLOWAT:
1280                 {
1281                 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1282
1283                 if (val < 0)
1284                         val = INT_MAX;
1285                 if (sock)
1286                         set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1287                 if (set_rcvlowat)
1288                         ret = set_rcvlowat(sk, val);
1289                 else
1290                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1291                 break;
1292                 }
1293         case SO_RCVTIMEO_OLD:
1294         case SO_RCVTIMEO_NEW:
1295                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1296                                        optlen, optname == SO_RCVTIMEO_OLD);
1297                 break;
1298
1299         case SO_SNDTIMEO_OLD:
1300         case SO_SNDTIMEO_NEW:
1301                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1302                                        optlen, optname == SO_SNDTIMEO_OLD);
1303                 break;
1304
1305         case SO_ATTACH_FILTER: {
1306                 struct sock_fprog fprog;
1307
1308                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1309                 if (!ret)
1310                         ret = sk_attach_filter(&fprog, sk);
1311                 break;
1312         }
1313         case SO_ATTACH_BPF:
1314                 ret = -EINVAL;
1315                 if (optlen == sizeof(u32)) {
1316                         u32 ufd;
1317
1318                         ret = -EFAULT;
1319                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1320                                 break;
1321
1322                         ret = sk_attach_bpf(ufd, sk);
1323                 }
1324                 break;
1325
1326         case SO_ATTACH_REUSEPORT_CBPF: {
1327                 struct sock_fprog fprog;
1328
1329                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1330                 if (!ret)
1331                         ret = sk_reuseport_attach_filter(&fprog, sk);
1332                 break;
1333         }
1334         case SO_ATTACH_REUSEPORT_EBPF:
1335                 ret = -EINVAL;
1336                 if (optlen == sizeof(u32)) {
1337                         u32 ufd;
1338
1339                         ret = -EFAULT;
1340                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1341                                 break;
1342
1343                         ret = sk_reuseport_attach_bpf(ufd, sk);
1344                 }
1345                 break;
1346
1347         case SO_DETACH_REUSEPORT_BPF:
1348                 ret = reuseport_detach_prog(sk);
1349                 break;
1350
1351         case SO_DETACH_FILTER:
1352                 ret = sk_detach_filter(sk);
1353                 break;
1354
1355         case SO_LOCK_FILTER:
1356                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1357                         ret = -EPERM;
1358                 else
1359                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1360                 break;
1361
1362         case SO_PASSSEC:
1363                 assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1364                 break;
1365         case SO_MARK:
1366                 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1367                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1368                         ret = -EPERM;
1369                         break;
1370                 }
1371
1372                 __sock_set_mark(sk, val);
1373                 break;
1374         case SO_RCVMARK:
1375                 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1376                 break;
1377
1378         case SO_RXQ_OVFL:
1379                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1380                 break;
1381
1382         case SO_WIFI_STATUS:
1383                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1384                 break;
1385
1386         case SO_PEEK_OFF:
1387                 {
1388                 int (*set_peek_off)(struct sock *sk, int val);
1389
1390                 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1391                 if (set_peek_off)
1392                         ret = set_peek_off(sk, val);
1393                 else
1394                         ret = -EOPNOTSUPP;
1395                 break;
1396                 }
1397
1398         case SO_NOFCS:
1399                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1400                 break;
1401
1402         case SO_SELECT_ERR_QUEUE:
1403                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1404                 break;
1405
1406 #ifdef CONFIG_NET_RX_BUSY_POLL
1407         case SO_BUSY_POLL:
1408                 if (val < 0)
1409                         ret = -EINVAL;
1410                 else
1411                         WRITE_ONCE(sk->sk_ll_usec, val);
1412                 break;
1413         case SO_PREFER_BUSY_POLL:
1414                 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1415                         ret = -EPERM;
1416                 else
1417                         WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1418                 break;
1419         case SO_BUSY_POLL_BUDGET:
1420                 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1421                         ret = -EPERM;
1422                 } else {
1423                         if (val < 0 || val > U16_MAX)
1424                                 ret = -EINVAL;
1425                         else
1426                                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1427                 }
1428                 break;
1429 #endif
1430
1431         case SO_MAX_PACING_RATE:
1432                 {
1433                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1434
1435                 if (sizeof(ulval) != sizeof(val) &&
1436                     optlen >= sizeof(ulval) &&
1437                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1438                         ret = -EFAULT;
1439                         break;
1440                 }
1441                 if (ulval != ~0UL)
1442                         cmpxchg(&sk->sk_pacing_status,
1443                                 SK_PACING_NONE,
1444                                 SK_PACING_NEEDED);
1445                 /* Pairs with READ_ONCE() from sk_getsockopt() */
1446                 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1447                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1448                 break;
1449                 }
1450         case SO_INCOMING_CPU:
1451                 reuseport_update_incoming_cpu(sk, val);
1452                 break;
1453
1454         case SO_CNX_ADVICE:
1455                 if (val == 1)
1456                         dst_negative_advice(sk);
1457                 break;
1458
1459         case SO_ZEROCOPY:
1460                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1461                         if (!(sk_is_tcp(sk) ||
1462                               (sk->sk_type == SOCK_DGRAM &&
1463                                sk->sk_protocol == IPPROTO_UDP)))
1464                                 ret = -EOPNOTSUPP;
1465                 } else if (sk->sk_family != PF_RDS) {
1466                         ret = -EOPNOTSUPP;
1467                 }
1468                 if (!ret) {
1469                         if (val < 0 || val > 1)
1470                                 ret = -EINVAL;
1471                         else
1472                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1473                 }
1474                 break;
1475
1476         case SO_TXTIME:
1477                 if (optlen != sizeof(struct sock_txtime)) {
1478                         ret = -EINVAL;
1479                         break;
1480                 } else if (copy_from_sockptr(&sk_txtime, optval,
1481                            sizeof(struct sock_txtime))) {
1482                         ret = -EFAULT;
1483                         break;
1484                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1485                         ret = -EINVAL;
1486                         break;
1487                 }
1488                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1489                  * scheduler has enough safe guards.
1490                  */
1491                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1492                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1493                         ret = -EPERM;
1494                         break;
1495                 }
1496                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1497                 sk->sk_clockid = sk_txtime.clockid;
1498                 sk->sk_txtime_deadline_mode =
1499                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1500                 sk->sk_txtime_report_errors =
1501                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1502                 break;
1503
1504         case SO_BINDTOIFINDEX:
1505                 ret = sock_bindtoindex_locked(sk, val);
1506                 break;
1507
1508         case SO_BUF_LOCK:
1509                 if (val & ~SOCK_BUF_LOCK_MASK) {
1510                         ret = -EINVAL;
1511                         break;
1512                 }
1513                 sk->sk_userlocks = val | (sk->sk_userlocks &
1514                                           ~SOCK_BUF_LOCK_MASK);
1515                 break;
1516
1517         case SO_RESERVE_MEM:
1518         {
1519                 int delta;
1520
1521                 if (val < 0) {
1522                         ret = -EINVAL;
1523                         break;
1524                 }
1525
1526                 delta = val - sk->sk_reserved_mem;
1527                 if (delta < 0)
1528                         sock_release_reserved_memory(sk, -delta);
1529                 else
1530                         ret = sock_reserve_memory(sk, delta);
1531                 break;
1532         }
1533
1534         case SO_TXREHASH:
1535                 if (val < -1 || val > 1) {
1536                         ret = -EINVAL;
1537                         break;
1538                 }
1539                 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1540                         val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1541                 /* Paired with READ_ONCE() in tcp_rtx_synack()
1542                  * and sk_getsockopt().
1543                  */
1544                 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1545                 break;
1546
1547         default:
1548                 ret = -ENOPROTOOPT;
1549                 break;
1550         }
1551         sockopt_release_sock(sk);
1552         return ret;
1553 }
1554
1555 int sock_setsockopt(struct socket *sock, int level, int optname,
1556                     sockptr_t optval, unsigned int optlen)
1557 {
1558         return sk_setsockopt(sock->sk, level, optname,
1559                              optval, optlen);
1560 }
1561 EXPORT_SYMBOL(sock_setsockopt);
1562
1563 static const struct cred *sk_get_peer_cred(struct sock *sk)
1564 {
1565         const struct cred *cred;
1566
1567         spin_lock(&sk->sk_peer_lock);
1568         cred = get_cred(sk->sk_peer_cred);
1569         spin_unlock(&sk->sk_peer_lock);
1570
1571         return cred;
1572 }
1573
1574 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1575                           struct ucred *ucred)
1576 {
1577         ucred->pid = pid_vnr(pid);
1578         ucred->uid = ucred->gid = -1;
1579         if (cred) {
1580                 struct user_namespace *current_ns = current_user_ns();
1581
1582                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1583                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1584         }
1585 }
1586
1587 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1588 {
1589         struct user_namespace *user_ns = current_user_ns();
1590         int i;
1591
1592         for (i = 0; i < src->ngroups; i++) {
1593                 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1594
1595                 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1596                         return -EFAULT;
1597         }
1598
1599         return 0;
1600 }
1601
1602 int sk_getsockopt(struct sock *sk, int level, int optname,
1603                   sockptr_t optval, sockptr_t optlen)
1604 {
1605         struct socket *sock = sk->sk_socket;
1606
1607         union {
1608                 int val;
1609                 u64 val64;
1610                 unsigned long ulval;
1611                 struct linger ling;
1612                 struct old_timeval32 tm32;
1613                 struct __kernel_old_timeval tm;
1614                 struct  __kernel_sock_timeval stm;
1615                 struct sock_txtime txtime;
1616                 struct so_timestamping timestamping;
1617         } v;
1618
1619         int lv = sizeof(int);
1620         int len;
1621
1622         if (copy_from_sockptr(&len, optlen, sizeof(int)))
1623                 return -EFAULT;
1624         if (len < 0)
1625                 return -EINVAL;
1626
1627         memset(&v, 0, sizeof(v));
1628
1629         switch (optname) {
1630         case SO_DEBUG:
1631                 v.val = sock_flag(sk, SOCK_DBG);
1632                 break;
1633
1634         case SO_DONTROUTE:
1635                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1636                 break;
1637
1638         case SO_BROADCAST:
1639                 v.val = sock_flag(sk, SOCK_BROADCAST);
1640                 break;
1641
1642         case SO_SNDBUF:
1643                 v.val = READ_ONCE(sk->sk_sndbuf);
1644                 break;
1645
1646         case SO_RCVBUF:
1647                 v.val = READ_ONCE(sk->sk_rcvbuf);
1648                 break;
1649
1650         case SO_REUSEADDR:
1651                 v.val = sk->sk_reuse;
1652                 break;
1653
1654         case SO_REUSEPORT:
1655                 v.val = sk->sk_reuseport;
1656                 break;
1657
1658         case SO_KEEPALIVE:
1659                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1660                 break;
1661
1662         case SO_TYPE:
1663                 v.val = sk->sk_type;
1664                 break;
1665
1666         case SO_PROTOCOL:
1667                 v.val = sk->sk_protocol;
1668                 break;
1669
1670         case SO_DOMAIN:
1671                 v.val = sk->sk_family;
1672                 break;
1673
1674         case SO_ERROR:
1675                 v.val = -sock_error(sk);
1676                 if (v.val == 0)
1677                         v.val = xchg(&sk->sk_err_soft, 0);
1678                 break;
1679
1680         case SO_OOBINLINE:
1681                 v.val = sock_flag(sk, SOCK_URGINLINE);
1682                 break;
1683
1684         case SO_NO_CHECK:
1685                 v.val = sk->sk_no_check_tx;
1686                 break;
1687
1688         case SO_PRIORITY:
1689                 v.val = READ_ONCE(sk->sk_priority);
1690                 break;
1691
1692         case SO_LINGER:
1693                 lv              = sizeof(v.ling);
1694                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1695                 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1696                 break;
1697
1698         case SO_BSDCOMPAT:
1699                 break;
1700
1701         case SO_TIMESTAMP_OLD:
1702                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1703                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1704                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1705                 break;
1706
1707         case SO_TIMESTAMPNS_OLD:
1708                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1709                 break;
1710
1711         case SO_TIMESTAMP_NEW:
1712                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1713                 break;
1714
1715         case SO_TIMESTAMPNS_NEW:
1716                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1717                 break;
1718
1719         case SO_TIMESTAMPING_OLD:
1720                 lv = sizeof(v.timestamping);
1721                 v.timestamping.flags = sk->sk_tsflags;
1722                 v.timestamping.bind_phc = sk->sk_bind_phc;
1723                 break;
1724
1725         case SO_RCVTIMEO_OLD:
1726         case SO_RCVTIMEO_NEW:
1727                 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1728                                       SO_RCVTIMEO_OLD == optname);
1729                 break;
1730
1731         case SO_SNDTIMEO_OLD:
1732         case SO_SNDTIMEO_NEW:
1733                 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1734                                       SO_SNDTIMEO_OLD == optname);
1735                 break;
1736
1737         case SO_RCVLOWAT:
1738                 v.val = READ_ONCE(sk->sk_rcvlowat);
1739                 break;
1740
1741         case SO_SNDLOWAT:
1742                 v.val = 1;
1743                 break;
1744
1745         case SO_PASSCRED:
1746                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1747                 break;
1748
1749         case SO_PASSPIDFD:
1750                 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1751                 break;
1752
1753         case SO_PEERCRED:
1754         {
1755                 struct ucred peercred;
1756                 if (len > sizeof(peercred))
1757                         len = sizeof(peercred);
1758
1759                 spin_lock(&sk->sk_peer_lock);
1760                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1761                 spin_unlock(&sk->sk_peer_lock);
1762
1763                 if (copy_to_sockptr(optval, &peercred, len))
1764                         return -EFAULT;
1765                 goto lenout;
1766         }
1767
1768         case SO_PEERPIDFD:
1769         {
1770                 struct pid *peer_pid;
1771                 struct file *pidfd_file = NULL;
1772                 int pidfd;
1773
1774                 if (len > sizeof(pidfd))
1775                         len = sizeof(pidfd);
1776
1777                 spin_lock(&sk->sk_peer_lock);
1778                 peer_pid = get_pid(sk->sk_peer_pid);
1779                 spin_unlock(&sk->sk_peer_lock);
1780
1781                 if (!peer_pid)
1782                         return -ENODATA;
1783
1784                 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1785                 put_pid(peer_pid);
1786                 if (pidfd < 0)
1787                         return pidfd;
1788
1789                 if (copy_to_sockptr(optval, &pidfd, len) ||
1790                     copy_to_sockptr(optlen, &len, sizeof(int))) {
1791                         put_unused_fd(pidfd);
1792                         fput(pidfd_file);
1793
1794                         return -EFAULT;
1795                 }
1796
1797                 fd_install(pidfd, pidfd_file);
1798                 return 0;
1799         }
1800
1801         case SO_PEERGROUPS:
1802         {
1803                 const struct cred *cred;
1804                 int ret, n;
1805
1806                 cred = sk_get_peer_cred(sk);
1807                 if (!cred)
1808                         return -ENODATA;
1809
1810                 n = cred->group_info->ngroups;
1811                 if (len < n * sizeof(gid_t)) {
1812                         len = n * sizeof(gid_t);
1813                         put_cred(cred);
1814                         return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1815                 }
1816                 len = n * sizeof(gid_t);
1817
1818                 ret = groups_to_user(optval, cred->group_info);
1819                 put_cred(cred);
1820                 if (ret)
1821                         return ret;
1822                 goto lenout;
1823         }
1824
1825         case SO_PEERNAME:
1826         {
1827                 struct sockaddr_storage address;
1828
1829                 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1830                 if (lv < 0)
1831                         return -ENOTCONN;
1832                 if (lv < len)
1833                         return -EINVAL;
1834                 if (copy_to_sockptr(optval, &address, len))
1835                         return -EFAULT;
1836                 goto lenout;
1837         }
1838
1839         /* Dubious BSD thing... Probably nobody even uses it, but
1840          * the UNIX standard wants it for whatever reason... -DaveM
1841          */
1842         case SO_ACCEPTCONN:
1843                 v.val = sk->sk_state == TCP_LISTEN;
1844                 break;
1845
1846         case SO_PASSSEC:
1847                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1848                 break;
1849
1850         case SO_PEERSEC:
1851                 return security_socket_getpeersec_stream(sock,
1852                                                          optval, optlen, len);
1853
1854         case SO_MARK:
1855                 v.val = READ_ONCE(sk->sk_mark);
1856                 break;
1857
1858         case SO_RCVMARK:
1859                 v.val = sock_flag(sk, SOCK_RCVMARK);
1860                 break;
1861
1862         case SO_RXQ_OVFL:
1863                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1864                 break;
1865
1866         case SO_WIFI_STATUS:
1867                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1868                 break;
1869
1870         case SO_PEEK_OFF:
1871                 if (!READ_ONCE(sock->ops)->set_peek_off)
1872                         return -EOPNOTSUPP;
1873
1874                 v.val = READ_ONCE(sk->sk_peek_off);
1875                 break;
1876         case SO_NOFCS:
1877                 v.val = sock_flag(sk, SOCK_NOFCS);
1878                 break;
1879
1880         case SO_BINDTODEVICE:
1881                 return sock_getbindtodevice(sk, optval, optlen, len);
1882
1883         case SO_GET_FILTER:
1884                 len = sk_get_filter(sk, optval, len);
1885                 if (len < 0)
1886                         return len;
1887
1888                 goto lenout;
1889
1890         case SO_LOCK_FILTER:
1891                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1892                 break;
1893
1894         case SO_BPF_EXTENSIONS:
1895                 v.val = bpf_tell_extensions();
1896                 break;
1897
1898         case SO_SELECT_ERR_QUEUE:
1899                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1900                 break;
1901
1902 #ifdef CONFIG_NET_RX_BUSY_POLL
1903         case SO_BUSY_POLL:
1904                 v.val = READ_ONCE(sk->sk_ll_usec);
1905                 break;
1906         case SO_PREFER_BUSY_POLL:
1907                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1908                 break;
1909 #endif
1910
1911         case SO_MAX_PACING_RATE:
1912                 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1913                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1914                         lv = sizeof(v.ulval);
1915                         v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1916                 } else {
1917                         /* 32bit version */
1918                         v.val = min_t(unsigned long, ~0U,
1919                                       READ_ONCE(sk->sk_max_pacing_rate));
1920                 }
1921                 break;
1922
1923         case SO_INCOMING_CPU:
1924                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1925                 break;
1926
1927         case SO_MEMINFO:
1928         {
1929                 u32 meminfo[SK_MEMINFO_VARS];
1930
1931                 sk_get_meminfo(sk, meminfo);
1932
1933                 len = min_t(unsigned int, len, sizeof(meminfo));
1934                 if (copy_to_sockptr(optval, &meminfo, len))
1935                         return -EFAULT;
1936
1937                 goto lenout;
1938         }
1939
1940 #ifdef CONFIG_NET_RX_BUSY_POLL
1941         case SO_INCOMING_NAPI_ID:
1942                 v.val = READ_ONCE(sk->sk_napi_id);
1943
1944                 /* aggregate non-NAPI IDs down to 0 */
1945                 if (v.val < MIN_NAPI_ID)
1946                         v.val = 0;
1947
1948                 break;
1949 #endif
1950
1951         case SO_COOKIE:
1952                 lv = sizeof(u64);
1953                 if (len < lv)
1954                         return -EINVAL;
1955                 v.val64 = sock_gen_cookie(sk);
1956                 break;
1957
1958         case SO_ZEROCOPY:
1959                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1960                 break;
1961
1962         case SO_TXTIME:
1963                 lv = sizeof(v.txtime);
1964                 v.txtime.clockid = sk->sk_clockid;
1965                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1966                                   SOF_TXTIME_DEADLINE_MODE : 0;
1967                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1968                                   SOF_TXTIME_REPORT_ERRORS : 0;
1969                 break;
1970
1971         case SO_BINDTOIFINDEX:
1972                 v.val = READ_ONCE(sk->sk_bound_dev_if);
1973                 break;
1974
1975         case SO_NETNS_COOKIE:
1976                 lv = sizeof(u64);
1977                 if (len != lv)
1978                         return -EINVAL;
1979                 v.val64 = sock_net(sk)->net_cookie;
1980                 break;
1981
1982         case SO_BUF_LOCK:
1983                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1984                 break;
1985
1986         case SO_RESERVE_MEM:
1987                 v.val = READ_ONCE(sk->sk_reserved_mem);
1988                 break;
1989
1990         case SO_TXREHASH:
1991                 /* Paired with WRITE_ONCE() in sk_setsockopt() */
1992                 v.val = READ_ONCE(sk->sk_txrehash);
1993                 break;
1994
1995         default:
1996                 /* We implement the SO_SNDLOWAT etc to not be settable
1997                  * (1003.1g 7).
1998                  */
1999                 return -ENOPROTOOPT;
2000         }
2001
2002         if (len > lv)
2003                 len = lv;
2004         if (copy_to_sockptr(optval, &v, len))
2005                 return -EFAULT;
2006 lenout:
2007         if (copy_to_sockptr(optlen, &len, sizeof(int)))
2008                 return -EFAULT;
2009         return 0;
2010 }
2011
2012 int sock_getsockopt(struct socket *sock, int level, int optname,
2013                     char __user *optval, int __user *optlen)
2014 {
2015         return sk_getsockopt(sock->sk, level, optname,
2016                              USER_SOCKPTR(optval),
2017                              USER_SOCKPTR(optlen));
2018 }
2019
2020 /*
2021  * Initialize an sk_lock.
2022  *
2023  * (We also register the sk_lock with the lock validator.)
2024  */
2025 static inline void sock_lock_init(struct sock *sk)
2026 {
2027         if (sk->sk_kern_sock)
2028                 sock_lock_init_class_and_name(
2029                         sk,
2030                         af_family_kern_slock_key_strings[sk->sk_family],
2031                         af_family_kern_slock_keys + sk->sk_family,
2032                         af_family_kern_key_strings[sk->sk_family],
2033                         af_family_kern_keys + sk->sk_family);
2034         else
2035                 sock_lock_init_class_and_name(
2036                         sk,
2037                         af_family_slock_key_strings[sk->sk_family],
2038                         af_family_slock_keys + sk->sk_family,
2039                         af_family_key_strings[sk->sk_family],
2040                         af_family_keys + sk->sk_family);
2041 }
2042
2043 /*
2044  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2045  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2046  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2047  */
2048 static void sock_copy(struct sock *nsk, const struct sock *osk)
2049 {
2050         const struct proto *prot = READ_ONCE(osk->sk_prot);
2051 #ifdef CONFIG_SECURITY_NETWORK
2052         void *sptr = nsk->sk_security;
2053 #endif
2054
2055         /* If we move sk_tx_queue_mapping out of the private section,
2056          * we must check if sk_tx_queue_clear() is called after
2057          * sock_copy() in sk_clone_lock().
2058          */
2059         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2060                      offsetof(struct sock, sk_dontcopy_begin) ||
2061                      offsetof(struct sock, sk_tx_queue_mapping) >=
2062                      offsetof(struct sock, sk_dontcopy_end));
2063
2064         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2065
2066         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2067                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2068
2069 #ifdef CONFIG_SECURITY_NETWORK
2070         nsk->sk_security = sptr;
2071         security_sk_clone(osk, nsk);
2072 #endif
2073 }
2074
2075 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2076                 int family)
2077 {
2078         struct sock *sk;
2079         struct kmem_cache *slab;
2080
2081         slab = prot->slab;
2082         if (slab != NULL) {
2083                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2084                 if (!sk)
2085                         return sk;
2086                 if (want_init_on_alloc(priority))
2087                         sk_prot_clear_nulls(sk, prot->obj_size);
2088         } else
2089                 sk = kmalloc(prot->obj_size, priority);
2090
2091         if (sk != NULL) {
2092                 if (security_sk_alloc(sk, family, priority))
2093                         goto out_free;
2094
2095                 if (!try_module_get(prot->owner))
2096                         goto out_free_sec;
2097         }
2098
2099         return sk;
2100
2101 out_free_sec:
2102         security_sk_free(sk);
2103 out_free:
2104         if (slab != NULL)
2105                 kmem_cache_free(slab, sk);
2106         else
2107                 kfree(sk);
2108         return NULL;
2109 }
2110
2111 static void sk_prot_free(struct proto *prot, struct sock *sk)
2112 {
2113         struct kmem_cache *slab;
2114         struct module *owner;
2115
2116         owner = prot->owner;
2117         slab = prot->slab;
2118
2119         cgroup_sk_free(&sk->sk_cgrp_data);
2120         mem_cgroup_sk_free(sk);
2121         security_sk_free(sk);
2122         if (slab != NULL)
2123                 kmem_cache_free(slab, sk);
2124         else
2125                 kfree(sk);
2126         module_put(owner);
2127 }
2128
2129 /**
2130  *      sk_alloc - All socket objects are allocated here
2131  *      @net: the applicable net namespace
2132  *      @family: protocol family
2133  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2134  *      @prot: struct proto associated with this new sock instance
2135  *      @kern: is this to be a kernel socket?
2136  */
2137 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2138                       struct proto *prot, int kern)
2139 {
2140         struct sock *sk;
2141
2142         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2143         if (sk) {
2144                 sk->sk_family = family;
2145                 /*
2146                  * See comment in struct sock definition to understand
2147                  * why we need sk_prot_creator -acme
2148                  */
2149                 sk->sk_prot = sk->sk_prot_creator = prot;
2150                 sk->sk_kern_sock = kern;
2151                 sock_lock_init(sk);
2152                 sk->sk_net_refcnt = kern ? 0 : 1;
2153                 if (likely(sk->sk_net_refcnt)) {
2154                         get_net_track(net, &sk->ns_tracker, priority);
2155                         sock_inuse_add(net, 1);
2156                 } else {
2157                         __netns_tracker_alloc(net, &sk->ns_tracker,
2158                                               false, priority);
2159                 }
2160
2161                 sock_net_set(sk, net);
2162                 refcount_set(&sk->sk_wmem_alloc, 1);
2163
2164                 mem_cgroup_sk_alloc(sk);
2165                 cgroup_sk_alloc(&sk->sk_cgrp_data);
2166                 sock_update_classid(&sk->sk_cgrp_data);
2167                 sock_update_netprioidx(&sk->sk_cgrp_data);
2168                 sk_tx_queue_clear(sk);
2169         }
2170
2171         return sk;
2172 }
2173 EXPORT_SYMBOL(sk_alloc);
2174
2175 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2176  * grace period. This is the case for UDP sockets and TCP listeners.
2177  */
2178 static void __sk_destruct(struct rcu_head *head)
2179 {
2180         struct sock *sk = container_of(head, struct sock, sk_rcu);
2181         struct sk_filter *filter;
2182
2183         if (sk->sk_destruct)
2184                 sk->sk_destruct(sk);
2185
2186         filter = rcu_dereference_check(sk->sk_filter,
2187                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2188         if (filter) {
2189                 sk_filter_uncharge(sk, filter);
2190                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2191         }
2192
2193         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2194
2195 #ifdef CONFIG_BPF_SYSCALL
2196         bpf_sk_storage_free(sk);
2197 #endif
2198
2199         if (atomic_read(&sk->sk_omem_alloc))
2200                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2201                          __func__, atomic_read(&sk->sk_omem_alloc));
2202
2203         if (sk->sk_frag.page) {
2204                 put_page(sk->sk_frag.page);
2205                 sk->sk_frag.page = NULL;
2206         }
2207
2208         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2209         put_cred(sk->sk_peer_cred);
2210         put_pid(sk->sk_peer_pid);
2211
2212         if (likely(sk->sk_net_refcnt))
2213                 put_net_track(sock_net(sk), &sk->ns_tracker);
2214         else
2215                 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2216
2217         sk_prot_free(sk->sk_prot_creator, sk);
2218 }
2219
2220 void sk_destruct(struct sock *sk)
2221 {
2222         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2223
2224         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2225                 reuseport_detach_sock(sk);
2226                 use_call_rcu = true;
2227         }
2228
2229         if (use_call_rcu)
2230                 call_rcu(&sk->sk_rcu, __sk_destruct);
2231         else
2232                 __sk_destruct(&sk->sk_rcu);
2233 }
2234
2235 static void __sk_free(struct sock *sk)
2236 {
2237         if (likely(sk->sk_net_refcnt))
2238                 sock_inuse_add(sock_net(sk), -1);
2239
2240         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2241                 sock_diag_broadcast_destroy(sk);
2242         else
2243                 sk_destruct(sk);
2244 }
2245
2246 void sk_free(struct sock *sk)
2247 {
2248         /*
2249          * We subtract one from sk_wmem_alloc and can know if
2250          * some packets are still in some tx queue.
2251          * If not null, sock_wfree() will call __sk_free(sk) later
2252          */
2253         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2254                 __sk_free(sk);
2255 }
2256 EXPORT_SYMBOL(sk_free);
2257
2258 static void sk_init_common(struct sock *sk)
2259 {
2260         skb_queue_head_init(&sk->sk_receive_queue);
2261         skb_queue_head_init(&sk->sk_write_queue);
2262         skb_queue_head_init(&sk->sk_error_queue);
2263
2264         rwlock_init(&sk->sk_callback_lock);
2265         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2266                         af_rlock_keys + sk->sk_family,
2267                         af_family_rlock_key_strings[sk->sk_family]);
2268         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2269                         af_wlock_keys + sk->sk_family,
2270                         af_family_wlock_key_strings[sk->sk_family]);
2271         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2272                         af_elock_keys + sk->sk_family,
2273                         af_family_elock_key_strings[sk->sk_family]);
2274         lockdep_set_class_and_name(&sk->sk_callback_lock,
2275                         af_callback_keys + sk->sk_family,
2276                         af_family_clock_key_strings[sk->sk_family]);
2277 }
2278
2279 /**
2280  *      sk_clone_lock - clone a socket, and lock its clone
2281  *      @sk: the socket to clone
2282  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2283  *
2284  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2285  */
2286 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2287 {
2288         struct proto *prot = READ_ONCE(sk->sk_prot);
2289         struct sk_filter *filter;
2290         bool is_charged = true;
2291         struct sock *newsk;
2292
2293         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2294         if (!newsk)
2295                 goto out;
2296
2297         sock_copy(newsk, sk);
2298
2299         newsk->sk_prot_creator = prot;
2300
2301         /* SANITY */
2302         if (likely(newsk->sk_net_refcnt)) {
2303                 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2304                 sock_inuse_add(sock_net(newsk), 1);
2305         } else {
2306                 /* Kernel sockets are not elevating the struct net refcount.
2307                  * Instead, use a tracker to more easily detect if a layer
2308                  * is not properly dismantling its kernel sockets at netns
2309                  * destroy time.
2310                  */
2311                 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2312                                       false, priority);
2313         }
2314         sk_node_init(&newsk->sk_node);
2315         sock_lock_init(newsk);
2316         bh_lock_sock(newsk);
2317         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2318         newsk->sk_backlog.len = 0;
2319
2320         atomic_set(&newsk->sk_rmem_alloc, 0);
2321
2322         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2323         refcount_set(&newsk->sk_wmem_alloc, 1);
2324
2325         atomic_set(&newsk->sk_omem_alloc, 0);
2326         sk_init_common(newsk);
2327
2328         newsk->sk_dst_cache     = NULL;
2329         newsk->sk_dst_pending_confirm = 0;
2330         newsk->sk_wmem_queued   = 0;
2331         newsk->sk_forward_alloc = 0;
2332         newsk->sk_reserved_mem  = 0;
2333         atomic_set(&newsk->sk_drops, 0);
2334         newsk->sk_send_head     = NULL;
2335         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2336         atomic_set(&newsk->sk_zckey, 0);
2337
2338         sock_reset_flag(newsk, SOCK_DONE);
2339
2340         /* sk->sk_memcg will be populated at accept() time */
2341         newsk->sk_memcg = NULL;
2342
2343         cgroup_sk_clone(&newsk->sk_cgrp_data);
2344
2345         rcu_read_lock();
2346         filter = rcu_dereference(sk->sk_filter);
2347         if (filter != NULL)
2348                 /* though it's an empty new sock, the charging may fail
2349                  * if sysctl_optmem_max was changed between creation of
2350                  * original socket and cloning
2351                  */
2352                 is_charged = sk_filter_charge(newsk, filter);
2353         RCU_INIT_POINTER(newsk->sk_filter, filter);
2354         rcu_read_unlock();
2355
2356         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2357                 /* We need to make sure that we don't uncharge the new
2358                  * socket if we couldn't charge it in the first place
2359                  * as otherwise we uncharge the parent's filter.
2360                  */
2361                 if (!is_charged)
2362                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2363                 sk_free_unlock_clone(newsk);
2364                 newsk = NULL;
2365                 goto out;
2366         }
2367         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2368
2369         if (bpf_sk_storage_clone(sk, newsk)) {
2370                 sk_free_unlock_clone(newsk);
2371                 newsk = NULL;
2372                 goto out;
2373         }
2374
2375         /* Clear sk_user_data if parent had the pointer tagged
2376          * as not suitable for copying when cloning.
2377          */
2378         if (sk_user_data_is_nocopy(newsk))
2379                 newsk->sk_user_data = NULL;
2380
2381         newsk->sk_err      = 0;
2382         newsk->sk_err_soft = 0;
2383         newsk->sk_priority = 0;
2384         newsk->sk_incoming_cpu = raw_smp_processor_id();
2385
2386         /* Before updating sk_refcnt, we must commit prior changes to memory
2387          * (Documentation/RCU/rculist_nulls.rst for details)
2388          */
2389         smp_wmb();
2390         refcount_set(&newsk->sk_refcnt, 2);
2391
2392         sk_set_socket(newsk, NULL);
2393         sk_tx_queue_clear(newsk);
2394         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2395
2396         if (newsk->sk_prot->sockets_allocated)
2397                 sk_sockets_allocated_inc(newsk);
2398
2399         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2400                 net_enable_timestamp();
2401 out:
2402         return newsk;
2403 }
2404 EXPORT_SYMBOL_GPL(sk_clone_lock);
2405
2406 void sk_free_unlock_clone(struct sock *sk)
2407 {
2408         /* It is still raw copy of parent, so invalidate
2409          * destructor and make plain sk_free() */
2410         sk->sk_destruct = NULL;
2411         bh_unlock_sock(sk);
2412         sk_free(sk);
2413 }
2414 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2415
2416 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2417 {
2418         bool is_ipv6 = false;
2419         u32 max_size;
2420
2421 #if IS_ENABLED(CONFIG_IPV6)
2422         is_ipv6 = (sk->sk_family == AF_INET6 &&
2423                    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2424 #endif
2425         /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2426         max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2427                         READ_ONCE(dst->dev->gso_ipv4_max_size);
2428         if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2429                 max_size = GSO_LEGACY_MAX_SIZE;
2430
2431         return max_size - (MAX_TCP_HEADER + 1);
2432 }
2433
2434 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2435 {
2436         u32 max_segs = 1;
2437
2438         sk->sk_route_caps = dst->dev->features;
2439         if (sk_is_tcp(sk))
2440                 sk->sk_route_caps |= NETIF_F_GSO;
2441         if (sk->sk_route_caps & NETIF_F_GSO)
2442                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2443         if (unlikely(sk->sk_gso_disabled))
2444                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2445         if (sk_can_gso(sk)) {
2446                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2447                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2448                 } else {
2449                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2450                         sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2451                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2452                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2453                 }
2454         }
2455         sk->sk_gso_max_segs = max_segs;
2456         sk_dst_set(sk, dst);
2457 }
2458 EXPORT_SYMBOL_GPL(sk_setup_caps);
2459
2460 /*
2461  *      Simple resource managers for sockets.
2462  */
2463
2464
2465 /*
2466  * Write buffer destructor automatically called from kfree_skb.
2467  */
2468 void sock_wfree(struct sk_buff *skb)
2469 {
2470         struct sock *sk = skb->sk;
2471         unsigned int len = skb->truesize;
2472         bool free;
2473
2474         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2475                 if (sock_flag(sk, SOCK_RCU_FREE) &&
2476                     sk->sk_write_space == sock_def_write_space) {
2477                         rcu_read_lock();
2478                         free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2479                         sock_def_write_space_wfree(sk);
2480                         rcu_read_unlock();
2481                         if (unlikely(free))
2482                                 __sk_free(sk);
2483                         return;
2484                 }
2485
2486                 /*
2487                  * Keep a reference on sk_wmem_alloc, this will be released
2488                  * after sk_write_space() call
2489                  */
2490                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2491                 sk->sk_write_space(sk);
2492                 len = 1;
2493         }
2494         /*
2495          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2496          * could not do because of in-flight packets
2497          */
2498         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2499                 __sk_free(sk);
2500 }
2501 EXPORT_SYMBOL(sock_wfree);
2502
2503 /* This variant of sock_wfree() is used by TCP,
2504  * since it sets SOCK_USE_WRITE_QUEUE.
2505  */
2506 void __sock_wfree(struct sk_buff *skb)
2507 {
2508         struct sock *sk = skb->sk;
2509
2510         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2511                 __sk_free(sk);
2512 }
2513
2514 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2515 {
2516         skb_orphan(skb);
2517         skb->sk = sk;
2518 #ifdef CONFIG_INET
2519         if (unlikely(!sk_fullsock(sk))) {
2520                 skb->destructor = sock_edemux;
2521                 sock_hold(sk);
2522                 return;
2523         }
2524 #endif
2525         skb->destructor = sock_wfree;
2526         skb_set_hash_from_sk(skb, sk);
2527         /*
2528          * We used to take a refcount on sk, but following operation
2529          * is enough to guarantee sk_free() wont free this sock until
2530          * all in-flight packets are completed
2531          */
2532         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2533 }
2534 EXPORT_SYMBOL(skb_set_owner_w);
2535
2536 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2537 {
2538 #ifdef CONFIG_TLS_DEVICE
2539         /* Drivers depend on in-order delivery for crypto offload,
2540          * partial orphan breaks out-of-order-OK logic.
2541          */
2542         if (skb->decrypted)
2543                 return false;
2544 #endif
2545         return (skb->destructor == sock_wfree ||
2546                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2547 }
2548
2549 /* This helper is used by netem, as it can hold packets in its
2550  * delay queue. We want to allow the owner socket to send more
2551  * packets, as if they were already TX completed by a typical driver.
2552  * But we also want to keep skb->sk set because some packet schedulers
2553  * rely on it (sch_fq for example).
2554  */
2555 void skb_orphan_partial(struct sk_buff *skb)
2556 {
2557         if (skb_is_tcp_pure_ack(skb))
2558                 return;
2559
2560         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2561                 return;
2562
2563         skb_orphan(skb);
2564 }
2565 EXPORT_SYMBOL(skb_orphan_partial);
2566
2567 /*
2568  * Read buffer destructor automatically called from kfree_skb.
2569  */
2570 void sock_rfree(struct sk_buff *skb)
2571 {
2572         struct sock *sk = skb->sk;
2573         unsigned int len = skb->truesize;
2574
2575         atomic_sub(len, &sk->sk_rmem_alloc);
2576         sk_mem_uncharge(sk, len);
2577 }
2578 EXPORT_SYMBOL(sock_rfree);
2579
2580 /*
2581  * Buffer destructor for skbs that are not used directly in read or write
2582  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2583  */
2584 void sock_efree(struct sk_buff *skb)
2585 {
2586         sock_put(skb->sk);
2587 }
2588 EXPORT_SYMBOL(sock_efree);
2589
/* Destructor for the prefetch/receive path, where the skb may point at a
 * socket without holding a reference on it (e.g. listen sockets).  The
 * socket is only released when sk_is_refcounted() says a reference is held.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	if (!sk_is_refcounted(skb->sk))
		return;

	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2601
2602 kuid_t sock_i_uid(struct sock *sk)
2603 {
2604         kuid_t uid;
2605
2606         read_lock_bh(&sk->sk_callback_lock);
2607         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2608         read_unlock_bh(&sk->sk_callback_lock);
2609         return uid;
2610 }
2611 EXPORT_SYMBOL(sock_i_uid);
2612
2613 unsigned long __sock_i_ino(struct sock *sk)
2614 {
2615         unsigned long ino;
2616
2617         read_lock(&sk->sk_callback_lock);
2618         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2619         read_unlock(&sk->sk_callback_lock);
2620         return ino;
2621 }
2622 EXPORT_SYMBOL(__sock_i_ino);
2623
/* BH-safe wrapper around __sock_i_ino(): bottom halves are disabled for
 * the duration of the lookup.
 */
unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long inode_nr;

	local_bh_disable();
	inode_nr = __sock_i_ino(sk);
	local_bh_enable();

	return inode_nr;
}
EXPORT_SYMBOL(sock_i_ino);
2634
2635 /*
2636  * Allocate a skb from the socket's send buffer.
2637  */
2638 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2639                              gfp_t priority)
2640 {
2641         if (force ||
2642             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2643                 struct sk_buff *skb = alloc_skb(size, priority);
2644
2645                 if (skb) {
2646                         skb_set_owner_w(skb, sk);
2647                         return skb;
2648                 }
2649         }
2650         return NULL;
2651 }
2652 EXPORT_SYMBOL(sock_wmalloc);
2653
2654 static void sock_ofree(struct sk_buff *skb)
2655 {
2656         struct sock *sk = skb->sk;
2657
2658         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2659 }
2660
2661 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2662                              gfp_t priority)
2663 {
2664         struct sk_buff *skb;
2665
2666         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2667         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2668             READ_ONCE(sysctl_optmem_max))
2669                 return NULL;
2670
2671         skb = alloc_skb(size, priority);
2672         if (!skb)
2673                 return NULL;
2674
2675         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2676         skb->sk = sk;
2677         skb->destructor = sock_ofree;
2678         return skb;
2679 }
2680
2681 /*
2682  * Allocate a memory block from the socket's option memory buffer.
2683  */
2684 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2685 {
2686         int optmem_max = READ_ONCE(sysctl_optmem_max);
2687
2688         if ((unsigned int)size <= optmem_max &&
2689             atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2690                 void *mem;
2691                 /* First do the add, to avoid the race if kmalloc
2692                  * might sleep.
2693                  */
2694                 atomic_add(size, &sk->sk_omem_alloc);
2695                 mem = kmalloc(size, priority);
2696                 if (mem)
2697                         return mem;
2698                 atomic_sub(size, &sk->sk_omem_alloc);
2699         }
2700         return NULL;
2701 }
2702 EXPORT_SYMBOL(sock_kmalloc);
2703
2704 /* Free an option memory block. Note, we actually want the inline
2705  * here as this allows gcc to detect the nullify and fold away the
2706  * condition entirely.
2707  */
2708 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2709                                   const bool nullify)
2710 {
2711         if (WARN_ON_ONCE(!mem))
2712                 return;
2713         if (nullify)
2714                 kfree_sensitive(mem);
2715         else
2716                 kfree(mem);
2717         atomic_sub(size, &sk->sk_omem_alloc);
2718 }
2719
2720 void sock_kfree_s(struct sock *sk, void *mem, int size)
2721 {
2722         __sock_kfree_s(sk, mem, size, false);
2723 }
2724 EXPORT_SYMBOL(sock_kfree_s);
2725
2726 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2727 {
2728         __sock_kfree_s(sk, mem, size, true);
2729 }
2730 EXPORT_SYMBOL(sock_kzfree_s);
2731
2732 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2733    I think, these locks should be removed for datagram sockets.
2734  */
2735 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2736 {
2737         DEFINE_WAIT(wait);
2738
2739         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2740         for (;;) {
2741                 if (!timeo)
2742                         break;
2743                 if (signal_pending(current))
2744                         break;
2745                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2746                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2747                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2748                         break;
2749                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2750                         break;
2751                 if (sk->sk_err)
2752                         break;
2753                 timeo = schedule_timeout(timeo);
2754         }
2755         finish_wait(sk_sleep(sk), &wait);
2756         return timeo;
2757 }
2758
2759
2760 /*
2761  *      Generic send/receive buffer handlers
2762  */
2763
2764 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2765                                      unsigned long data_len, int noblock,
2766                                      int *errcode, int max_page_order)
2767 {
2768         struct sk_buff *skb;
2769         long timeo;
2770         int err;
2771
2772         timeo = sock_sndtimeo(sk, noblock);
2773         for (;;) {
2774                 err = sock_error(sk);
2775                 if (err != 0)
2776                         goto failure;
2777
2778                 err = -EPIPE;
2779                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2780                         goto failure;
2781
2782                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2783                         break;
2784
2785                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2786                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2787                 err = -EAGAIN;
2788                 if (!timeo)
2789                         goto failure;
2790                 if (signal_pending(current))
2791                         goto interrupted;
2792                 timeo = sock_wait_for_wmem(sk, timeo);
2793         }
2794         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2795                                    errcode, sk->sk_allocation);
2796         if (skb)
2797                 skb_set_owner_w(skb, sk);
2798         return skb;
2799
2800 interrupted:
2801         err = sock_intr_errno(timeo);
2802 failure:
2803         *errcode = err;
2804         return NULL;
2805 }
2806 EXPORT_SYMBOL(sock_alloc_send_pskb);
2807
2808 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2809                      struct sockcm_cookie *sockc)
2810 {
2811         u32 tsflags;
2812
2813         switch (cmsg->cmsg_type) {
2814         case SO_MARK:
2815                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2816                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2817                         return -EPERM;
2818                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2819                         return -EINVAL;
2820                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2821                 break;
2822         case SO_TIMESTAMPING_OLD:
2823                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2824                         return -EINVAL;
2825
2826                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2827                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2828                         return -EINVAL;
2829
2830                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2831                 sockc->tsflags |= tsflags;
2832                 break;
2833         case SCM_TXTIME:
2834                 if (!sock_flag(sk, SOCK_TXTIME))
2835                         return -EINVAL;
2836                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2837                         return -EINVAL;
2838                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2839                 break;
2840         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2841         case SCM_RIGHTS:
2842         case SCM_CREDENTIALS:
2843                 break;
2844         default:
2845                 return -EINVAL;
2846         }
2847         return 0;
2848 }
2849 EXPORT_SYMBOL(__sock_cmsg_send);
2850
2851 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2852                    struct sockcm_cookie *sockc)
2853 {
2854         struct cmsghdr *cmsg;
2855         int ret;
2856
2857         for_each_cmsghdr(cmsg, msg) {
2858                 if (!CMSG_OK(msg, cmsg))
2859                         return -EINVAL;
2860                 if (cmsg->cmsg_level != SOL_SOCKET)
2861                         continue;
2862                 ret = __sock_cmsg_send(sk, cmsg, sockc);
2863                 if (ret)
2864                         return ret;
2865         }
2866         return 0;
2867 }
2868 EXPORT_SYMBOL(sock_cmsg_send);
2869
2870 static void sk_enter_memory_pressure(struct sock *sk)
2871 {
2872         if (!sk->sk_prot->enter_memory_pressure)
2873                 return;
2874
2875         sk->sk_prot->enter_memory_pressure(sk);
2876 }
2877
2878 static void sk_leave_memory_pressure(struct sock *sk)
2879 {
2880         if (sk->sk_prot->leave_memory_pressure) {
2881                 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2882                                      tcp_leave_memory_pressure, sk);
2883         } else {
2884                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2885
2886                 if (memory_pressure && READ_ONCE(*memory_pressure))
2887                         WRITE_ONCE(*memory_pressure, 0);
2888         }
2889 }
2890
/* Static key, false by default.  When enabled, skb_page_frag_refill()
 * skips its high-order allocation attempt and allocates a single page.
 */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2892
2893 /**
2894  * skb_page_frag_refill - check that a page_frag contains enough room
2895  * @sz: minimum size of the fragment we want to get
2896  * @pfrag: pointer to page_frag
2897  * @gfp: priority for memory allocation
2898  *
2899  * Note: While this allocator tries to use high order pages, there is
2900  * no guarantee that allocations succeed. Therefore, @sz MUST be
2901  * less or equal than PAGE_SIZE.
2902  */
2903 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2904 {
2905         if (pfrag->page) {
2906                 if (page_ref_count(pfrag->page) == 1) {
2907                         pfrag->offset = 0;
2908                         return true;
2909                 }
2910                 if (pfrag->offset + sz <= pfrag->size)
2911                         return true;
2912                 put_page(pfrag->page);
2913         }
2914
2915         pfrag->offset = 0;
2916         if (SKB_FRAG_PAGE_ORDER &&
2917             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2918                 /* Avoid direct reclaim but allow kswapd to wake */
2919                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2920                                           __GFP_COMP | __GFP_NOWARN |
2921                                           __GFP_NORETRY,
2922                                           SKB_FRAG_PAGE_ORDER);
2923                 if (likely(pfrag->page)) {
2924                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2925                         return true;
2926                 }
2927         }
2928         pfrag->page = alloc_page(gfp);
2929         if (likely(pfrag->page)) {
2930                 pfrag->size = PAGE_SIZE;
2931                 return true;
2932         }
2933         return false;
2934 }
2935 EXPORT_SYMBOL(skb_page_frag_refill);
2936
2937 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2938 {
2939         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2940                 return true;
2941
2942         sk_enter_memory_pressure(sk);
2943         sk_stream_moderate_sndbuf(sk);
2944         return false;
2945 }
2946 EXPORT_SYMBOL(sk_page_frag_refill);
2947
2948 void __lock_sock(struct sock *sk)
2949         __releases(&sk->sk_lock.slock)
2950         __acquires(&sk->sk_lock.slock)
2951 {
2952         DEFINE_WAIT(wait);
2953
2954         for (;;) {
2955                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2956                                         TASK_UNINTERRUPTIBLE);
2957                 spin_unlock_bh(&sk->sk_lock.slock);
2958                 schedule();
2959                 spin_lock_bh(&sk->sk_lock.slock);
2960                 if (!sock_owned_by_user(sk))
2961                         break;
2962         }
2963         finish_wait(&sk->sk_lock.wq, &wait);
2964 }
2965
/* Process every skb queued on the socket backlog.
 *
 * Per the annotations, sk_lock.slock is released and re-acquired inside.
 * Each pass detaches the whole backlog list while the lock is held, then
 * drops the lock and hands the skbs one by one to sk_backlog_rcv().  The
 * outer loop repeats for as long as new skbs were queued in the meantime.
 */
void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		/* Take the current list private before dropping the lock. */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			/* Save the successor first: the skb is unlinked and
			 * handed over to the receive handler below.
			 */
			next = skb->next;
			prefetch(next);
			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantee we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
2998
2999 void __sk_flush_backlog(struct sock *sk)
3000 {
3001         spin_lock_bh(&sk->sk_lock.slock);
3002         __release_sock(sk);
3003         spin_unlock_bh(&sk->sk_lock.slock);
3004 }
3005 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3006
3007 /**
3008  * sk_wait_data - wait for data to arrive at sk_receive_queue
3009  * @sk:    sock to wait on
3010  * @timeo: for how long
3011  * @skb:   last skb seen on sk_receive_queue
3012  *
3013  * Now socket state including sk->sk_err is changed only under lock,
3014  * hence we may omit checks after joining wait queue.
3015  * We check receive queue before schedule() only as optimization;
3016  * it is very likely that release_sock() added new data.
3017  */
3018 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3019 {
3020         DEFINE_WAIT_FUNC(wait, woken_wake_function);
3021         int rc;
3022
3023         add_wait_queue(sk_sleep(sk), &wait);
3024         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3025         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3026         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3027         remove_wait_queue(sk_sleep(sk), &wait);
3028         return rc;
3029 }
3030 EXPORT_SYMBOL(sk_wait_data);
3031
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *	Return: 1 if the allocation is accepted, 0 if it is refused.  On
 *	refusal the @amt pages added to memory_allocated, and the memcg
 *	charge if one was taken here, are given back before returning.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
	struct proto *prot = sk->sk_prot;
	bool charged = true;
	long allocated;

	/* Account optimistically; undone at the bottom if we refuse. */
	sk_memory_allocated_add(sk, amt);
	allocated = sk_memory_allocated(sk);
	if (memcg_charge &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
						gfp_memcg_charge())))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		/* Stream sockets are measured by sk_wmem_queued, all other
		 * types by sk_wmem_alloc.
		 */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		/* Accept if the hard limit exceeds the socket count times
		 * the pages this socket accounts for.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg_charge && !charged) {
				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			}
			return 1;
		}
	}

	/* Receive-side refusals caused only by a failed memcg charge are
	 * not traced.
	 */
	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Roll back the optimistic accounting done at the top. */
	sk_memory_allocated_sub(sk, amt);

	if (memcg_charge && charged)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
3126
3127 /**
3128  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3129  *      @sk: socket
3130  *      @size: memory size to allocate
3131  *      @kind: allocation type
3132  *
3133  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3134  *      rmem allocation. This function assumes that protocols which have
3135  *      memory_pressure use sk_wmem_queued as write buffer accounting.
3136  */
3137 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3138 {
3139         int ret, amt = sk_mem_pages(size);
3140
3141         sk->sk_forward_alloc += amt << PAGE_SHIFT;
3142         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3143         if (!ret)
3144                 sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3145         return ret;
3146 }
3147 EXPORT_SYMBOL(__sk_mem_schedule);
3148
3149 /**
3150  *      __sk_mem_reduce_allocated - reclaim memory_allocated
3151  *      @sk: socket
3152  *      @amount: number of quanta
3153  *
3154  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3155  */
3156 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3157 {
3158         sk_memory_allocated_sub(sk, amount);
3159
3160         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3161                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3162
3163         if (sk_under_global_memory_pressure(sk) &&
3164             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3165                 sk_leave_memory_pressure(sk);
3166 }
3167
3168 /**
3169  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3170  *      @sk: socket
3171  *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3172  */
3173 void __sk_mem_reclaim(struct sock *sk, int amount)
3174 {
3175         amount >>= PAGE_SHIFT;
3176         sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3177         __sk_mem_reduce_allocated(sk, amount);
3178 }
3179 EXPORT_SYMBOL(__sk_mem_reclaim);
3180
3181 int sk_set_peek_off(struct sock *sk, int val)
3182 {
3183         WRITE_ONCE(sk->sk_peek_off, val);
3184         return 0;
3185 }
3186 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3187
3188 /*
3189  * Set of default routines for initialising struct proto_ops when
3190  * the protocol does not support a particular function. In certain
3191  * cases where it makes no sense for a protocol to have a "do nothing"
3192  * function, some default processing is provided.
3193  */
3194
3195 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3196 {
3197         return -EOPNOTSUPP;
3198 }
3199 EXPORT_SYMBOL(sock_no_bind);
3200
3201 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3202                     int len, int flags)
3203 {
3204         return -EOPNOTSUPP;
3205 }
3206 EXPORT_SYMBOL(sock_no_connect);
3207
3208 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3209 {
3210         return -EOPNOTSUPP;
3211 }
3212 EXPORT_SYMBOL(sock_no_socketpair);
3213
3214 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3215                    bool kern)
3216 {
3217         return -EOPNOTSUPP;
3218 }
3219 EXPORT_SYMBOL(sock_no_accept);
3220
3221 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3222                     int peer)
3223 {
3224         return -EOPNOTSUPP;
3225 }
3226 EXPORT_SYMBOL(sock_no_getname);
3227
3228 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3229 {
3230         return -EOPNOTSUPP;
3231 }
3232 EXPORT_SYMBOL(sock_no_ioctl);
3233
3234 int sock_no_listen(struct socket *sock, int backlog)
3235 {
3236         return -EOPNOTSUPP;
3237 }
3238 EXPORT_SYMBOL(sock_no_listen);
3239
3240 int sock_no_shutdown(struct socket *sock, int how)
3241 {
3242         return -EOPNOTSUPP;
3243 }
3244 EXPORT_SYMBOL(sock_no_shutdown);
3245
3246 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3247 {
3248         return -EOPNOTSUPP;
3249 }
3250 EXPORT_SYMBOL(sock_no_sendmsg);
3251
3252 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3253 {
3254         return -EOPNOTSUPP;
3255 }
3256 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3257
3258 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3259                     int flags)
3260 {
3261         return -EOPNOTSUPP;
3262 }
3263 EXPORT_SYMBOL(sock_no_recvmsg);
3264
3265 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3266 {
3267         /* Mirror missing mmap method error code */
3268         return -ENODEV;
3269 }
3270 EXPORT_SYMBOL(sock_no_mmap);
3271
3272 /*
3273  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3274  * various sock-based usage counts.
3275  */
3276 void __receive_sock(struct file *file)
3277 {
3278         struct socket *sock;
3279
3280         sock = sock_from_file(file);
3281         if (sock) {
3282                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3283                 sock_update_classid(&sock->sk->sk_cgrp_data);
3284         }
3285 }
3286
3287 /*
3288  *      Default Socket Callbacks
3289  */
3290
3291 static void sock_def_wakeup(struct sock *sk)
3292 {
3293         struct socket_wq *wq;
3294
3295         rcu_read_lock();
3296         wq = rcu_dereference(sk->sk_wq);
3297         if (skwq_has_sleeper(wq))
3298                 wake_up_interruptible_all(&wq->wait);
3299         rcu_read_unlock();
3300 }
3301
3302 static void sock_def_error_report(struct sock *sk)
3303 {
3304         struct socket_wq *wq;
3305
3306         rcu_read_lock();
3307         wq = rcu_dereference(sk->sk_wq);
3308         if (skwq_has_sleeper(wq))
3309                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3310         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3311         rcu_read_unlock();
3312 }
3313
3314 void sock_def_readable(struct sock *sk)
3315 {
3316         struct socket_wq *wq;
3317
3318         trace_sk_data_ready(sk);
3319
3320         rcu_read_lock();
3321         wq = rcu_dereference(sk->sk_wq);
3322         if (skwq_has_sleeper(wq))
3323                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3324                                                 EPOLLRDNORM | EPOLLRDBAND);
3325         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3326         rcu_read_unlock();
3327 }
3328
3329 static void sock_def_write_space(struct sock *sk)
3330 {
3331         struct socket_wq *wq;
3332
3333         rcu_read_lock();
3334
3335         /* Do not wake up a writer until he can make "significant"
3336          * progress.  --DaveM
3337          */
3338         if (sock_writeable(sk)) {
3339                 wq = rcu_dereference(sk->sk_wq);
3340                 if (skwq_has_sleeper(wq))
3341                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3342                                                 EPOLLWRNORM | EPOLLWRBAND);
3343
3344                 /* Should agree with poll, otherwise some programs break */
3345                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3346         }
3347
3348         rcu_read_unlock();
3349 }
3350
3351 /* An optimised version of sock_def_write_space(), should only be called
3352  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3353  * ->sk_wmem_alloc.
3354  */
3355 static void sock_def_write_space_wfree(struct sock *sk)
3356 {
3357         /* Do not wake up a writer until he can make "significant"
3358          * progress.  --DaveM
3359          */
3360         if (sock_writeable(sk)) {
3361                 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3362
3363                 /* rely on refcount_sub from sock_wfree() */
3364                 smp_mb__after_atomic();
3365                 if (wq && waitqueue_active(&wq->wait))
3366                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3367                                                 EPOLLWRNORM | EPOLLWRBAND);
3368
3369                 /* Should agree with poll, otherwise some programs break */
3370                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3371         }
3372 }
3373
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3377
3378 void sk_send_sigurg(struct sock *sk)
3379 {
3380         if (sk->sk_socket && sk->sk_socket->file)
3381                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3382                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3383 }
3384 EXPORT_SYMBOL(sk_send_sigurg);
3385
/* (Re)arm @timer to fire at @expires.  A reference on @sk is taken when
 * mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int was_pending = mod_timer(timer, expires);

	if (!was_pending)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
3393
/* Cancel @timer with del_timer() and, when that returns non-zero, drop a
 * reference on @sk via __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer(timer);

	if (was_pending)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
3400
/* Same as sk_stop_timer(), but the timer is cancelled with
 * del_timer_sync().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer_sync(timer);

	if (was_pending)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
3407
/* Initialise the generic part of a freshly allocated sock.
 *
 * @sock: socket to attach, may be NULL
 * @sk:   sock to initialise
 * @uid:  value stored in sk->sk_uid
 *
 * Sets buffer sizes from the rmem/wmem default sysctls, installs the
 * default callbacks, resets timeouts, pacing and bookkeeping fields, and
 * finally sets sk_refcnt to 1.  When @sock is given, the two objects are
 * linked to each other and sk_type/sk_wq are taken from it.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_use_task_frag	=	true;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	/* With a socket, share its wait queue; without one there is none. */
	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
	}
	sk->sk_uid	=	uid;

	/* Kernel sockets get a separate lockdep class for sk_callback_lock,
	 * both variants being indexed by address family.
	 */
	rwlock_init(&sk->sk_callback_lock);
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	spin_lock_init(&sk->sk_peer_lock);

	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
#endif

	/* Both pacing rates start at the maximum value (~0UL). */
	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);
3490
3491 void sock_init_data(struct socket *sock, struct sock *sk)
3492 {
3493         kuid_t uid = sock ?
3494                 SOCK_INODE(sock)->i_uid :
3495                 make_kuid(sock_net(sk)->user_ns, 0);
3496
3497         sock_init_data_uid(sock, sk, uid);
3498 }
3499 EXPORT_SYMBOL(sock_init_data);
3500
/*
 * lock_sock_nested - take ownership of a socket
 * @sk: socket to lock
 * @subclass: lockdep subclass handed to mutex_acquire()
 *
 * May sleep (see might_sleep()).  With sk_lock.slock held and bottom halves
 * disabled, calls __lock_sock() if the socket is already owned, then marks
 * the socket as owned before dropping the spinlock.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        /* The sk_lock has mutex_lock() semantics here. */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        /* Someone else owns the socket: let __lock_sock() deal with it. */
        if (sock_owned_by_user_nocheck(sk))
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        spin_unlock_bh(&sk->sk_lock.slock);
}
3513 EXPORT_SYMBOL(lock_sock_nested);
3514
/*
 * release_sock - give up ownership of a socket
 * @sk: socket to release
 *
 * Everything below runs with sk_lock.slock held and bottom halves disabled:
 * a non-empty backlog is handed to __release_sock(), the protocol's optional
 * release_cb() hook is invoked, ownership is dropped and any waiter queued
 * on sk_lock.wq is woken.
 */
void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        /* A non-NULL tail means packets were queued on the backlog. */
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        /* Warning : release_cb() might need to release sk ownership,
         * ie call sock_release_ownership(sk) before us.
         */
        if (sk->sk_prot->release_cb)
                sk->sk_prot->release_cb(sk);

        sock_release_ownership(sk);
        /* Only issue the wake-up when somebody is actually waiting. */
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
3532 EXPORT_SYMBOL(release_sock);
3533
/*
 * __lock_sock_fast - lock a socket, avoiding the owner handshake if possible
 * @sk: socket to lock
 *
 * May sleep (see might_sleep()).
 *
 * Return: false on the fast path, in which case sk_lock.slock is still held
 * and bottom halves are still disabled on return; true on the slow path, in
 * which case the socket is marked owned and sk_lock.slock has been released.
 */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        if (!sock_owned_by_user_nocheck(sk)) {
                /*
                 * Fast path return with bottom halves disabled and
                 * sock::sk_lock.slock held.
                 *
                 * The 'mutex' is not contended and holding
                 * sock::sk_lock.slock prevents all other lockers to
                 * proceed so the corresponding unlock_sock_fast() can
                 * avoid the slow path of release_sock() completely and
                 * just release slock.
                 *
                 * From a semantical POV this is equivalent to 'acquiring'
                 * the 'mutex', hence the corresponding lockdep
                 * mutex_release() has to happen in the fast path of
                 * unlock_sock_fast().
                 */
                return false;
        }

        /* Slow path: the socket is owned, go through __lock_sock(). */
        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /*
         * NOTE(review): presumably a checker-only annotation that keeps the
         * __acquires() contract balanced on this path; no lock is taken here.
         */
        __acquire(&sk->sk_lock.slock);
        spin_unlock_bh(&sk->sk_lock.slock);
        return true;
}
3564 EXPORT_SYMBOL(__lock_sock_fast);
3565
3566 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3567                    bool timeval, bool time32)
3568 {
3569         struct sock *sk = sock->sk;
3570         struct timespec64 ts;
3571
3572         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3573         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3574         if (ts.tv_sec == -1)
3575                 return -ENOENT;
3576         if (ts.tv_sec == 0) {
3577                 ktime_t kt = ktime_get_real();
3578                 sock_write_timestamp(sk, kt);
3579                 ts = ktime_to_timespec64(kt);
3580         }
3581
3582         if (timeval)
3583                 ts.tv_nsec /= 1000;
3584
3585 #ifdef CONFIG_COMPAT_32BIT_TIME
3586         if (time32)
3587                 return put_old_timespec32(&ts, userstamp);
3588 #endif
3589 #ifdef CONFIG_SPARC64
3590         /* beware of padding in sparc64 timeval */
3591         if (timeval && !in_compat_syscall()) {
3592                 struct __kernel_old_timeval __user tv = {
3593                         .tv_sec = ts.tv_sec,
3594                         .tv_usec = ts.tv_nsec,
3595                 };
3596                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3597                         return -EFAULT;
3598                 return 0;
3599         }
3600 #endif
3601         return put_timespec64(&ts, userstamp);
3602 }
3603 EXPORT_SYMBOL(sock_gettstamp);
3604
3605 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3606 {
3607         if (!sock_flag(sk, flag)) {
3608                 unsigned long previous_flags = sk->sk_flags;
3609
3610                 sock_set_flag(sk, flag);
3611                 /*
3612                  * we just set one of the two flags which require net
3613                  * time stamping, but time stamping might have been on
3614                  * already because of the other one
3615                  */
3616                 if (sock_needs_netstamp(sk) &&
3617                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3618                         net_enable_timestamp();
3619         }
3620 }
3621
3622 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3623                        int level, int type)
3624 {
3625         struct sock_exterr_skb *serr;
3626         struct sk_buff *skb;
3627         int copied, err;
3628
3629         err = -EAGAIN;
3630         skb = sock_dequeue_err_skb(sk);
3631         if (skb == NULL)
3632                 goto out;
3633
3634         copied = skb->len;
3635         if (copied > len) {
3636                 msg->msg_flags |= MSG_TRUNC;
3637                 copied = len;
3638         }
3639         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3640         if (err)
3641                 goto out_free_skb;
3642
3643         sock_recv_timestamp(msg, sk, skb);
3644
3645         serr = SKB_EXT_ERR(skb);
3646         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3647
3648         msg->msg_flags |= MSG_ERRQUEUE;
3649         err = copied;
3650
3651 out_free_skb:
3652         kfree_skb(skb);
3653 out:
3654         return err;
3655 }
3656 EXPORT_SYMBOL(sock_recv_errqueue);
3657
3658 /*
3659  *      Get a socket option on an socket.
3660  *
3661  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3662  *      asynchronous errors should be reported by getsockopt. We assume
3663  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3664  */
3665 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3666                            char __user *optval, int __user *optlen)
3667 {
3668         struct sock *sk = sock->sk;
3669
3670         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3671         return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3672 }
3673 EXPORT_SYMBOL(sock_common_getsockopt);
3674
3675 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3676                         int flags)
3677 {
3678         struct sock *sk = sock->sk;
3679         int addr_len = 0;
3680         int err;
3681
3682         err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3683         if (err >= 0)
3684                 msg->msg_namelen = addr_len;
3685         return err;
3686 }
3687 EXPORT_SYMBOL(sock_common_recvmsg);
3688
3689 /*
3690  *      Set socket options on an inet socket.
3691  */
3692 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3693                            sockptr_t optval, unsigned int optlen)
3694 {
3695         struct sock *sk = sock->sk;
3696
3697         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3698         return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3699 }
3700 EXPORT_SYMBOL(sock_common_setsockopt);
3701
/*
 * sk_common_release - common teardown path for a socket
 * @sk: socket being released
 *
 * Runs the protocol's optional destroy() hook, unhashes the socket, orphans
 * it, frees its xfrm policy and finally drops one reference with sock_put().
 */
void sk_common_release(struct sock *sk)
{
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to the socket, but the network stack still has.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but it is
         * possible that some packets are in flight because some CPU runs the
         * receiver and did the hash table lookup before we unhashed the
         * socket. They will reach the receive queue and will be purged by
         * the socket destructor.
         *
         * Also we still have packets pending on the receive queue and
         * probably our own packets waiting in device queues. sock_destroy
         * will drain the receive queue, but transmitted packets will delay
         * socket destruction until the last reference is released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sock_put(sk);
}
3735 EXPORT_SYMBOL(sk_common_release);
3736
3737 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3738 {
3739         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3740
3741         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3742         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3743         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3744         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3745         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3746         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3747         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3748         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3749         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3750 }
3751
3752 #ifdef CONFIG_PROC_FS
/*
 * PROTO_INUSE_NR-bit map of the inuse_idx slots handed out to registered
 * protocols; bits are set by assign_proto_idx() and cleared by
 * release_proto_idx().
 */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3754
3755 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3756 {
3757         int cpu, idx = prot->inuse_idx;
3758         int res = 0;
3759
3760         for_each_possible_cpu(cpu)
3761                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3762
3763         return res >= 0 ? res : 0;
3764 }
3765 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3766
3767 int sock_inuse_get(struct net *net)
3768 {
3769         int cpu, res = 0;
3770
3771         for_each_possible_cpu(cpu)
3772                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3773
3774         return res;
3775 }
3776
3777 EXPORT_SYMBOL_GPL(sock_inuse_get);
3778
3779 static int __net_init sock_inuse_init_net(struct net *net)
3780 {
3781         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3782         if (net->core.prot_inuse == NULL)
3783                 return -ENOMEM;
3784         return 0;
3785 }
3786
/* Per-netns exit: free the per-CPU counters set up by sock_inuse_init_net(). */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.prot_inuse);
}
3791
/* Per-netns hooks that allocate and free net->core.prot_inuse. */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};
3796
3797 static __init int net_inuse_init(void)
3798 {
3799         if (register_pernet_subsys(&net_inuse_ops))
3800                 panic("Cannot initialize net inuse counters");
3801
3802         return 0;
3803 }
3804
3805 core_initcall(net_inuse_init);
3806
3807 static int assign_proto_idx(struct proto *prot)
3808 {
3809         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3810
3811         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3812                 pr_err("PROTO_INUSE_NR exhausted\n");
3813                 return -ENOSPC;
3814         }
3815
3816         set_bit(prot->inuse_idx, proto_inuse_idx);
3817         return 0;
3818 }
3819
3820 static void release_proto_idx(struct proto *prot)
3821 {
3822         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3823                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3824 }
3825 #else
/* !CONFIG_PROC_FS stub: no index is tracked, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}
3830
/* !CONFIG_PROC_FS stub: nothing was assigned, so nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3834
3835 #endif
3836
3837 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3838 {
3839         if (!twsk_prot)
3840                 return;
3841         kfree(twsk_prot->twsk_slab_name);
3842         twsk_prot->twsk_slab_name = NULL;
3843         kmem_cache_destroy(twsk_prot->twsk_slab);
3844         twsk_prot->twsk_slab = NULL;
3845 }
3846
3847 static int tw_prot_init(const struct proto *prot)
3848 {
3849         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3850
3851         if (!twsk_prot)
3852                 return 0;
3853
3854         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3855                                               prot->name);
3856         if (!twsk_prot->twsk_slab_name)
3857                 return -ENOMEM;
3858
3859         twsk_prot->twsk_slab =
3860                 kmem_cache_create(twsk_prot->twsk_slab_name,
3861                                   twsk_prot->twsk_obj_size, 0,
3862                                   SLAB_ACCOUNT | prot->slab_flags,
3863                                   NULL);
3864         if (!twsk_prot->twsk_slab) {
3865                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3866                         prot->name);
3867                 return -ENOMEM;
3868         }
3869
3870         return 0;
3871 }
3872
3873 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3874 {
3875         if (!rsk_prot)
3876                 return;
3877         kfree(rsk_prot->slab_name);
3878         rsk_prot->slab_name = NULL;
3879         kmem_cache_destroy(rsk_prot->slab);
3880         rsk_prot->slab = NULL;
3881 }
3882
3883 static int req_prot_init(const struct proto *prot)
3884 {
3885         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3886
3887         if (!rsk_prot)
3888                 return 0;
3889
3890         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3891                                         prot->name);
3892         if (!rsk_prot->slab_name)
3893                 return -ENOMEM;
3894
3895         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3896                                            rsk_prot->obj_size, 0,
3897                                            SLAB_ACCOUNT | prot->slab_flags,
3898                                            NULL);
3899
3900         if (!rsk_prot->slab) {
3901                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3902                         prot->name);
3903                 return -ENOMEM;
3904         }
3905         return 0;
3906 }
3907
/*
 * proto_register - register a protocol on the global proto_list
 * @prot: protocol to register
 * @alloc_slab: non-zero to also create the sock, request-sock and
 *              timewait-sock slab caches for @prot
 *
 * Return: 0 on success; -EINVAL if @prot has memory_allocated set but lacks
 * sysctl_mem or per_cpu_fw_alloc; -ENOBUFS if any slab cache could not be
 * created; or the error from assign_proto_idx().  On failure every cache
 * created here is destroyed again.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
        int ret = -ENOBUFS;

        /* Memory accounting needs both the sysctl limits and per-CPU state. */
        if (prot->memory_allocated && !prot->sysctl_mem) {
                pr_err("%s: missing sysctl_mem\n", prot->name);
                return -EINVAL;
        }
        if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
                pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
                return -EINVAL;
        }
        if (alloc_slab) {
                prot->slab = kmem_cache_create_usercopy(prot->name,
                                        prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
                                        prot->slab_flags,
                                        prot->useroffset, prot->usersize,
                                        NULL);

                if (prot->slab == NULL) {
                        pr_crit("%s: Can't create sock SLAB cache!\n",
                                prot->name);
                        goto out;
                }

                /* Both helpers leave their cleanup to the labels below. */
                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;

                if (tw_prot_init(prot))
                        goto out_free_timewait_sock_slab;
        }

        /* Index assignment and list insertion happen under the list mutex. */
        mutex_lock(&proto_list_mutex);
        ret = assign_proto_idx(prot);
        if (ret) {
                mutex_unlock(&proto_list_mutex);
                goto out_free_timewait_sock_slab;
        }
        list_add(&prot->node, &proto_list);
        mutex_unlock(&proto_list_mutex);
        return ret;

        /* Unwind in reverse order of creation; labels fall through. */
out_free_timewait_sock_slab:
        if (alloc_slab)
                tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
        if (alloc_slab) {
                req_prot_cleanup(prot->rsk_prot);

                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }
out:
        return ret;
}
3964 EXPORT_SYMBOL(proto_register);
3965
/*
 * proto_unregister - remove a protocol registered with proto_register()
 * @prot: protocol to unregister
 *
 * Releases the protocol's in-use index and unlinks it from proto_list under
 * proto_list_mutex, then destroys its sock slab cache and cleans up the
 * request-sock and timewait-sock caches.
 */
void proto_unregister(struct proto *prot)
{
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);
        tw_prot_cleanup(prot->twsk_prot);
}
3979 EXPORT_SYMBOL(proto_unregister);
3980
3981 int sock_load_diag_module(int family, int protocol)
3982 {
3983         if (!protocol) {
3984                 if (!sock_is_registered(family))
3985                         return -ENOENT;
3986
3987                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3988                                       NETLINK_SOCK_DIAG, family);
3989         }
3990
3991 #ifdef CONFIG_INET
3992         if (family == AF_INET &&
3993             protocol != IPPROTO_RAW &&
3994             protocol < MAX_INET_PROTOS &&
3995             !rcu_access_pointer(inet_protos[protocol]))
3996                 return -ENOENT;
3997 #endif
3998
3999         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4000                               NETLINK_SOCK_DIAG, family, protocol);
4001 }
4002 EXPORT_SYMBOL(sock_load_diag_module);
4003
4004 #ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex and position the iterator at *pos
 * in proto_list, head included.  The mutex is dropped in proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}
4011
/* seq_file ->next: advance from @v to the following proto_list entry. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}
4016
/* seq_file ->stop: release the mutex taken by proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}
4022
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';
        return 'n';
}
4027 static long sock_prot_memory_allocated(struct proto *proto)
4028 {
4029         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4030 }
4031
4032 static const char *sock_prot_memory_pressure(struct proto *proto)
4033 {
4034         return proto->memory_pressure != NULL ?
4035         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4036 }
4037
/*
 * Emit one line describing @proto: name, object size, sockets in use in the
 * seq file's netns, memory accounted, memory pressure, max header, whether a
 * slab cache exists, owning module, then one y/n column per protocol method.
 * The method columns are printed in the same order as the two-letter header
 * written by proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}
4070
4071 static int proto_seq_show(struct seq_file *seq, void *v)
4072 {
4073         if (v == &proto_list)
4074                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4075                            "protocol",
4076                            "size",
4077                            "sockets",
4078                            "memory",
4079                            "press",
4080                            "maxhdr",
4081                            "slab",
4082                            "module",
4083                            "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4084         else
4085                 proto_seq_printf(seq, list_entry(v, struct proto, node));
4086         return 0;
4087 }
4088
/* seq_file iterator passed to proc_create_net() for the "protocols" entry. */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};
4095
4096 static __net_init int proto_init_net(struct net *net)
4097 {
4098         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4099                         sizeof(struct seq_net_private)))
4100                 return -ENOMEM;
4101
4102         return 0;
4103 }
4104
/* Per-netns exit: remove the "protocols" entry made by proto_init_net(). */
static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}
4109
4110
/* Per-netns hooks that create and remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};
4115
/* Initcall: register proto_net_ops; returns register_pernet_subsys()'s result. */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}
4120
4121 subsys_initcall(proto_init);
4122
4123 #endif /* PROC_FS */
4124
4125 #ifdef CONFIG_NET_RX_BUSY_POLL
4126 bool sk_busy_loop_end(void *p, unsigned long start_time)
4127 {
4128         struct sock *sk = p;
4129
4130         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4131                sk_busy_loop_timeout(sk, start_time);
4132 }
4133 EXPORT_SYMBOL(sk_busy_loop_end);
4134 #endif /* CONFIG_NET_RX_BUSY_POLL */
4135
4136 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4137 {
4138         if (!sk->sk_prot->bind_add)
4139                 return -EOPNOTSUPP;
4140         return sk->sk_prot->bind_add(sk, addr, addr_len);
4141 }
4142 EXPORT_SYMBOL(sock_bind_add);
4143
4144 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4145 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4146                      void __user *arg, void *karg, size_t size)
4147 {
4148         int ret;
4149
4150         if (copy_from_user(karg, arg, size))
4151                 return -EFAULT;
4152
4153         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4154         if (ret)
4155                 return ret;
4156
4157         if (copy_to_user(arg, karg, size))
4158                 return -EFAULT;
4159
4160         return 0;
4161 }
4162 EXPORT_SYMBOL(sock_ioctl_inout);
4163
4164 /* This is the most common ioctl prep function, where the result (4 bytes) is
4165  * copied back to userspace if the ioctl() returns successfully. No input is
4166  * copied from userspace as input argument.
4167  */
4168 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4169 {
4170         int ret, karg = 0;
4171
4172         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4173         if (ret)
4174                 return ret;
4175
4176         return put_user(karg, (int __user *)arg);
4177 }
4178
4179 /* A wrapper around sock ioctls, which copies the data from userspace
4180  * (depending on the protocol/ioctl), and copies back the result to userspace.
4181  * The main motivation for this function is to pass kernel memory to the
4182  * protocol ioctl callbacks, instead of userspace memory.
4183  */
4184 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4185 {
4186         int rc = 1;
4187
4188         if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4189                 rc = ipmr_sk_ioctl(sk, cmd, arg);
4190         else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4191                 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4192         else if (sk_is_phonet(sk))
4193                 rc = phonet_sk_ioctl(sk, cmd, arg);
4194
4195         /* If ioctl was processed, returns its value */
4196         if (rc <= 0)
4197                 return rc;
4198
4199         /* Otherwise call the default handler */
4200         return sock_ioctl_out(sk, cmd, arg);
4201 }
4202 EXPORT_SYMBOL(sk_ioctl);