net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <[email protected]>
  12  *              Florian La Roche, <[email protected]>
  13  *              Alan Cox, <[email protected]>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117
 118 #include <linux/uaccess.h>
 119
 120 #include <linux/netdevice.h>
 121 #include <net/protocol.h>
 122 #include <linux/skbuff.h>
 123 #include <net/net_namespace.h>
 124 #include <net/request_sock.h>
 125 #include <net/sock.h>
 126 #include <linux/net_tstamp.h>
 127 #include <net/xfrm.h>
 128 #include <linux/ipsec.h>
 129 #include <net/cls_cgroup.h>
 130 #include <net/netprio_cgroup.h>
 131 #include <linux/sock_diag.h>
 132
 133 #include <linux/filter.h>
 134 #include <net/sock_reuseport.h>
 135 #include <net/bpf_sk_storage.h>
 136
 137 #include <trace/events/sock.h>
 138
 139 #include <net/tcp.h>
 140 #include <net/busy_poll.h>
 141
 142 static DEFINE_MUTEX(proto_list_mutex);
 143 static LIST_HEAD(proto_list);
 144
 145 static void sock_inuse_add(struct net *net, int val);
 146
 147 /**
 148  * sk_ns_capable - General socket capability test
 149  * @sk: Socket to use a capability on or through
 150  * @user_ns: The user namespace of the capability to use
 151  * @cap: The capability to use
 152  *
 153  * Test to see if the opener of the socket had when the socket was
 154  * created and the current process has the capability @cap in the user
 155  * namespace @user_ns.
 156  */
 157 bool sk_ns_capable(const struct sock *sk,
 158                    struct user_namespace *user_ns, int cap)
 159 {
 160         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 161                 ns_capable(user_ns, cap);
 162 }
 163 EXPORT_SYMBOL(sk_ns_capable);
 164
 165 /**
 166  * sk_capable - Socket global capability test
 167  * @sk: Socket to use a capability on or through
 168  * @cap: The global capability to use
 169  *
 170  * Test to see if the opener of the socket had when the socket was
 171  * created and the current process has the capability @cap in all user
 172  * namespaces.
 173  */
 174 bool sk_capable(const struct sock *sk, int cap)
 175 {
 176         return sk_ns_capable(sk, &init_user_ns, cap);
 177 }
 178 EXPORT_SYMBOL(sk_capable);
 179
 180 /**
 181  * sk_net_capable - Network namespace socket capability test
 182  * @sk: Socket to use a capability on or through
 183  * @cap: The capability to use
 184  *
 185  * Test to see if the opener of the socket had when the socket was created
 186  * and the current process has the capability @cap over the network namespace
 187  * the socket is a member of.
 188  */
 189 bool sk_net_capable(const struct sock *sk, int cap)
 190 {
 191         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 192 }
 193 EXPORT_SYMBOL(sk_net_capable);
 194
 195 /*
 196  * Each address family might have different locking rules, so we have
 197  * one slock key per address family and separate keys for internal and
 198  * userspace sockets.
 199  */
 200 static struct lock_class_key af_family_keys[AF_MAX];
 201 static struct lock_class_key af_family_kern_keys[AF_MAX];
 202 static struct lock_class_key af_family_slock_keys[AF_MAX];
 203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 204
 205 /*
 206  * Make lock validator output more readable. (we pre-construct these
 207  * strings build-time, so that runtime initialization of socket
 208  * locks is fast):
 209  */
 210
 211 #define _sock_locks(x)                                            \
 212   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 213   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 214   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 215   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 216   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 217   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 218   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 219   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 220   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 221   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 222   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 223   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 224   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 225   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 226   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 227   x "AF_MAX"
 228
 229 static const char *const af_family_key_strings[AF_MAX+1] = {
 230         _sock_locks("sk_lock-")
 231 };
 232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233         _sock_locks("slock-")
 234 };
 235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236         _sock_locks("clock-")
 237 };
 238
 239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240         _sock_locks("k-sk_lock-")
 241 };
 242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243         _sock_locks("k-slock-")
 244 };
 245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246         _sock_locks("k-clock-")
 247 };
 248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249         _sock_locks("rlock-")
 250 };
 251 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 252         _sock_locks("wlock-")
 253 };
 254 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 255         _sock_locks("elock-")
 256 };
 257
 258 /*
 259  * sk_callback_lock and sk queues locking rules are per-address-family,
 260  * so split the lock classes by using a per-AF key:
 261  */
 262 static struct lock_class_key af_callback_keys[AF_MAX];
 263 static struct lock_class_key af_rlock_keys[AF_MAX];
 264 static struct lock_class_key af_wlock_keys[AF_MAX];
 265 static struct lock_class_key af_elock_keys[AF_MAX];
 266 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 267
 268 /* Run time adjustable parameters. */
 269 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 270 EXPORT_SYMBOL(sysctl_wmem_max);
 271 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 272 EXPORT_SYMBOL(sysctl_rmem_max);
 273 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 274 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 275
 276 /* Maximal space eaten by iovec or ancillary data plus some space */
 277 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 278 EXPORT_SYMBOL(sysctl_optmem_max);
 279
 280 int sysctl_tstamp_allow_data __read_mostly = 1;
 281
 282 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 283 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 284
 285 /**
 286  * sk_set_memalloc - sets %SOCK_MEMALLOC
 287  * @sk: socket to set it on
 288  *
 289  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 290  * It's the responsibility of the admin to adjust min_free_kbytes
 291  * to meet the requirements
 292  */
 293 void sk_set_memalloc(struct sock *sk)
 294 {
 295         sock_set_flag(sk, SOCK_MEMALLOC);
 296         sk->sk_allocation |= __GFP_MEMALLOC;
 297         static_branch_inc(&memalloc_socks_key);
 298 }
 299 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 300
 301 void sk_clear_memalloc(struct sock *sk)
 302 {
 303         sock_reset_flag(sk, SOCK_MEMALLOC);
 304         sk->sk_allocation &= ~__GFP_MEMALLOC;
 305         static_branch_dec(&memalloc_socks_key);
 306
 307         /*
 308          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 309          * progress of swapping. SOCK_MEMALLOC may be cleared while
 310          * it has rmem allocations due to the last swapfile being deactivated
 311          * but there is a risk that the socket is unusable due to exceeding
 312          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 313          */
 314         sk_mem_reclaim(sk);
 315 }
 316 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 317
 318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 319 {
 320         int ret;
 321         unsigned int noreclaim_flag;
 322
 323         /* these should have been dropped before queueing */
 324         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 325
 326         noreclaim_flag = memalloc_noreclaim_save();
 327         ret = sk->sk_backlog_rcv(sk, skb);
 328         memalloc_noreclaim_restore(noreclaim_flag);
 329
 330         return ret;
 331 }
 332 EXPORT_SYMBOL(__sk_backlog_rcv);
 333
 334 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 335 {
 336         struct __kernel_sock_timeval tv;
 337
 338         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 339                 tv.tv_sec = 0;
 340                 tv.tv_usec = 0;
 341         } else {
 342                 tv.tv_sec = timeo / HZ;
 343                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 344         }
 345
 346         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 347                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 348                 *(struct old_timeval32 *)optval = tv32;
 349                 return sizeof(tv32);
 350         }
 351
 352         if (old_timeval) {
 353                 struct __kernel_old_timeval old_tv;
 354                 old_tv.tv_sec = tv.tv_sec;
 355                 old_tv.tv_usec = tv.tv_usec;
 356                 *(struct __kernel_old_timeval *)optval = old_tv;
 357                 return sizeof(old_tv);
 358         }
 359
 360         *(struct __kernel_sock_timeval *)optval = tv;
 361         return sizeof(tv);
 362 }
 363
 364 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 365                             bool old_timeval)
 366 {
 367         struct __kernel_sock_timeval tv;
 368
 369         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 370                 struct old_timeval32 tv32;
 371
 372                 if (optlen < sizeof(tv32))
 373                         return -EINVAL;
 374
 375                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 376                         return -EFAULT;
 377                 tv.tv_sec = tv32.tv_sec;
 378                 tv.tv_usec = tv32.tv_usec;
 379         } else if (old_timeval) {
 380                 struct __kernel_old_timeval old_tv;
 381
 382                 if (optlen < sizeof(old_tv))
 383                         return -EINVAL;
 384                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 385                         return -EFAULT;
 386                 tv.tv_sec = old_tv.tv_sec;
 387                 tv.tv_usec = old_tv.tv_usec;
 388         } else {
 389                 if (optlen < sizeof(tv))
 390                         return -EINVAL;
 391                 if (copy_from_sockptr(&tv, optval, sizeof(tv)))
 392                         return -EFAULT;
 393         }
 394         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 395                 return -EDOM;
 396
 397         if (tv.tv_sec < 0) {
 398                 static int warned __read_mostly;
 399
 400                 *timeo_p = 0;
 401                 if (warned < 10 && net_ratelimit()) {
 402                         warned++;
 403                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 404                                 __func__, current->comm, task_pid_nr(current));
 405                 }
 406                 return 0;
 407         }
 408         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 409         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 410                 return 0;
 411         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 412                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 413         return 0;
 414 }
 415
 416 static bool sock_needs_netstamp(const struct sock *sk)
 417 {
 418         switch (sk->sk_family) {
 419         case AF_UNSPEC:
 420         case AF_UNIX:
 421                 return false;
 422         default:
 423                 return true;
 424         }
 425 }
 426
 427 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 428 {
 429         if (sk->sk_flags & flags) {
 430                 sk->sk_flags &= ~flags;
 431                 if (sock_needs_netstamp(sk) &&
 432                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 433                         net_disable_timestamp();
 434         }
 435 }
 436
 437
 438 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 439 {
 440         unsigned long flags;
 441         struct sk_buff_head *list = &sk->sk_receive_queue;
 442
 443         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 444                 atomic_inc(&sk->sk_drops);
 445                 trace_sock_rcvqueue_full(sk, skb);
 446                 return -ENOMEM;
 447         }
 448
 449         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 450                 atomic_inc(&sk->sk_drops);
 451                 return -ENOBUFS;
 452         }
 453
 454         skb->dev = NULL;
 455         skb_set_owner_r(skb, sk);
 456
 457         /* we escape from rcu protected region, make sure we dont leak
 458          * a norefcounted dst
 459          */
 460         skb_dst_force(skb);
 461
 462         spin_lock_irqsave(&list->lock, flags);
 463         sock_skb_set_dropcount(sk, skb);
 464         __skb_queue_tail(list, skb);
 465         spin_unlock_irqrestore(&list->lock, flags);
 466
 467         if (!sock_flag(sk, SOCK_DEAD))
 468                 sk->sk_data_ready(sk);
 469         return 0;
 470 }
 471 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 472
 473 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 474 {
 475         int err;
 476
 477         err = sk_filter(sk, skb);
 478         if (err)
 479                 return err;
 480
 481         return __sock_queue_rcv_skb(sk, skb);
 482 }
 483 EXPORT_SYMBOL(sock_queue_rcv_skb);
 484
 485 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 486                      const int nested, unsigned int trim_cap, bool refcounted)
 487 {
 488         int rc = NET_RX_SUCCESS;
 489
 490         if (sk_filter_trim_cap(sk, skb, trim_cap))
 491                 goto discard_and_relse;
 492
 493         skb->dev = NULL;
 494
 495         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 496                 atomic_inc(&sk->sk_drops);
 497                 goto discard_and_relse;
 498         }
 499         if (nested)
 500                 bh_lock_sock_nested(sk);
 501         else
 502                 bh_lock_sock(sk);
 503         if (!sock_owned_by_user(sk)) {
 504                 /*
 505                  * trylock + unlock semantics:
 506                  */
 507                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 508
 509                 rc = sk_backlog_rcv(sk, skb);
 510
 511                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 512         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 513                 bh_unlock_sock(sk);
 514                 atomic_inc(&sk->sk_drops);
 515                 goto discard_and_relse;
 516         }
 517
 518         bh_unlock_sock(sk);
 519 out:
 520         if (refcounted)
 521                 sock_put(sk);
 522         return rc;
 523 discard_and_relse:
 524         kfree_skb(skb);
 525         goto out;
 526 }
 527 EXPORT_SYMBOL(__sk_receive_skb);
 528
 529 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 530 {
 531         struct dst_entry *dst = __sk_dst_get(sk);
 532
 533         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 534                 sk_tx_queue_clear(sk);
 535                 sk->sk_dst_pending_confirm = 0;
 536                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 537                 dst_release(dst);
 538                 return NULL;
 539         }
 540
 541         return dst;
 542 }
 543 EXPORT_SYMBOL(__sk_dst_check);
 544
 545 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 546 {
 547         struct dst_entry *dst = sk_dst_get(sk);
 548
 549         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 550                 sk_dst_reset(sk);
 551                 dst_release(dst);
 552                 return NULL;
 553         }
 554
 555         return dst;
 556 }
 557 EXPORT_SYMBOL(sk_dst_check);
 558
 559 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 560 {
 561         int ret = -ENOPROTOOPT;
 562 #ifdef CONFIG_NETDEVICES
 563         struct net *net = sock_net(sk);
 564
 565         /* Sorry... */
 566         ret = -EPERM;
 567         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 568                 goto out;
 569
 570         ret = -EINVAL;
 571         if (ifindex < 0)
 572                 goto out;
 573
 574         sk->sk_bound_dev_if = ifindex;
 575         if (sk->sk_prot->rehash)
 576                 sk->sk_prot->rehash(sk);
 577         sk_dst_reset(sk);
 578
 579         ret = 0;
 580
 581 out:
 582 #endif
 583
 584         return ret;
 585 }
 586
 587 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 588 {
 589         int ret;
 590
 591         if (lock_sk)
 592                 lock_sock(sk);
 593         ret = sock_bindtoindex_locked(sk, ifindex);
 594         if (lock_sk)
 595                 release_sock(sk);
 596
 597         return ret;
 598 }
 599 EXPORT_SYMBOL(sock_bindtoindex);
 600
 601 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 602 {
 603         int ret = -ENOPROTOOPT;
 604 #ifdef CONFIG_NETDEVICES
 605         struct net *net = sock_net(sk);
 606         char devname[IFNAMSIZ];
 607         int index;
 608
 609         ret = -EINVAL;
 610         if (optlen < 0)
 611                 goto out;
 612
 613         /* Bind this socket to a particular device like "eth0",
 614          * as specified in the passed interface name. If the
 615          * name is "" or the option length is zero the socket
 616          * is not bound.
 617          */
 618         if (optlen > IFNAMSIZ - 1)
 619                 optlen = IFNAMSIZ - 1;
 620         memset(devname, 0, sizeof(devname));
 621
 622         ret = -EFAULT;
 623         if (copy_from_sockptr(devname, optval, optlen))
 624                 goto out;
 625
 626         index = 0;
 627         if (devname[0] != '\0') {
 628                 struct net_device *dev;
 629
 630                 rcu_read_lock();
 631                 dev = dev_get_by_name_rcu(net, devname);
 632                 if (dev)
 633                         index = dev->ifindex;
 634                 rcu_read_unlock();
 635                 ret = -ENODEV;
 636                 if (!dev)
 637                         goto out;
 638         }
 639
 640         return sock_bindtoindex(sk, index, true);
 641 out:
 642 #endif
 643
 644         return ret;
 645 }
 646
 647 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 648                                 int __user *optlen, int len)
 649 {
 650         int ret = -ENOPROTOOPT;
 651 #ifdef CONFIG_NETDEVICES
 652         struct net *net = sock_net(sk);
 653         char devname[IFNAMSIZ];
 654
 655         if (sk->sk_bound_dev_if == 0) {
 656                 len = 0;
 657                 goto zero;
 658         }
 659
 660         ret = -EINVAL;
 661         if (len < IFNAMSIZ)
 662                 goto out;
 663
 664         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 665         if (ret)
 666                 goto out;
 667
 668         len = strlen(devname) + 1;
 669
 670         ret = -EFAULT;
 671         if (copy_to_user(optval, devname, len))
 672                 goto out;
 673
 674 zero:
 675         ret = -EFAULT;
 676         if (put_user(len, optlen))
 677                 goto out;
 678
 679         ret = 0;
 680
 681 out:
 682 #endif
 683
 684         return ret;
 685 }
 686
 687 bool sk_mc_loop(struct sock *sk)
 688 {
 689         if (dev_recursion_level())
 690                 return false;
 691         if (!sk)
 692                 return true;
 693         switch (sk->sk_family) {
 694         case AF_INET:
 695                 return inet_sk(sk)->mc_loop;
 696 #if IS_ENABLED(CONFIG_IPV6)
 697         case AF_INET6:
 698                 return inet6_sk(sk)->mc_loop;
 699 #endif
 700         }
 701         WARN_ON_ONCE(1);
 702         return true;
 703 }
 704 EXPORT_SYMBOL(sk_mc_loop);
 705
 706 void sock_set_reuseaddr(struct sock *sk)
 707 {
 708         lock_sock(sk);
 709         sk->sk_reuse = SK_CAN_REUSE;
 710         release_sock(sk);
 711 }
 712 EXPORT_SYMBOL(sock_set_reuseaddr);
 713
 714 void sock_set_reuseport(struct sock *sk)
 715 {
 716         lock_sock(sk);
 717         sk->sk_reuseport = true;
 718         release_sock(sk);
 719 }
 720 EXPORT_SYMBOL(sock_set_reuseport);
 721
 722 void sock_no_linger(struct sock *sk)
 723 {
 724         lock_sock(sk);
 725         sk->sk_lingertime = 0;
 726         sock_set_flag(sk, SOCK_LINGER);
 727         release_sock(sk);
 728 }
 729 EXPORT_SYMBOL(sock_no_linger);
 730
 731 void sock_set_priority(struct sock *sk, u32 priority)
 732 {
 733         lock_sock(sk);
 734         sk->sk_priority = priority;
 735         release_sock(sk);
 736 }
 737 EXPORT_SYMBOL(sock_set_priority);
 738
 739 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 740 {
 741         lock_sock(sk);
 742         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 743                 sk->sk_sndtimeo = secs * HZ;
 744         else
 745                 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 746         release_sock(sk);
 747 }
 748 EXPORT_SYMBOL(sock_set_sndtimeo);
 749
 750 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 751 {
 752         if (val)  {
 753                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 754                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 755                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 756                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 757         } else {
 758                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 759                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 760         }
 761 }
 762
 763 void sock_enable_timestamps(struct sock *sk)
 764 {
 765         lock_sock(sk);
 766         __sock_set_timestamps(sk, true, false, true);
 767         release_sock(sk);
 768 }
 769 EXPORT_SYMBOL(sock_enable_timestamps);
 770
 771 void sock_set_keepalive(struct sock *sk)
 772 {
 773         lock_sock(sk);
 774         if (sk->sk_prot->keepalive)
 775                 sk->sk_prot->keepalive(sk, true);
 776         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 777         release_sock(sk);
 778 }
 779 EXPORT_SYMBOL(sock_set_keepalive);
 780
 781 static void __sock_set_rcvbuf(struct sock *sk, int val)
 782 {
 783         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 784          * as a negative value.
 785          */
 786         val = min_t(int, val, INT_MAX / 2);
 787         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 788
 789         /* We double it on the way in to account for "struct sk_buff" etc.
 790          * overhead.   Applications assume that the SO_RCVBUF setting they make
 791          * will allow that much actual data to be received on that socket.
 792          *
 793          * Applications are unaware that "struct sk_buff" and other overheads
 794          * allocate from the receive buffer during socket buffer allocation.
 795          *
 796          * And after considering the possible alternatives, returning the value
 797          * we actually used in getsockopt is the most desirable behavior.
 798          */
 799         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 800 }
 801
 802 void sock_set_rcvbuf(struct sock *sk, int val)
 803 {
 804         lock_sock(sk);
 805         __sock_set_rcvbuf(sk, val);
 806         release_sock(sk);
 807 }
 808 EXPORT_SYMBOL(sock_set_rcvbuf);
 809
 810 void sock_set_mark(struct sock *sk, u32 val)
 811 {
 812         lock_sock(sk);
 813         sk->sk_mark = val;
 814         release_sock(sk);
 815 }
 816 EXPORT_SYMBOL(sock_set_mark);
 817
 818 /*
 819  *      This is meant for all protocols to use and covers goings on
 820  *      at the socket level. Everything here is generic.
 821  */
 822
 823 int sock_setsockopt(struct socket *sock, int level, int optname,
 824                     sockptr_t optval, unsigned int optlen)
 825 {
 826         struct sock_txtime sk_txtime;
 827         struct sock *sk = sock->sk;
 828         int val;
 829         int valbool;
 830         struct linger ling;
 831         int ret = 0;
 832
 833         /*
 834          *      Options without arguments
 835          */
 836
 837         if (optname == SO_BINDTODEVICE)
 838                 return sock_setbindtodevice(sk, optval, optlen);
 839
 840         if (optlen < sizeof(int))
 841                 return -EINVAL;
 842
 843         if (copy_from_sockptr(&val, optval, sizeof(val)))
 844                 return -EFAULT;
 845
 846         valbool = val ? 1 : 0;
 847
 848         lock_sock(sk);
 849
 850         switch (optname) {
 851         case SO_DEBUG:
 852                 if (val && !capable(CAP_NET_ADMIN))
 853                         ret = -EACCES;
 854                 else
 855                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 856                 break;
 857         case SO_REUSEADDR:
 858                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 859                 break;
 860         case SO_REUSEPORT:
 861                 sk->sk_reuseport = valbool;
 862                 break;
 863         case SO_TYPE:
 864         case SO_PROTOCOL:
 865         case SO_DOMAIN:
 866         case SO_ERROR:
 867                 ret = -ENOPROTOOPT;
 868                 break;
 869         case SO_DONTROUTE:
 870                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 871                 sk_dst_reset(sk);
 872                 break;
 873         case SO_BROADCAST:
 874                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 875                 break;
 876         case SO_SNDBUF:
 877                 /* Don't error on this BSD doesn't and if you think
 878                  * about it this is right. Otherwise apps have to
 879                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 880                  * are treated in BSD as hints
 881                  */
 882                 val = min_t(u32, val, sysctl_wmem_max);
 883 set_sndbuf:
 884                 /* Ensure val * 2 fits into an int, to prevent max_t()
 885                  * from treating it as a negative value.
 886                  */
 887                 val = min_t(int, val, INT_MAX / 2);
 888                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 889                 WRITE_ONCE(sk->sk_sndbuf,
 890                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
 891                 /* Wake up sending tasks if we upped the value. */
 892                 sk->sk_write_space(sk);
 893                 break;
 894
 895         case SO_SNDBUFFORCE:
 896                 if (!capable(CAP_NET_ADMIN)) {
 897                         ret = -EPERM;
 898                         break;
 899                 }
 900
 901                 /* No negative values (to prevent underflow, as val will be
 902                  * multiplied by 2).
 903                  */
 904                 if (val < 0)
 905                         val = 0;
 906                 goto set_sndbuf;
 907
 908         case SO_RCVBUF:
 909                 /* Don't error on this BSD doesn't and if you think
 910                  * about it this is right. Otherwise apps have to
 911                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 912                  * are treated in BSD as hints
 913                  */
 914                 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
 915                 break;
 916
 917         case SO_RCVBUFFORCE:
 918                 if (!capable(CAP_NET_ADMIN)) {
 919                         ret = -EPERM;
 920                         break;
 921                 }
 922
 923                 /* No negative values (to prevent underflow, as val will be
 924                  * multiplied by 2).
 925                  */
 926                 __sock_set_rcvbuf(sk, max(val, 0));
 927                 break;
 928
 929         case SO_KEEPALIVE:
 930                 if (sk->sk_prot->keepalive)
 931                         sk->sk_prot->keepalive(sk, valbool);
 932                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 933                 break;
 934
 935         case SO_OOBINLINE:
 936                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 937                 break;
 938
 939         case SO_NO_CHECK:
 940                 sk->sk_no_check_tx = valbool;
 941                 break;
 942
 943         case SO_PRIORITY:
 944                 if ((val >= 0 && val <= 6) ||
 945                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 946                         sk->sk_priority = val;
 947                 else
 948                         ret = -EPERM;
 949                 break;
 950
 951         case SO_LINGER:
 952                 if (optlen < sizeof(ling)) {
 953                         ret = -EINVAL;  /* 1003.1g */
 954                         break;
 955                 }
 956                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
 957                         ret = -EFAULT;
 958                         break;
 959                 }
 960                 if (!ling.l_onoff)
 961                         sock_reset_flag(sk, SOCK_LINGER);
 962                 else {
 963 #if (BITS_PER_LONG == 32)
 964                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 965                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 966                         else
 967 #endif
 968                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 969                         sock_set_flag(sk, SOCK_LINGER);
 970                 }
 971                 break;
 972
 973         case SO_BSDCOMPAT:
 974                 break;
 975
 976         case SO_PASSCRED:
 977                 if (valbool)
 978                         set_bit(SOCK_PASSCRED, &sock->flags);
 979                 else
 980                         clear_bit(SOCK_PASSCRED, &sock->flags);
 981                 break;
 982
 983         case SO_TIMESTAMP_OLD:
 984                 __sock_set_timestamps(sk, valbool, false, false);
 985                 break;
 986         case SO_TIMESTAMP_NEW:
 987                 __sock_set_timestamps(sk, valbool, true, false);
 988                 break;
 989         case SO_TIMESTAMPNS_OLD:
 990                 __sock_set_timestamps(sk, valbool, false, true);
 991                 break;
 992         case SO_TIMESTAMPNS_NEW:
 993                 __sock_set_timestamps(sk, valbool, true, true);
 994                 break;
 995         case SO_TIMESTAMPING_NEW:
 996         case SO_TIMESTAMPING_OLD:
 997                 if (val & ~SOF_TIMESTAMPING_MASK) {
 998                         ret = -EINVAL;
 999                         break;
1000                 }
1001
1002                 if (val & SOF_TIMESTAMPING_OPT_ID &&
1003                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1004                         if (sk->sk_protocol == IPPROTO_TCP &&
1005                             sk->sk_type == SOCK_STREAM) {
1006                                 if ((1 << sk->sk_state) &
1007                                     (TCPF_CLOSE | TCPF_LISTEN)) {
1008                                         ret = -EINVAL;
1009                                         break;
1010                                 }
1011                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
1012                         } else {
1013                                 sk->sk_tskey = 0;
1014                         }
1015                 }
1016
1017                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
1018                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1019                         ret = -EINVAL;
1020                         break;
1021                 }
1022
1023                 sk->sk_tsflags = val;
1024                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
1025
1026                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1027                         sock_enable_timestamp(sk,
1028                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
1029                 else
1030                         sock_disable_timestamp(sk,
1031                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1032                 break;
1033
1034         case SO_RCVLOWAT:
1035                 if (val < 0)
1036                         val = INT_MAX;
1037                 if (sock->ops->set_rcvlowat)
1038                         ret = sock->ops->set_rcvlowat(sk, val);
1039                 else
1040                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1041                 break;
1042
1043         case SO_RCVTIMEO_OLD:
1044         case SO_RCVTIMEO_NEW:
1045                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1046                                        optlen, optname == SO_RCVTIMEO_OLD);
1047                 break;
1048
1049         case SO_SNDTIMEO_OLD:
1050         case SO_SNDTIMEO_NEW:
1051                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1052                                        optlen, optname == SO_SNDTIMEO_OLD);
1053                 break;
1054
1055         case SO_ATTACH_FILTER: {
1056                 struct sock_fprog fprog;
1057
1058                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1059                 if (!ret)
1060                         ret = sk_attach_filter(&fprog, sk);
1061                 break;
1062         }
1063         case SO_ATTACH_BPF:
1064                 ret = -EINVAL;
1065                 if (optlen == sizeof(u32)) {
1066                         u32 ufd;
1067
1068                         ret = -EFAULT;
1069                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1070                                 break;
1071
1072                         ret = sk_attach_bpf(ufd, sk);
1073                 }
1074                 break;
1075
1076         case SO_ATTACH_REUSEPORT_CBPF: {
1077                 struct sock_fprog fprog;
1078
1079                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1080                 if (!ret)
1081                         ret = sk_reuseport_attach_filter(&fprog, sk);
1082                 break;
1083         }
1084         case SO_ATTACH_REUSEPORT_EBPF:
1085                 ret = -EINVAL;
1086                 if (optlen == sizeof(u32)) {
1087                         u32 ufd;
1088
1089                         ret = -EFAULT;
1090                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1091                                 break;
1092
1093                         ret = sk_reuseport_attach_bpf(ufd, sk);
1094                 }
1095                 break;
1096
1097         case SO_DETACH_REUSEPORT_BPF:
1098                 ret = reuseport_detach_prog(sk);
1099                 break;
1100
1101         case SO_DETACH_FILTER:
1102                 ret = sk_detach_filter(sk);
1103                 break;
1104
1105         case SO_LOCK_FILTER:
1106                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1107                         ret = -EPERM;
1108                 else
1109                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1110                 break;
1111
1112         case SO_PASSSEC:
1113                 if (valbool)
1114                         set_bit(SOCK_PASSSEC, &sock->flags);
1115                 else
1116                         clear_bit(SOCK_PASSSEC, &sock->flags);
1117                 break;
1118         case SO_MARK:
1119                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1120                         ret = -EPERM;
1121                 } else if (val != sk->sk_mark) {
1122                         sk->sk_mark = val;
1123                         sk_dst_reset(sk);
1124                 }
1125                 break;
1126
1127         case SO_RXQ_OVFL:
1128                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1129                 break;
1130
1131         case SO_WIFI_STATUS:
1132                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1133                 break;
1134
1135         case SO_PEEK_OFF:
1136                 if (sock->ops->set_peek_off)
1137                         ret = sock->ops->set_peek_off(sk, val);
1138                 else
1139                         ret = -EOPNOTSUPP;
1140                 break;
1141
1142         case SO_NOFCS:
1143                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1144                 break;
1145
1146         case SO_SELECT_ERR_QUEUE:
1147                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1148                 break;
1149
1150 #ifdef CONFIG_NET_RX_BUSY_POLL
1151         case SO_BUSY_POLL:
1152                 /* allow unprivileged users to decrease the value */
1153                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1154                         ret = -EPERM;
1155                 else {
1156                         if (val < 0)
1157                                 ret = -EINVAL;
1158                         else
1159                                 sk->sk_ll_usec = val;
1160                 }
1161                 break;
1162 #endif
1163
1164         case SO_MAX_PACING_RATE:
1165                 {
1166                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1167
1168                 if (sizeof(ulval) != sizeof(val) &&
1169                     optlen >= sizeof(ulval) &&
1170                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1171                         ret = -EFAULT;
1172                         break;
1173                 }
1174                 if (ulval != ~0UL)
1175                         cmpxchg(&sk->sk_pacing_status,
1176                                 SK_PACING_NONE,
1177                                 SK_PACING_NEEDED);
1178                 sk->sk_max_pacing_rate = ulval;
1179                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1180                 break;
1181                 }
1182         case SO_INCOMING_CPU:
1183                 WRITE_ONCE(sk->sk_incoming_cpu, val);
1184                 break;
1185
1186         case SO_CNX_ADVICE:
1187                 if (val == 1)
1188                         dst_negative_advice(sk);
1189                 break;
1190
1191         case SO_ZEROCOPY:
1192                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1193                         if (!((sk->sk_type == SOCK_STREAM &&
1194                                sk->sk_protocol == IPPROTO_TCP) ||
1195                               (sk->sk_type == SOCK_DGRAM &&
1196                                sk->sk_protocol == IPPROTO_UDP)))
1197                                 ret = -ENOTSUPP;
1198                 } else if (sk->sk_family != PF_RDS) {
1199                         ret = -ENOTSUPP;
1200                 }
1201                 if (!ret) {
1202                         if (val < 0 || val > 1)
1203                                 ret = -EINVAL;
1204                         else
1205                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1206                 }
1207                 break;
1208
1209         case SO_TXTIME:
1210                 if (optlen != sizeof(struct sock_txtime)) {
1211                         ret = -EINVAL;
1212                         break;
1213                 } else if (copy_from_sockptr(&sk_txtime, optval,
1214                            sizeof(struct sock_txtime))) {
1215                         ret = -EFAULT;
1216                         break;
1217                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1218                         ret = -EINVAL;
1219                         break;
1220                 }
1221                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1222                  * scheduler has enough safe guards.
1223                  */
1224                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1225                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1226                         ret = -EPERM;
1227                         break;
1228                 }
1229                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1230                 sk->sk_clockid = sk_txtime.clockid;
1231                 sk->sk_txtime_deadline_mode =
1232                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1233                 sk->sk_txtime_report_errors =
1234                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1235                 break;
1236
1237         case SO_BINDTOIFINDEX:
1238                 ret = sock_bindtoindex_locked(sk, val);
1239                 break;
1240
1241         default:
1242                 ret = -ENOPROTOOPT;
1243                 break;
1244         }
1245         release_sock(sk);
1246         return ret;
1247 }
1248 EXPORT_SYMBOL(sock_setsockopt);
1249
1250
1251 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1252                           struct ucred *ucred)
1253 {
1254         ucred->pid = pid_vnr(pid);
1255         ucred->uid = ucred->gid = -1;
1256         if (cred) {
1257                 struct user_namespace *current_ns = current_user_ns();
1258
1259                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1260                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1261         }
1262 }
1263
1264 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1265 {
1266         struct user_namespace *user_ns = current_user_ns();
1267         int i;
1268
1269         for (i = 0; i < src->ngroups; i++)
1270                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1271                         return -EFAULT;
1272
1273         return 0;
1274 }
1275
1276 int sock_getsockopt(struct socket *sock, int level, int optname,
1277                     char __user *optval, int __user *optlen)
1278 {
1279         struct sock *sk = sock->sk;
1280
1281         union {
1282                 int val;
1283                 u64 val64;
1284                 unsigned long ulval;
1285                 struct linger ling;
1286                 struct old_timeval32 tm32;
1287                 struct __kernel_old_timeval tm;
1288                 struct  __kernel_sock_timeval stm;
1289                 struct sock_txtime txtime;
1290         } v;
1291
1292         int lv = sizeof(int);
1293         int len;
1294
1295         if (get_user(len, optlen))
1296                 return -EFAULT;
1297         if (len < 0)
1298                 return -EINVAL;
1299
1300         memset(&v, 0, sizeof(v));
1301
1302         switch (optname) {
1303         case SO_DEBUG:
1304                 v.val = sock_flag(sk, SOCK_DBG);
1305                 break;
1306
1307         case SO_DONTROUTE:
1308                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1309                 break;
1310
1311         case SO_BROADCAST:
1312                 v.val = sock_flag(sk, SOCK_BROADCAST);
1313                 break;
1314
1315         case SO_SNDBUF:
1316                 v.val = sk->sk_sndbuf;
1317                 break;
1318
1319         case SO_RCVBUF:
1320                 v.val = sk->sk_rcvbuf;
1321                 break;
1322
1323         case SO_REUSEADDR:
1324                 v.val = sk->sk_reuse;
1325                 break;
1326
1327         case SO_REUSEPORT:
1328                 v.val = sk->sk_reuseport;
1329                 break;
1330
1331         case SO_KEEPALIVE:
1332                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1333                 break;
1334
1335         case SO_TYPE:
1336                 v.val = sk->sk_type;
1337                 break;
1338
1339         case SO_PROTOCOL:
1340                 v.val = sk->sk_protocol;
1341                 break;
1342
1343         case SO_DOMAIN:
1344                 v.val = sk->sk_family;
1345                 break;
1346
1347         case SO_ERROR:
1348                 v.val = -sock_error(sk);
1349                 if (v.val == 0)
1350                         v.val = xchg(&sk->sk_err_soft, 0);
1351                 break;
1352
1353         case SO_OOBINLINE:
1354                 v.val = sock_flag(sk, SOCK_URGINLINE);
1355                 break;
1356
1357         case SO_NO_CHECK:
1358                 v.val = sk->sk_no_check_tx;
1359                 break;
1360
1361         case SO_PRIORITY:
1362                 v.val = sk->sk_priority;
1363                 break;
1364
1365         case SO_LINGER:
1366                 lv              = sizeof(v.ling);
1367                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1368                 v.ling.l_linger = sk->sk_lingertime / HZ;
1369                 break;
1370
1371         case SO_BSDCOMPAT:
1372                 break;
1373
1374         case SO_TIMESTAMP_OLD:
1375                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1376                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1377                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1378                 break;
1379
1380         case SO_TIMESTAMPNS_OLD:
1381                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1382                 break;
1383
1384         case SO_TIMESTAMP_NEW:
1385                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1386                 break;
1387
1388         case SO_TIMESTAMPNS_NEW:
1389                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1390                 break;
1391
1392         case SO_TIMESTAMPING_OLD:
1393                 v.val = sk->sk_tsflags;
1394                 break;
1395
1396         case SO_RCVTIMEO_OLD:
1397         case SO_RCVTIMEO_NEW:
1398                 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1399                 break;
1400
1401         case SO_SNDTIMEO_OLD:
1402         case SO_SNDTIMEO_NEW:
1403                 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1404                 break;
1405
1406         case SO_RCVLOWAT:
1407                 v.val = sk->sk_rcvlowat;
1408                 break;
1409
1410         case SO_SNDLOWAT:
1411                 v.val = 1;
1412                 break;
1413
1414         case SO_PASSCRED:
1415                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1416                 break;
1417
1418         case SO_PEERCRED:
1419         {
1420                 struct ucred peercred;
1421                 if (len > sizeof(peercred))
1422                         len = sizeof(peercred);
1423                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1424                 if (copy_to_user(optval, &peercred, len))
1425                         return -EFAULT;
1426                 goto lenout;
1427         }
1428
1429         case SO_PEERGROUPS:
1430         {
1431                 int ret, n;
1432
1433                 if (!sk->sk_peer_cred)
1434                         return -ENODATA;
1435
1436                 n = sk->sk_peer_cred->group_info->ngroups;
1437                 if (len < n * sizeof(gid_t)) {
1438                         len = n * sizeof(gid_t);
1439                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1440                 }
1441                 len = n * sizeof(gid_t);
1442
1443                 ret = groups_to_user((gid_t __user *)optval,
1444                                      sk->sk_peer_cred->group_info);
1445                 if (ret)
1446                         return ret;
1447                 goto lenout;
1448         }
1449
1450         case SO_PEERNAME:
1451         {
1452                 char address[128];
1453
1454                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1455                 if (lv < 0)
1456                         return -ENOTCONN;
1457                 if (lv < len)
1458                         return -EINVAL;
1459                 if (copy_to_user(optval, address, len))
1460                         return -EFAULT;
1461                 goto lenout;
1462         }
1463
1464         /* Dubious BSD thing... Probably nobody even uses it, but
1465          * the UNIX standard wants it for whatever reason... -DaveM
1466          */
1467         case SO_ACCEPTCONN:
1468                 v.val = sk->sk_state == TCP_LISTEN;
1469                 break;
1470
1471         case SO_PASSSEC:
1472                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1473                 break;
1474
1475         case SO_PEERSEC:
1476                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1477
1478         case SO_MARK:
1479                 v.val = sk->sk_mark;
1480                 break;
1481
1482         case SO_RXQ_OVFL:
1483                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1484                 break;
1485
1486         case SO_WIFI_STATUS:
1487                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1488                 break;
1489
1490         case SO_PEEK_OFF:
1491                 if (!sock->ops->set_peek_off)
1492                         return -EOPNOTSUPP;
1493
1494                 v.val = sk->sk_peek_off;
1495                 break;
1496         case SO_NOFCS:
1497                 v.val = sock_flag(sk, SOCK_NOFCS);
1498                 break;
1499
1500         case SO_BINDTODEVICE:
1501                 return sock_getbindtodevice(sk, optval, optlen, len);
1502
1503         case SO_GET_FILTER:
1504                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1505                 if (len < 0)
1506                         return len;
1507
1508                 goto lenout;
1509
1510         case SO_LOCK_FILTER:
1511                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1512                 break;
1513
1514         case SO_BPF_EXTENSIONS:
1515                 v.val = bpf_tell_extensions();
1516                 break;
1517
1518         case SO_SELECT_ERR_QUEUE:
1519                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1520                 break;
1521
1522 #ifdef CONFIG_NET_RX_BUSY_POLL
1523         case SO_BUSY_POLL:
1524                 v.val = sk->sk_ll_usec;
1525                 break;
1526 #endif
1527
1528         case SO_MAX_PACING_RATE:
1529                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1530                         lv = sizeof(v.ulval);
1531                         v.ulval = sk->sk_max_pacing_rate;
1532                 } else {
1533                         /* 32bit version */
1534                         v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1535                 }
1536                 break;
1537
1538         case SO_INCOMING_CPU:
1539                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1540                 break;
1541
1542         case SO_MEMINFO:
1543         {
1544                 u32 meminfo[SK_MEMINFO_VARS];
1545
1546                 sk_get_meminfo(sk, meminfo);
1547
1548                 len = min_t(unsigned int, len, sizeof(meminfo));
1549                 if (copy_to_user(optval, &meminfo, len))
1550                         return -EFAULT;
1551
1552                 goto lenout;
1553         }
1554
1555 #ifdef CONFIG_NET_RX_BUSY_POLL
1556         case SO_INCOMING_NAPI_ID:
1557                 v.val = READ_ONCE(sk->sk_napi_id);
1558
1559                 /* aggregate non-NAPI IDs down to 0 */
1560                 if (v.val < MIN_NAPI_ID)
1561                         v.val = 0;
1562
1563                 break;
1564 #endif
1565
1566         case SO_COOKIE:
1567                 lv = sizeof(u64);
1568                 if (len < lv)
1569                         return -EINVAL;
1570                 v.val64 = sock_gen_cookie(sk);
1571                 break;
1572
1573         case SO_ZEROCOPY:
1574                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1575                 break;
1576
1577         case SO_TXTIME:
1578                 lv = sizeof(v.txtime);
1579                 v.txtime.clockid = sk->sk_clockid;
1580                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1581                                   SOF_TXTIME_DEADLINE_MODE : 0;
1582                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1583                                   SOF_TXTIME_REPORT_ERRORS : 0;
1584                 break;
1585
1586         case SO_BINDTOIFINDEX:
1587                 v.val = sk->sk_bound_dev_if;
1588                 break;
1589
1590         default:
1591                 /* We implement the SO_SNDLOWAT etc to not be settable
1592                  * (1003.1g 7).
1593                  */
1594                 return -ENOPROTOOPT;
1595         }
1596
1597         if (len > lv)
1598                 len = lv;
1599         if (copy_to_user(optval, &v, len))
1600                 return -EFAULT;
1601 lenout:
1602         if (put_user(len, optlen))
1603                 return -EFAULT;
1604         return 0;
1605 }
1606
1607 /*
1608  * Initialize an sk_lock.
1609  *
1610  * (We also register the sk_lock with the lock validator.)
1611  */
1612 static inline void sock_lock_init(struct sock *sk)
1613 {
1614         if (sk->sk_kern_sock)
1615                 sock_lock_init_class_and_name(
1616                         sk,
1617                         af_family_kern_slock_key_strings[sk->sk_family],
1618                         af_family_kern_slock_keys + sk->sk_family,
1619                         af_family_kern_key_strings[sk->sk_family],
1620                         af_family_kern_keys + sk->sk_family);
1621         else
1622                 sock_lock_init_class_and_name(
1623                         sk,
1624                         af_family_slock_key_strings[sk->sk_family],
1625                         af_family_slock_keys + sk->sk_family,
1626                         af_family_key_strings[sk->sk_family],
1627                         af_family_keys + sk->sk_family);
1628 }
1629
1630 /*
1631  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1632  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1633  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1634  */
1635 static void sock_copy(struct sock *nsk, const struct sock *osk)
1636 {
1637         const struct proto *prot = READ_ONCE(osk->sk_prot);
1638 #ifdef CONFIG_SECURITY_NETWORK
1639         void *sptr = nsk->sk_security;
1640 #endif
1641         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1642
1643         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1644                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1645
1646 #ifdef CONFIG_SECURITY_NETWORK
1647         nsk->sk_security = sptr;
1648         security_sk_clone(osk, nsk);
1649 #endif
1650 }
1651
1652 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1653                 int family)
1654 {
1655         struct sock *sk;
1656         struct kmem_cache *slab;
1657
1658         slab = prot->slab;
1659         if (slab != NULL) {
1660                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1661                 if (!sk)
1662                         return sk;
1663                 if (want_init_on_alloc(priority))
1664                         sk_prot_clear_nulls(sk, prot->obj_size);
1665         } else
1666                 sk = kmalloc(prot->obj_size, priority);
1667
1668         if (sk != NULL) {
1669                 if (security_sk_alloc(sk, family, priority))
1670                         goto out_free;
1671
1672                 if (!try_module_get(prot->owner))
1673                         goto out_free_sec;
1674                 sk_tx_queue_clear(sk);
1675         }
1676
1677         return sk;
1678
1679 out_free_sec:
1680         security_sk_free(sk);
1681 out_free:
1682         if (slab != NULL)
1683                 kmem_cache_free(slab, sk);
1684         else
1685                 kfree(sk);
1686         return NULL;
1687 }
1688
1689 static void sk_prot_free(struct proto *prot, struct sock *sk)
1690 {
1691         struct kmem_cache *slab;
1692         struct module *owner;
1693
1694         owner = prot->owner;
1695         slab = prot->slab;
1696
1697         cgroup_sk_free(&sk->sk_cgrp_data);
1698         mem_cgroup_sk_free(sk);
1699         security_sk_free(sk);
1700         if (slab != NULL)
1701                 kmem_cache_free(slab, sk);
1702         else
1703                 kfree(sk);
1704         module_put(owner);
1705 }
1706
1707 /**
1708  *      sk_alloc - All socket objects are allocated here
1709  *      @net: the applicable net namespace
1710  *      @family: protocol family
1711  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1712  *      @prot: struct proto associated with this new sock instance
1713  *      @kern: is this to be a kernel socket?
1714  */
1715 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1716                       struct proto *prot, int kern)
1717 {
1718         struct sock *sk;
1719
1720         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1721         if (sk) {
1722                 sk->sk_family = family;
1723                 /*
1724                  * See comment in struct sock definition to understand
1725                  * why we need sk_prot_creator -acme
1726                  */
1727                 sk->sk_prot = sk->sk_prot_creator = prot;
1728                 sk->sk_kern_sock = kern;
1729                 sock_lock_init(sk);
1730                 sk->sk_net_refcnt = kern ? 0 : 1;
1731                 if (likely(sk->sk_net_refcnt)) {
1732                         get_net(net);
1733                         sock_inuse_add(net, 1);
1734                 }
1735
1736                 sock_net_set(sk, net);
1737                 refcount_set(&sk->sk_wmem_alloc, 1);
1738
1739                 mem_cgroup_sk_alloc(sk);
1740                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1741                 sock_update_classid(&sk->sk_cgrp_data);
1742                 sock_update_netprioidx(&sk->sk_cgrp_data);
1743                 sk_tx_queue_clear(sk);
1744         }
1745
1746         return sk;
1747 }
1748 EXPORT_SYMBOL(sk_alloc);
1749
1750 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1751  * grace period. This is the case for UDP sockets and TCP listeners.
1752  */
1753 static void __sk_destruct(struct rcu_head *head)
1754 {
1755         struct sock *sk = container_of(head, struct sock, sk_rcu);
1756         struct sk_filter *filter;
1757
1758         if (sk->sk_destruct)
1759                 sk->sk_destruct(sk);
1760
1761         filter = rcu_dereference_check(sk->sk_filter,
1762                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1763         if (filter) {
1764                 sk_filter_uncharge(sk, filter);
1765                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1766         }
1767
1768         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1769
1770 #ifdef CONFIG_BPF_SYSCALL
1771         bpf_sk_storage_free(sk);
1772 #endif
1773
1774         if (atomic_read(&sk->sk_omem_alloc))
1775                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1776                          __func__, atomic_read(&sk->sk_omem_alloc));
1777
1778         if (sk->sk_frag.page) {
1779                 put_page(sk->sk_frag.page);
1780                 sk->sk_frag.page = NULL;
1781         }
1782
1783         if (sk->sk_peer_cred)
1784                 put_cred(sk->sk_peer_cred);
1785         put_pid(sk->sk_peer_pid);
1786         if (likely(sk->sk_net_refcnt))
1787                 put_net(sock_net(sk));
1788         sk_prot_free(sk->sk_prot_creator, sk);
1789 }
1790
1791 void sk_destruct(struct sock *sk)
1792 {
1793         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1794
1795         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1796                 reuseport_detach_sock(sk);
1797                 use_call_rcu = true;
1798         }
1799
1800         if (use_call_rcu)
1801                 call_rcu(&sk->sk_rcu, __sk_destruct);
1802         else
1803                 __sk_destruct(&sk->sk_rcu);
1804 }
1805
1806 static void __sk_free(struct sock *sk)
1807 {
1808         if (likely(sk->sk_net_refcnt))
1809                 sock_inuse_add(sock_net(sk), -1);
1810
1811         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1812                 sock_diag_broadcast_destroy(sk);
1813         else
1814                 sk_destruct(sk);
1815 }
1816
1817 void sk_free(struct sock *sk)
1818 {
1819         /*
1820          * We subtract one from sk_wmem_alloc and can know if
1821          * some packets are still in some tx queue.
1822          * If not null, sock_wfree() will call __sk_free(sk) later
1823          */
1824         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1825                 __sk_free(sk);
1826 }
1827 EXPORT_SYMBOL(sk_free);
1828
1829 static void sk_init_common(struct sock *sk)
1830 {
1831         skb_queue_head_init(&sk->sk_receive_queue);
1832         skb_queue_head_init(&sk->sk_write_queue);
1833         skb_queue_head_init(&sk->sk_error_queue);
1834
1835         rwlock_init(&sk->sk_callback_lock);
1836         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1837                         af_rlock_keys + sk->sk_family,
1838                         af_family_rlock_key_strings[sk->sk_family]);
1839         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1840                         af_wlock_keys + sk->sk_family,
1841                         af_family_wlock_key_strings[sk->sk_family]);
1842         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1843                         af_elock_keys + sk->sk_family,
1844                         af_family_elock_key_strings[sk->sk_family]);
1845         lockdep_set_class_and_name(&sk->sk_callback_lock,
1846                         af_callback_keys + sk->sk_family,
1847                         af_family_clock_key_strings[sk->sk_family]);
1848 }
1849
1850 /**
1851  *      sk_clone_lock - clone a socket, and lock its clone
1852  *      @sk: the socket to clone
1853  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1854  *
1855  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1856  */
1857 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1858 {
1859         struct proto *prot = READ_ONCE(sk->sk_prot);
1860         struct sock *newsk;
1861         bool is_charged = true;
1862
1863         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1864         if (newsk != NULL) {
1865                 struct sk_filter *filter;
1866
1867                 sock_copy(newsk, sk);
1868
1869                 newsk->sk_prot_creator = prot;
1870
1871                 /* SANITY */
1872                 if (likely(newsk->sk_net_refcnt))
1873                         get_net(sock_net(newsk));
1874                 sk_node_init(&newsk->sk_node);
1875                 sock_lock_init(newsk);
1876                 bh_lock_sock(newsk);
1877                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1878                 newsk->sk_backlog.len = 0;
1879
1880                 atomic_set(&newsk->sk_rmem_alloc, 0);
1881                 /*
1882                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1883                  */
1884                 refcount_set(&newsk->sk_wmem_alloc, 1);
1885                 atomic_set(&newsk->sk_omem_alloc, 0);
1886                 sk_init_common(newsk);
1887
1888                 newsk->sk_dst_cache     = NULL;
1889                 newsk->sk_dst_pending_confirm = 0;
1890                 newsk->sk_wmem_queued   = 0;
1891                 newsk->sk_forward_alloc = 0;
1892                 atomic_set(&newsk->sk_drops, 0);
1893                 newsk->sk_send_head     = NULL;
1894                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1895                 atomic_set(&newsk->sk_zckey, 0);
1896
1897                 sock_reset_flag(newsk, SOCK_DONE);
1898
1899                 /* sk->sk_memcg will be populated at accept() time */
1900                 newsk->sk_memcg = NULL;
1901
1902                 cgroup_sk_clone(&newsk->sk_cgrp_data);
1903
1904                 rcu_read_lock();
1905                 filter = rcu_dereference(sk->sk_filter);
1906                 if (filter != NULL)
1907                         /* though it's an empty new sock, the charging may fail
1908                          * if sysctl_optmem_max was changed between creation of
1909                          * original socket and cloning
1910                          */
1911                         is_charged = sk_filter_charge(newsk, filter);
1912                 RCU_INIT_POINTER(newsk->sk_filter, filter);
1913                 rcu_read_unlock();
1914
1915                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1916                         /* We need to make sure that we don't uncharge the new
1917                          * socket if we couldn't charge it in the first place
1918                          * as otherwise we uncharge the parent's filter.
1919                          */
1920                         if (!is_charged)
1921                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1922                         sk_free_unlock_clone(newsk);
1923                         newsk = NULL;
1924                         goto out;
1925                 }
1926                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1927
1928                 if (bpf_sk_storage_clone(sk, newsk)) {
1929                         sk_free_unlock_clone(newsk);
1930                         newsk = NULL;
1931                         goto out;
1932                 }
1933
1934                 /* Clear sk_user_data if parent had the pointer tagged
1935                  * as not suitable for copying when cloning.
1936                  */
1937                 if (sk_user_data_is_nocopy(newsk))
1938                         newsk->sk_user_data = NULL;
1939
1940                 newsk->sk_err      = 0;
1941                 newsk->sk_err_soft = 0;
1942                 newsk->sk_priority = 0;
1943                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1944                 if (likely(newsk->sk_net_refcnt))
1945                         sock_inuse_add(sock_net(newsk), 1);
1946
1947                 /*
1948                  * Before updating sk_refcnt, we must commit prior changes to memory
1949                  * (Documentation/RCU/rculist_nulls.rst for details)
1950                  */
1951                 smp_wmb();
1952                 refcount_set(&newsk->sk_refcnt, 2);
1953
1954                 /*
1955                  * Increment the counter in the same struct proto as the master
1956                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1957                  * is the same as sk->sk_prot->socks, as this field was copied
1958                  * with memcpy).
1959                  *
1960                  * This _changes_ the previous behaviour, where
1961                  * tcp_create_openreq_child always was incrementing the
1962                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1963                  * to be taken into account in all callers. -acme
1964                  */
1965                 sk_refcnt_debug_inc(newsk);
1966                 sk_set_socket(newsk, NULL);
1967                 sk_tx_queue_clear(newsk);
1968                 RCU_INIT_POINTER(newsk->sk_wq, NULL);
1969
1970                 if (newsk->sk_prot->sockets_allocated)
1971                         sk_sockets_allocated_inc(newsk);
1972
1973                 if (sock_needs_netstamp(sk) &&
1974                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1975                         net_enable_timestamp();
1976         }
1977 out:
1978         return newsk;
1979 }
1980 EXPORT_SYMBOL_GPL(sk_clone_lock);
1981
1982 void sk_free_unlock_clone(struct sock *sk)
1983 {
1984         /* It is still raw copy of parent, so invalidate
1985          * destructor and make plain sk_free() */
1986         sk->sk_destruct = NULL;
1987         bh_unlock_sock(sk);
1988         sk_free(sk);
1989 }
1990 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1991
1992 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1993 {
1994         u32 max_segs = 1;
1995
1996         sk_dst_set(sk, dst);
1997         sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1998         if (sk->sk_route_caps & NETIF_F_GSO)
1999                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2000         sk->sk_route_caps &= ~sk->sk_route_nocaps;
2001         if (sk_can_gso(sk)) {
2002                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2003                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2004                 } else {
2005                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2006                         sk->sk_gso_max_size = dst->dev->gso_max_size;
2007                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2008                 }
2009         }
2010         sk->sk_gso_max_segs = max_segs;
2011 }
2012 EXPORT_SYMBOL_GPL(sk_setup_caps);
2013
2014 /*
2015  *      Simple resource managers for sockets.
2016  */
2017
2018
2019 /*
2020  * Write buffer destructor automatically called from kfree_skb.
2021  */
2022 void sock_wfree(struct sk_buff *skb)
2023 {
2024         struct sock *sk = skb->sk;
2025         unsigned int len = skb->truesize;
2026
2027         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2028                 /*
2029                  * Keep a reference on sk_wmem_alloc, this will be released
2030                  * after sk_write_space() call
2031                  */
2032                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2033                 sk->sk_write_space(sk);
2034                 len = 1;
2035         }
2036         /*
2037          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2038          * could not do because of in-flight packets
2039          */
2040         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2041                 __sk_free(sk);
2042 }
2043 EXPORT_SYMBOL(sock_wfree);
2044
2045 /* This variant of sock_wfree() is used by TCP,
2046  * since it sets SOCK_USE_WRITE_QUEUE.
2047  */
2048 void __sock_wfree(struct sk_buff *skb)
2049 {
2050         struct sock *sk = skb->sk;
2051
2052         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2053                 __sk_free(sk);
2054 }
2055
2056 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2057 {
2058         skb_orphan(skb);
2059         skb->sk = sk;
2060 #ifdef CONFIG_INET
2061         if (unlikely(!sk_fullsock(sk))) {
2062                 skb->destructor = sock_edemux;
2063                 sock_hold(sk);
2064                 return;
2065         }
2066 #endif
2067         skb->destructor = sock_wfree;
2068         skb_set_hash_from_sk(skb, sk);
2069         /*
2070          * We used to take a refcount on sk, but following operation
2071          * is enough to guarantee sk_free() wont free this sock until
2072          * all in-flight packets are completed
2073          */
2074         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2075 }
2076 EXPORT_SYMBOL(skb_set_owner_w);
2077
2078 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2079 {
2080 #ifdef CONFIG_TLS_DEVICE
2081         /* Drivers depend on in-order delivery for crypto offload,
2082          * partial orphan breaks out-of-order-OK logic.
2083          */
2084         if (skb->decrypted)
2085                 return false;
2086 #endif
2087         return (skb->destructor == sock_wfree ||
2088                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2089 }
2090
2091 /* This helper is used by netem, as it can hold packets in its
2092  * delay queue. We want to allow the owner socket to send more
2093  * packets, as if they were already TX completed by a typical driver.
2094  * But we also want to keep skb->sk set because some packet schedulers
2095  * rely on it (sch_fq for example).
2096  */
2097 void skb_orphan_partial(struct sk_buff *skb)
2098 {
2099         if (skb_is_tcp_pure_ack(skb))
2100                 return;
2101
2102         if (can_skb_orphan_partial(skb)) {
2103                 struct sock *sk = skb->sk;
2104
2105                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
2106                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
2107                         skb->destructor = sock_efree;
2108                 }
2109         } else {
2110                 skb_orphan(skb);
2111         }
2112 }
2113 EXPORT_SYMBOL(skb_orphan_partial);
2114
2115 /*
2116  * Read buffer destructor automatically called from kfree_skb.
2117  */
2118 void sock_rfree(struct sk_buff *skb)
2119 {
2120         struct sock *sk = skb->sk;
2121         unsigned int len = skb->truesize;
2122
2123         atomic_sub(len, &sk->sk_rmem_alloc);
2124         sk_mem_uncharge(sk, len);
2125 }
2126 EXPORT_SYMBOL(sock_rfree);
2127
2128 /*
2129  * Buffer destructor for skbs that are not used directly in read or write
2130  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2131  */
2132 void sock_efree(struct sk_buff *skb)
2133 {
2134         sock_put(skb->sk);
2135 }
2136 EXPORT_SYMBOL(sock_efree);
2137
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * A reference is only dropped when sk_is_refcounted() reports that one
 * was taken for this socket.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (sk_is_refcounted(sk))
		sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2149
2150 kuid_t sock_i_uid(struct sock *sk)
2151 {
2152         kuid_t uid;
2153
2154         read_lock_bh(&sk->sk_callback_lock);
2155         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2156         read_unlock_bh(&sk->sk_callback_lock);
2157         return uid;
2158 }
2159 EXPORT_SYMBOL(sock_i_uid);
2160
2161 unsigned long sock_i_ino(struct sock *sk)
2162 {
2163         unsigned long ino;
2164
2165         read_lock_bh(&sk->sk_callback_lock);
2166         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2167         read_unlock_bh(&sk->sk_callback_lock);
2168         return ino;
2169 }
2170 EXPORT_SYMBOL(sock_i_ino);
2171
2172 /*
2173  * Allocate a skb from the socket's send buffer.
2174  */
2175 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2176                              gfp_t priority)
2177 {
2178         if (force ||
2179             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2180                 struct sk_buff *skb = alloc_skb(size, priority);
2181
2182                 if (skb) {
2183                         skb_set_owner_w(skb, sk);
2184                         return skb;
2185                 }
2186         }
2187         return NULL;
2188 }
2189 EXPORT_SYMBOL(sock_wmalloc);
2190
2191 static void sock_ofree(struct sk_buff *skb)
2192 {
2193         struct sock *sk = skb->sk;
2194
2195         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2196 }
2197
2198 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2199                              gfp_t priority)
2200 {
2201         struct sk_buff *skb;
2202
2203         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2204         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2205             sysctl_optmem_max)
2206                 return NULL;
2207
2208         skb = alloc_skb(size, priority);
2209         if (!skb)
2210                 return NULL;
2211
2212         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2213         skb->sk = sk;
2214         skb->destructor = sock_ofree;
2215         return skb;
2216 }
2217
2218 /*
2219  * Allocate a memory block from the socket's option memory buffer.
2220  */
2221 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2222 {
2223         if ((unsigned int)size <= sysctl_optmem_max &&
2224             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2225                 void *mem;
2226                 /* First do the add, to avoid the race if kmalloc
2227                  * might sleep.
2228                  */
2229                 atomic_add(size, &sk->sk_omem_alloc);
2230                 mem = kmalloc(size, priority);
2231                 if (mem)
2232                         return mem;
2233                 atomic_sub(size, &sk->sk_omem_alloc);
2234         }
2235         return NULL;
2236 }
2237 EXPORT_SYMBOL(sock_kmalloc);
2238
2239 /* Free an option memory block. Note, we actually want the inline
2240  * here as this allows gcc to detect the nullify and fold away the
2241  * condition entirely.
2242  */
2243 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2244                                   const bool nullify)
2245 {
2246         if (WARN_ON_ONCE(!mem))
2247                 return;
2248         if (nullify)
2249                 kfree_sensitive(mem);
2250         else
2251                 kfree(mem);
2252         atomic_sub(size, &sk->sk_omem_alloc);
2253 }
2254
2255 void sock_kfree_s(struct sock *sk, void *mem, int size)
2256 {
2257         __sock_kfree_s(sk, mem, size, false);
2258 }
2259 EXPORT_SYMBOL(sock_kfree_s);
2260
2261 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2262 {
2263         __sock_kfree_s(sk, mem, size, true);
2264 }
2265 EXPORT_SYMBOL(sock_kzfree_s);
2266
2267 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2268    I think, these locks should be removed for datagram sockets.
2269  */
2270 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2271 {
2272         DEFINE_WAIT(wait);
2273
2274         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2275         for (;;) {
2276                 if (!timeo)
2277                         break;
2278                 if (signal_pending(current))
2279                         break;
2280                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2281                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2282                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2283                         break;
2284                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2285                         break;
2286                 if (sk->sk_err)
2287                         break;
2288                 timeo = schedule_timeout(timeo);
2289         }
2290         finish_wait(sk_sleep(sk), &wait);
2291         return timeo;
2292 }
2293
2294
2295 /*
2296  *      Generic send/receive buffer handlers
2297  */
2298
2299 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2300                                      unsigned long data_len, int noblock,
2301                                      int *errcode, int max_page_order)
2302 {
2303         struct sk_buff *skb;
2304         long timeo;
2305         int err;
2306
2307         timeo = sock_sndtimeo(sk, noblock);
2308         for (;;) {
2309                 err = sock_error(sk);
2310                 if (err != 0)
2311                         goto failure;
2312
2313                 err = -EPIPE;
2314                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2315                         goto failure;
2316
2317                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2318                         break;
2319
2320                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2321                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2322                 err = -EAGAIN;
2323                 if (!timeo)
2324                         goto failure;
2325                 if (signal_pending(current))
2326                         goto interrupted;
2327                 timeo = sock_wait_for_wmem(sk, timeo);
2328         }
2329         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2330                                    errcode, sk->sk_allocation);
2331         if (skb)
2332                 skb_set_owner_w(skb, sk);
2333         return skb;
2334
2335 interrupted:
2336         err = sock_intr_errno(timeo);
2337 failure:
2338         *errcode = err;
2339         return NULL;
2340 }
2341 EXPORT_SYMBOL(sock_alloc_send_pskb);
2342
/*
 * Convenience wrapper around sock_alloc_send_pskb() for a skb of @size
 * bytes with no paged data (data_len and max_page_order both 0).
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2349
2350 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2351                      struct sockcm_cookie *sockc)
2352 {
2353         u32 tsflags;
2354
2355         switch (cmsg->cmsg_type) {
2356         case SO_MARK:
2357                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2358                         return -EPERM;
2359                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2360                         return -EINVAL;
2361                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2362                 break;
2363         case SO_TIMESTAMPING_OLD:
2364                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2365                         return -EINVAL;
2366
2367                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2368                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2369                         return -EINVAL;
2370
2371                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2372                 sockc->tsflags |= tsflags;
2373                 break;
2374         case SCM_TXTIME:
2375                 if (!sock_flag(sk, SOCK_TXTIME))
2376                         return -EINVAL;
2377                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2378                         return -EINVAL;
2379                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2380                 break;
2381         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2382         case SCM_RIGHTS:
2383         case SCM_CREDENTIALS:
2384                 break;
2385         default:
2386                 return -EINVAL;
2387         }
2388         return 0;
2389 }
2390 EXPORT_SYMBOL(__sock_cmsg_send);
2391
2392 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2393                    struct sockcm_cookie *sockc)
2394 {
2395         struct cmsghdr *cmsg;
2396         int ret;
2397
2398         for_each_cmsghdr(cmsg, msg) {
2399                 if (!CMSG_OK(msg, cmsg))
2400                         return -EINVAL;
2401                 if (cmsg->cmsg_level != SOL_SOCKET)
2402                         continue;
2403                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2404                 if (ret)
2405                         return ret;
2406         }
2407         return 0;
2408 }
2409 EXPORT_SYMBOL(sock_cmsg_send);
2410
2411 static void sk_enter_memory_pressure(struct sock *sk)
2412 {
2413         if (!sk->sk_prot->enter_memory_pressure)
2414                 return;
2415
2416         sk->sk_prot->enter_memory_pressure(sk);
2417 }
2418
2419 static void sk_leave_memory_pressure(struct sock *sk)
2420 {
2421         if (sk->sk_prot->leave_memory_pressure) {
2422                 sk->sk_prot->leave_memory_pressure(sk);
2423         } else {
2424                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2425
2426                 if (memory_pressure && READ_ONCE(*memory_pressure))
2427                         WRITE_ONCE(*memory_pressure, 0);
2428         }
2429 }
2430
2431 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2432 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2433
2434 /**
2435  * skb_page_frag_refill - check that a page_frag contains enough room
2436  * @sz: minimum size of the fragment we want to get
2437  * @pfrag: pointer to page_frag
2438  * @gfp: priority for memory allocation
2439  *
2440  * Note: While this allocator tries to use high order pages, there is
2441  * no guarantee that allocations succeed. Therefore, @sz MUST be
2442  * less or equal than PAGE_SIZE.
2443  */
2444 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2445 {
2446         if (pfrag->page) {
2447                 if (page_ref_count(pfrag->page) == 1) {
2448                         pfrag->offset = 0;
2449                         return true;
2450                 }
2451                 if (pfrag->offset + sz <= pfrag->size)
2452                         return true;
2453                 put_page(pfrag->page);
2454         }
2455
2456         pfrag->offset = 0;
2457         if (SKB_FRAG_PAGE_ORDER &&
2458             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2459                 /* Avoid direct reclaim but allow kswapd to wake */
2460                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2461                                           __GFP_COMP | __GFP_NOWARN |
2462                                           __GFP_NORETRY,
2463                                           SKB_FRAG_PAGE_ORDER);
2464                 if (likely(pfrag->page)) {
2465                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2466                         return true;
2467                 }
2468         }
2469         pfrag->page = alloc_page(gfp);
2470         if (likely(pfrag->page)) {
2471                 pfrag->size = PAGE_SIZE;
2472                 return true;
2473         }
2474         return false;
2475 }
2476 EXPORT_SYMBOL(skb_page_frag_refill);
2477
2478 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2479 {
2480         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2481                 return true;
2482
2483         sk_enter_memory_pressure(sk);
2484         sk_stream_moderate_sndbuf(sk);
2485         return false;
2486 }
2487 EXPORT_SYMBOL(sk_page_frag_refill);
2488
2489 static void __lock_sock(struct sock *sk)
2490         __releases(&sk->sk_lock.slock)
2491         __acquires(&sk->sk_lock.slock)
2492 {
2493         DEFINE_WAIT(wait);
2494
2495         for (;;) {
2496                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2497                                         TASK_UNINTERRUPTIBLE);
2498                 spin_unlock_bh(&sk->sk_lock.slock);
2499                 schedule();
2500                 spin_lock_bh(&sk->sk_lock.slock);
2501                 if (!sock_owned_by_user(sk))
2502                         break;
2503         }
2504         finish_wait(&sk->sk_lock.wq, &wait);
2505 }
2506
2507 void __release_sock(struct sock *sk)
2508         __releases(&sk->sk_lock.slock)
2509         __acquires(&sk->sk_lock.slock)
2510 {
2511         struct sk_buff *skb, *next;
2512
2513         while ((skb = sk->sk_backlog.head) != NULL) {
2514                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2515
2516                 spin_unlock_bh(&sk->sk_lock.slock);
2517
2518                 do {
2519                         next = skb->next;
2520                         prefetch(next);
2521                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2522                         skb_mark_not_on_list(skb);
2523                         sk_backlog_rcv(sk, skb);
2524
2525                         cond_resched();
2526
2527                         skb = next;
2528                 } while (skb != NULL);
2529
2530                 spin_lock_bh(&sk->sk_lock.slock);
2531         }
2532
2533         /*
2534          * Doing the zeroing here guarantee we can not loop forever
2535          * while a wild producer attempts to flood us.
2536          */
2537         sk->sk_backlog.len = 0;
2538 }
2539
2540 void __sk_flush_backlog(struct sock *sk)
2541 {
2542         spin_lock_bh(&sk->sk_lock.slock);
2543         __release_sock(sk);
2544         spin_unlock_bh(&sk->sk_lock.slock);
2545 }
2546
2547 /**
2548  * sk_wait_data - wait for data to arrive at sk_receive_queue
2549  * @sk:    sock to wait on
2550  * @timeo: for how long
2551  * @skb:   last skb seen on sk_receive_queue
2552  *
2553  * Now socket state including sk->sk_err is changed only under lock,
2554  * hence we may omit checks after joining wait queue.
2555  * We check receive queue before schedule() only as optimization;
2556  * it is very likely that release_sock() added new data.
2557  */
2558 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2559 {
2560         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2561         int rc;
2562
2563         add_wait_queue(sk_sleep(sk), &wait);
2564         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2565         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2566         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2567         remove_wait_queue(sk_sleep(sk), &wait);
2568         return rc;
2569 }
2570 EXPORT_SYMBOL(sk_wait_data);
2571
2572 /**
2573  *      __sk_mem_raise_allocated - increase memory_allocated
2574  *      @sk: socket
2575  *      @size: memory size to allocate
2576  *      @amt: pages to allocate
2577  *      @kind: allocation type
2578  *
2579  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2580  */
2581 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2582 {
2583         struct proto *prot = sk->sk_prot;
2584         long allocated = sk_memory_allocated_add(sk, amt);
2585         bool charged = true;
2586
2587         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2588             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2589                 goto suppress_allocation;
2590
2591         /* Under limit. */
2592         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2593                 sk_leave_memory_pressure(sk);
2594                 return 1;
2595         }
2596
2597         /* Under pressure. */
2598         if (allocated > sk_prot_mem_limits(sk, 1))
2599                 sk_enter_memory_pressure(sk);
2600
2601         /* Over hard limit. */
2602         if (allocated > sk_prot_mem_limits(sk, 2))
2603                 goto suppress_allocation;
2604
2605         /* guarantee minimum buffer size under pressure */
2606         if (kind == SK_MEM_RECV) {
2607                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2608                         return 1;
2609
2610         } else { /* SK_MEM_SEND */
2611                 int wmem0 = sk_get_wmem0(sk, prot);
2612
2613                 if (sk->sk_type == SOCK_STREAM) {
2614                         if (sk->sk_wmem_queued < wmem0)
2615                                 return 1;
2616                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2617                                 return 1;
2618                 }
2619         }
2620
2621         if (sk_has_memory_pressure(sk)) {
2622                 u64 alloc;
2623
2624                 if (!sk_under_memory_pressure(sk))
2625                         return 1;
2626                 alloc = sk_sockets_allocated_read_positive(sk);
2627                 if (sk_prot_mem_limits(sk, 2) > alloc *
2628                     sk_mem_pages(sk->sk_wmem_queued +
2629                                  atomic_read(&sk->sk_rmem_alloc) +
2630                                  sk->sk_forward_alloc))
2631                         return 1;
2632         }
2633
2634 suppress_allocation:
2635
2636         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2637                 sk_stream_moderate_sndbuf(sk);
2638
2639                 /* Fail only if socket is _under_ its sndbuf.
2640                  * In this case we cannot block, so that we have to fail.
2641                  */
2642                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2643                         return 1;
2644         }
2645
2646         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2647                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2648
2649         sk_memory_allocated_sub(sk, amt);
2650
2651         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2652                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2653
2654         return 0;
2655 }
2656 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2657
2658 /**
2659  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2660  *      @sk: socket
2661  *      @size: memory size to allocate
2662  *      @kind: allocation type
2663  *
2664  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2665  *      rmem allocation. This function assumes that protocols which have
2666  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2667  */
2668 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2669 {
2670         int ret, amt = sk_mem_pages(size);
2671
2672         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2673         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2674         if (!ret)
2675                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2676         return ret;
2677 }
2678 EXPORT_SYMBOL(__sk_mem_schedule);
2679
2680 /**
2681  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2682  *      @sk: socket
2683  *      @amount: number of quanta
2684  *
2685  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2686  */
2687 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2688 {
2689         sk_memory_allocated_sub(sk, amount);
2690
2691         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2692                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2693
2694         if (sk_under_memory_pressure(sk) &&
2695             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2696                 sk_leave_memory_pressure(sk);
2697 }
2698 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2699
2700 /**
2701  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2702  *      @sk: socket
2703  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2704  */
2705 void __sk_mem_reclaim(struct sock *sk, int amount)
2706 {
2707         amount >>= SK_MEM_QUANTUM_SHIFT;
2708         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2709         __sk_mem_reduce_allocated(sk, amount);
2710 }
2711 EXPORT_SYMBOL(__sk_mem_reclaim);
2712
2713 int sk_set_peek_off(struct sock *sk, int val)
2714 {
2715         sk->sk_peek_off = val;
2716         return 0;
2717 }
2718 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2719
2720 /*
2721  * Set of default routines for initialising struct proto_ops when
2722  * the protocol does not support a particular function. In certain
2723  * cases where it makes no sense for a protocol to have a "do nothing"
2724  * function, some default processing is provided.
2725  */
2726
/* Default bind handler for protocols without bind support: -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
2731 EXPORT_SYMBOL(sock_no_bind);
2732
/* Default connect handler for protocols without connect support: -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len,
                    int flags)
{
        return -EOPNOTSUPP;
}
2738 EXPORT_SYMBOL(sock_no_connect);
2739
/* Default socketpair handler for protocols without support: -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
2744 EXPORT_SYMBOL(sock_no_socketpair);
2745
/* Default accept handler for protocols without accept support: -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock,
                   int flags, bool kern)
{
        return -EOPNOTSUPP;
}
2751 EXPORT_SYMBOL(sock_no_accept);
2752
/* Default getname handler for protocols without support: -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer)
{
        return -EOPNOTSUPP;
}
2758 EXPORT_SYMBOL(sock_no_getname);
2759
/* Default ioctl handler for protocols without ioctl support: -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
2764 EXPORT_SYMBOL(sock_no_ioctl);
2765
/* Default listen handler for protocols without listen support: -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
2770 EXPORT_SYMBOL(sock_no_listen);
2771
/* Default shutdown handler for protocols without support: -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
2776 EXPORT_SYMBOL(sock_no_shutdown);
2777
/* Default sendmsg handler for protocols without sendmsg support: -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
2782 EXPORT_SYMBOL(sock_no_sendmsg);
2783
/*
 * Default sendmsg_locked handler (takes a struct sock rather than a
 * struct socket) for protocols without support: -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
2788 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2789
/* Default recvmsg handler for protocols without recvmsg support: -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}
2795 EXPORT_SYMBOL(sock_no_recvmsg);
2796
/*
 * Default mmap handler for protocols without mmap support.
 *
 * Unlike the other sock_no_*() stubs this returns -ENODEV, the error code
 * reported when an mmap method is missing altogether.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
                 struct vm_area_struct *vma)
{
        return -ENODEV;
}
2802 EXPORT_SYMBOL(sock_no_mmap);
2803
2804 /*
2805  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2806  * various sock-based usage counts.
2807  */
2808 void __receive_sock(struct file *file)
2809 {
2810         struct socket *sock;
2811         int error;
2812
2813         /*
2814          * The resulting value of "error" is ignored here since we only
2815          * need to take action when the file is a socket and testing
2816          * "sock" for NULL is sufficient.
2817          */
2818         sock = sock_from_file(file, &error);
2819         if (sock) {
2820                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2821                 sock_update_classid(&sock->sk->sk_cgrp_data);
2822         }
2823 }
2824
2825 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2826 {
2827         ssize_t res;
2828         struct msghdr msg = {.msg_flags = flags};
2829         struct kvec iov;
2830         char *kaddr = kmap(page);
2831         iov.iov_base = kaddr + offset;
2832         iov.iov_len = size;
2833         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2834         kunmap(page);
2835         return res;
2836 }
2837 EXPORT_SYMBOL(sock_no_sendpage);
2838
2839 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2840                                 int offset, size_t size, int flags)
2841 {
2842         ssize_t res;
2843         struct msghdr msg = {.msg_flags = flags};
2844         struct kvec iov;
2845         char *kaddr = kmap(page);
2846
2847         iov.iov_base = kaddr + offset;
2848         iov.iov_len = size;
2849         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2850         kunmap(page);
2851         return res;
2852 }
2853 EXPORT_SYMBOL(sock_no_sendpage_locked);
2854
2855 /*
2856  *      Default Socket Callbacks
2857  */
2858
2859 static void sock_def_wakeup(struct sock *sk)
2860 {
2861         struct socket_wq *wq;
2862
2863         rcu_read_lock();
2864         wq = rcu_dereference(sk->sk_wq);
2865         if (skwq_has_sleeper(wq))
2866                 wake_up_interruptible_all(&wq->wait);
2867         rcu_read_unlock();
2868 }
2869
2870 static void sock_def_error_report(struct sock *sk)
2871 {
2872         struct socket_wq *wq;
2873
2874         rcu_read_lock();
2875         wq = rcu_dereference(sk->sk_wq);
2876         if (skwq_has_sleeper(wq))
2877                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2878         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2879         rcu_read_unlock();
2880 }
2881
2882 void sock_def_readable(struct sock *sk)
2883 {
2884         struct socket_wq *wq;
2885
2886         rcu_read_lock();
2887         wq = rcu_dereference(sk->sk_wq);
2888         if (skwq_has_sleeper(wq))
2889                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2890                                                 EPOLLRDNORM | EPOLLRDBAND);
2891         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2892         rcu_read_unlock();
2893 }
2894
2895 static void sock_def_write_space(struct sock *sk)
2896 {
2897         struct socket_wq *wq;
2898
2899         rcu_read_lock();
2900
2901         /* Do not wake up a writer until he can make "significant"
2902          * progress.  --DaveM
2903          */
2904         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2905                 wq = rcu_dereference(sk->sk_wq);
2906                 if (skwq_has_sleeper(wq))
2907                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2908                                                 EPOLLWRNORM | EPOLLWRBAND);
2909
2910                 /* Should agree with poll, otherwise some programs break */
2911                 if (sock_writeable(sk))
2912                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2913         }
2914
2915         rcu_read_unlock();
2916 }
2917
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2921
2922 void sk_send_sigurg(struct sock *sk)
2923 {
2924         if (sk->sk_socket && sk->sk_socket->file)
2925                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2926                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2927 }
2928 EXPORT_SYMBOL(sk_send_sigurg);
2929
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken when
 * mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires)
{
        int was_pending = mod_timer(timer, expires);

        if (!was_pending)
                sock_hold(sk);
}
2936 EXPORT_SYMBOL(sk_reset_timer);
2937
/*
 * Delete @timer with del_timer(); when that returns non-zero, drop a
 * reference on @sk with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
        int was_pending = del_timer(timer);

        if (was_pending)
                __sock_put(sk);
}
2943 EXPORT_SYMBOL(sk_stop_timer);
2944
/*
 * Variant of sk_stop_timer() built on del_timer_sync(); a non-zero result
 * likewise drops a reference on @sk with __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        int was_pending = del_timer_sync(timer);

        if (was_pending)
                __sock_put(sk);
}
2950 EXPORT_SYMBOL(sk_stop_timer_sync);
2951
2952 void sock_init_data(struct socket *sock, struct sock *sk)
2953 {
2954         sk_init_common(sk);
2955         sk->sk_send_head        =       NULL;
2956
2957         timer_setup(&sk->sk_timer, NULL, 0);
2958
2959         sk->sk_allocation       =       GFP_KERNEL;
2960         sk->sk_rcvbuf           =       sysctl_rmem_default;
2961         sk->sk_sndbuf           =       sysctl_wmem_default;
2962         sk->sk_state            =       TCP_CLOSE;
2963         sk_set_socket(sk, sock);
2964
2965         sock_set_flag(sk, SOCK_ZAPPED);
2966
2967         if (sock) {
2968                 sk->sk_type     =       sock->type;
2969                 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2970                 sock->sk        =       sk;
2971                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2972         } else {
2973                 RCU_INIT_POINTER(sk->sk_wq, NULL);
2974                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2975         }
2976
2977         rwlock_init(&sk->sk_callback_lock);
2978         if (sk->sk_kern_sock)
2979                 lockdep_set_class_and_name(
2980                         &sk->sk_callback_lock,
2981                         af_kern_callback_keys + sk->sk_family,
2982                         af_family_kern_clock_key_strings[sk->sk_family]);
2983         else
2984                 lockdep_set_class_and_name(
2985                         &sk->sk_callback_lock,
2986                         af_callback_keys + sk->sk_family,
2987                         af_family_clock_key_strings[sk->sk_family]);
2988
2989         sk->sk_state_change     =       sock_def_wakeup;
2990         sk->sk_data_ready       =       sock_def_readable;
2991         sk->sk_write_space      =       sock_def_write_space;
2992         sk->sk_error_report     =       sock_def_error_report;
2993         sk->sk_destruct         =       sock_def_destruct;
2994
2995         sk->sk_frag.page        =       NULL;
2996         sk->sk_frag.offset      =       0;
2997         sk->sk_peek_off         =       -1;
2998
2999         sk->sk_peer_pid         =       NULL;
3000         sk->sk_peer_cred        =       NULL;
3001         sk->sk_write_pending    =       0;
3002         sk->sk_rcvlowat         =       1;
3003         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3004         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3005
3006         sk->sk_stamp = SK_DEFAULT_STAMP;
3007 #if BITS_PER_LONG==32
3008         seqlock_init(&sk->sk_stamp_seq);
3009 #endif
3010         atomic_set(&sk->sk_zckey, 0);
3011
3012 #ifdef CONFIG_NET_RX_BUSY_POLL
3013         sk->sk_napi_id          =       0;
3014         sk->sk_ll_usec          =       sysctl_net_busy_read;
3015 #endif
3016
3017         sk->sk_max_pacing_rate = ~0UL;
3018         sk->sk_pacing_rate = ~0UL;
3019         WRITE_ONCE(sk->sk_pacing_shift, 10);
3020         sk->sk_incoming_cpu = -1;
3021
3022         sk_rx_queue_clear(sk);
3023         /*
3024          * Before updating sk_refcnt, we must commit prior changes to memory
3025          * (Documentation/RCU/rculist_nulls.rst for details)
3026          */
3027         smp_wmb();
3028         refcount_set(&sk->sk_refcnt, 1);
3029         atomic_set(&sk->sk_drops, 0);
3030 }
3031 EXPORT_SYMBOL(sock_init_data);
3032
3033 void lock_sock_nested(struct sock *sk, int subclass)
3034 {
3035         might_sleep();
3036         spin_lock_bh(&sk->sk_lock.slock);
3037         if (sk->sk_lock.owned)
3038                 __lock_sock(sk);
3039         sk->sk_lock.owned = 1;
3040         spin_unlock(&sk->sk_lock.slock);
3041         /*
3042          * The sk_lock has mutex_lock() semantics here:
3043          */
3044         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3045         local_bh_enable();
3046 }
3047 EXPORT_SYMBOL(lock_sock_nested);
3048
3049 void release_sock(struct sock *sk)
3050 {
3051         spin_lock_bh(&sk->sk_lock.slock);
3052         if (sk->sk_backlog.tail)
3053                 __release_sock(sk);
3054
3055         /* Warning : release_cb() might need to release sk ownership,
3056          * ie call sock_release_ownership(sk) before us.
3057          */
3058         if (sk->sk_prot->release_cb)
3059                 sk->sk_prot->release_cb(sk);
3060
3061         sock_release_ownership(sk);
3062         if (waitqueue_active(&sk->sk_lock.wq))
3063                 wake_up(&sk->sk_lock.wq);
3064         spin_unlock_bh(&sk->sk_lock.slock);
3065 }
3066 EXPORT_SYMBOL(release_sock);
3067
3068 /**
3069  * lock_sock_fast - fast version of lock_sock
3070  * @sk: socket
3071  *
3072  * This version should be used for very small section, where process wont block
3073  * return false if fast path is taken:
3074  *
3075  *   sk_lock.slock locked, owned = 0, BH disabled
3076  *
3077  * return true if slow path is taken:
3078  *
3079  *   sk_lock.slock unlocked, owned = 1, BH enabled
3080  */
3081 bool lock_sock_fast(struct sock *sk)
3082 {
3083         might_sleep();
3084         spin_lock_bh(&sk->sk_lock.slock);
3085
3086         if (!sk->sk_lock.owned)
3087                 /*
3088                  * Note : We must disable BH
3089                  */
3090                 return false;
3091
3092         __lock_sock(sk);
3093         sk->sk_lock.owned = 1;
3094         spin_unlock(&sk->sk_lock.slock);
3095         /*
3096          * The sk_lock has mutex_lock() semantics here:
3097          */
3098         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3099         local_bh_enable();
3100         return true;
3101 }
3102 EXPORT_SYMBOL(lock_sock_fast);
3103
3104 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3105                    bool timeval, bool time32)
3106 {
3107         struct sock *sk = sock->sk;
3108         struct timespec64 ts;
3109
3110         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3111         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3112         if (ts.tv_sec == -1)
3113                 return -ENOENT;
3114         if (ts.tv_sec == 0) {
3115                 ktime_t kt = ktime_get_real();
3116                 sock_write_timestamp(sk, kt);
3117                 ts = ktime_to_timespec64(kt);
3118         }
3119
3120         if (timeval)
3121                 ts.tv_nsec /= 1000;
3122
3123 #ifdef CONFIG_COMPAT_32BIT_TIME
3124         if (time32)
3125                 return put_old_timespec32(&ts, userstamp);
3126 #endif
3127 #ifdef CONFIG_SPARC64
3128         /* beware of padding in sparc64 timeval */
3129         if (timeval && !in_compat_syscall()) {
3130                 struct __kernel_old_timeval __user tv = {
3131                         .tv_sec = ts.tv_sec,
3132                         .tv_usec = ts.tv_nsec,
3133                 };
3134                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3135                         return -EFAULT;
3136                 return 0;
3137         }
3138 #endif
3139         return put_timespec64(&ts, userstamp);
3140 }
3141 EXPORT_SYMBOL(sock_gettstamp);
3142
3143 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3144 {
3145         if (!sock_flag(sk, flag)) {
3146                 unsigned long previous_flags = sk->sk_flags;
3147
3148                 sock_set_flag(sk, flag);
3149                 /*
3150                  * we just set one of the two flags which require net
3151                  * time stamping, but time stamping might have been on
3152                  * already because of the other one
3153                  */
3154                 if (sock_needs_netstamp(sk) &&
3155                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3156                         net_enable_timestamp();
3157         }
3158 }
3159
3160 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3161                        int level, int type)
3162 {
3163         struct sock_exterr_skb *serr;
3164         struct sk_buff *skb;
3165         int copied, err;
3166
3167         err = -EAGAIN;
3168         skb = sock_dequeue_err_skb(sk);
3169         if (skb == NULL)
3170                 goto out;
3171
3172         copied = skb->len;
3173         if (copied > len) {
3174                 msg->msg_flags |= MSG_TRUNC;
3175                 copied = len;
3176         }
3177         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3178         if (err)
3179                 goto out_free_skb;
3180
3181         sock_recv_timestamp(msg, sk, skb);
3182
3183         serr = SKB_EXT_ERR(skb);
3184         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3185
3186         msg->msg_flags |= MSG_ERRQUEUE;
3187         err = copied;
3188
3189 out_free_skb:
3190         kfree_skb(skb);
3191 out:
3192         return err;
3193 }
3194 EXPORT_SYMBOL(sock_recv_errqueue);
3195
3196 /*
3197  *      Get a socket option on an socket.
3198  *
3199  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3200  *      asynchronous errors should be reported by getsockopt. We assume
3201  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3202  */
3203 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3204                            char __user *optval, int __user *optlen)
3205 {
3206         struct sock *sk = sock->sk;
3207
3208         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3209 }
3210 EXPORT_SYMBOL(sock_common_getsockopt);
3211
3212 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3213                         int flags)
3214 {
3215         struct sock *sk = sock->sk;
3216         int addr_len = 0;
3217         int err;
3218
3219         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3220                                    flags & ~MSG_DONTWAIT, &addr_len);
3221         if (err >= 0)
3222                 msg->msg_namelen = addr_len;
3223         return err;
3224 }
3225 EXPORT_SYMBOL(sock_common_recvmsg);
3226
3227 /*
3228  *      Set socket options on an inet socket.
3229  */
3230 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3231                            sockptr_t optval, unsigned int optlen)
3232 {
3233         struct sock *sk = sock->sk;
3234
3235         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3236 }
3237 EXPORT_SYMBOL(sock_common_setsockopt);
3238
3239 void sk_common_release(struct sock *sk)
3240 {
3241         if (sk->sk_prot->destroy)
3242                 sk->sk_prot->destroy(sk);
3243
3244         /*
3245          * Observation: when sk_common_release is called, processes have
3246          * no access to socket. But net still has.
3247          * Step one, detach it from networking:
3248          *
3249          * A. Remove from hash tables.
3250          */
3251
3252         sk->sk_prot->unhash(sk);
3253
3254         /*
3255          * In this point socket cannot receive new packets, but it is possible
3256          * that some packets are in flight because some CPU runs receiver and
3257          * did hash table lookup before we unhashed socket. They will achieve
3258          * receive queue and will be purged by socket destructor.
3259          *
3260          * Also we still have packets pending on receive queue and probably,
3261          * our own packets waiting in device queues. sock_destroy will drain
3262          * receive queue, but transmitted packets will delay socket destruction
3263          * until the last reference will be released.
3264          */
3265
3266         sock_orphan(sk);
3267
3268         xfrm_sk_free_policy(sk);
3269
3270         sk_refcnt_debug_release(sk);
3271
3272         sock_put(sk);
3273 }
3274 EXPORT_SYMBOL(sk_common_release);
3275
3276 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3277 {
3278         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3279
3280         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3281         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3282         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3283         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3284         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3285         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3286         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3287         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3288         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3289 }
3290
3291 #ifdef CONFIG_PROC_FS
3292 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
3293 struct prot_inuse {
3294         int val[PROTO_INUSE_NR];
3295 };
3296
3297 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3298
3299 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3300 {
3301         __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3302 }
3303 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3304
3305 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3306 {
3307         int cpu, idx = prot->inuse_idx;
3308         int res = 0;
3309
3310         for_each_possible_cpu(cpu)
3311                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3312
3313         return res >= 0 ? res : 0;
3314 }
3315 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3316
3317 static void sock_inuse_add(struct net *net, int val)
3318 {
3319         this_cpu_add(*net->core.sock_inuse, val);
3320 }
3321
3322 int sock_inuse_get(struct net *net)
3323 {
3324         int cpu, res = 0;
3325
3326         for_each_possible_cpu(cpu)
3327                 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3328
3329         return res;
3330 }
3331
3332 EXPORT_SYMBOL_GPL(sock_inuse_get);
3333
3334 static int __net_init sock_inuse_init_net(struct net *net)
3335 {
3336         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3337         if (net->core.prot_inuse == NULL)
3338                 return -ENOMEM;
3339
3340         net->core.sock_inuse = alloc_percpu(int);
3341         if (net->core.sock_inuse == NULL)
3342                 goto out;
3343
3344         return 0;
3345
3346 out:
3347         free_percpu(net->core.prot_inuse);
3348         return -ENOMEM;
3349 }
3350
3351 static void __net_exit sock_inuse_exit_net(struct net *net)
3352 {
3353         free_percpu(net->core.prot_inuse);
3354         free_percpu(net->core.sock_inuse);
3355 }
3356
3357 static struct pernet_operations net_inuse_ops = {
3358         .init = sock_inuse_init_net,
3359         .exit = sock_inuse_exit_net,
3360 };
3361
3362 static __init int net_inuse_init(void)
3363 {
3364         if (register_pernet_subsys(&net_inuse_ops))
3365                 panic("Cannot initialize net inuse counters");
3366
3367         return 0;
3368 }
3369
3370 core_initcall(net_inuse_init);
3371
3372 static int assign_proto_idx(struct proto *prot)
3373 {
3374         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3375
3376         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3377                 pr_err("PROTO_INUSE_NR exhausted\n");
3378                 return -ENOSPC;
3379         }
3380
3381         set_bit(prot->inuse_idx, proto_inuse_idx);
3382         return 0;
3383 }
3384
3385 static void release_proto_idx(struct proto *prot)
3386 {
3387         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3388                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3389 }
3390 #else
/* !CONFIG_PROC_FS: no in-use slots to hand out, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}
3395
/* !CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3399
/* !CONFIG_PROC_FS: socket in-use accounting is compiled out. */
static void sock_inuse_add(struct net *net, int val)
{
}
3403 #endif
3404
3405 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3406 {
3407         if (!twsk_prot)
3408                 return;
3409         kfree(twsk_prot->twsk_slab_name);
3410         twsk_prot->twsk_slab_name = NULL;
3411         kmem_cache_destroy(twsk_prot->twsk_slab);
3412         twsk_prot->twsk_slab = NULL;
3413 }
3414
3415 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3416 {
3417         if (!rsk_prot)
3418                 return;
3419         kfree(rsk_prot->slab_name);
3420         rsk_prot->slab_name = NULL;
3421         kmem_cache_destroy(rsk_prot->slab);
3422         rsk_prot->slab = NULL;
3423 }
3424
3425 static int req_prot_init(const struct proto *prot)
3426 {
3427         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3428
3429         if (!rsk_prot)
3430                 return 0;
3431
3432         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3433                                         prot->name);
3434         if (!rsk_prot->slab_name)
3435                 return -ENOMEM;
3436
3437         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3438                                            rsk_prot->obj_size, 0,
3439                                            SLAB_ACCOUNT | prot->slab_flags,
3440                                            NULL);
3441
3442         if (!rsk_prot->slab) {
3443                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3444                         prot->name);
3445                 return -ENOMEM;
3446         }
3447         return 0;
3448 }
3449
3450 int proto_register(struct proto *prot, int alloc_slab)
3451 {
3452         int ret = -ENOBUFS;
3453
3454         if (alloc_slab) {
3455                 prot->slab = kmem_cache_create_usercopy(prot->name,
3456                                         prot->obj_size, 0,
3457                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3458                                         prot->slab_flags,
3459                                         prot->useroffset, prot->usersize,
3460                                         NULL);
3461
3462                 if (prot->slab == NULL) {
3463                         pr_crit("%s: Can't create sock SLAB cache!\n",
3464                                 prot->name);
3465                         goto out;
3466                 }
3467
3468                 if (req_prot_init(prot))
3469                         goto out_free_request_sock_slab;
3470
3471                 if (prot->twsk_prot != NULL) {
3472                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3473
3474                         if (prot->twsk_prot->twsk_slab_name == NULL)
3475                                 goto out_free_request_sock_slab;
3476
3477                         prot->twsk_prot->twsk_slab =
3478                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3479                                                   prot->twsk_prot->twsk_obj_size,
3480                                                   0,
3481                                                   SLAB_ACCOUNT |
3482                                                   prot->slab_flags,
3483                                                   NULL);
3484                         if (prot->twsk_prot->twsk_slab == NULL)
3485                                 goto out_free_timewait_sock_slab;
3486                 }
3487         }
3488
3489         mutex_lock(&proto_list_mutex);
3490         ret = assign_proto_idx(prot);
3491         if (ret) {
3492                 mutex_unlock(&proto_list_mutex);
3493                 goto out_free_timewait_sock_slab;
3494         }
3495         list_add(&prot->node, &proto_list);
3496         mutex_unlock(&proto_list_mutex);
3497         return ret;
3498
3499 out_free_timewait_sock_slab:
3500         if (alloc_slab && prot->twsk_prot)
3501                 tw_prot_cleanup(prot->twsk_prot);
3502 out_free_request_sock_slab:
3503         if (alloc_slab) {
3504                 req_prot_cleanup(prot->rsk_prot);
3505
3506                 kmem_cache_destroy(prot->slab);
3507                 prot->slab = NULL;
3508         }
3509 out:
3510         return ret;
3511 }
3512 EXPORT_SYMBOL(proto_register);
3513
3514 void proto_unregister(struct proto *prot)
3515 {
3516         mutex_lock(&proto_list_mutex);
3517         release_proto_idx(prot);
3518         list_del(&prot->node);
3519         mutex_unlock(&proto_list_mutex);
3520
3521         kmem_cache_destroy(prot->slab);
3522         prot->slab = NULL;
3523
3524         req_prot_cleanup(prot->rsk_prot);
3525         tw_prot_cleanup(prot->twsk_prot);
3526 }
3527 EXPORT_SYMBOL(proto_unregister);
3528
3529 int sock_load_diag_module(int family, int protocol)
3530 {
3531         if (!protocol) {
3532                 if (!sock_is_registered(family))
3533                         return -ENOENT;
3534
3535                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3536                                       NETLINK_SOCK_DIAG, family);
3537         }
3538
3539 #ifdef CONFIG_INET
3540         if (family == AF_INET &&
3541             protocol != IPPROTO_RAW &&
3542             protocol < MAX_INET_PROTOS &&
3543             !rcu_access_pointer(inet_protos[protocol]))
3544                 return -ENOENT;
3545 #endif
3546
3547         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3548                               NETLINK_SOCK_DIAG, family, protocol);
3549 }
3550 EXPORT_SYMBOL(sock_load_diag_module);
3551
3552 #ifdef CONFIG_PROC_FS
3553 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3554         __acquires(proto_list_mutex)
3555 {
3556         mutex_lock(&proto_list_mutex);
3557         return seq_list_start_head(&proto_list, *pos);
3558 }
3559
3560 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3561 {
3562         return seq_list_next(v, &proto_list, pos);
3563 }
3564
3565 static void proto_seq_stop(struct seq_file *seq, void *v)
3566         __releases(proto_list_mutex)
3567 {
3568         mutex_unlock(&proto_list_mutex);
3569 }
3570
/* Map a method pointer to a flag character: 'y' when set, 'n' when NULL. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3575 static long sock_prot_memory_allocated(struct proto *proto)
3576 {
3577         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3578 }
3579
3580 static const char *sock_prot_memory_pressure(struct proto *proto)
3581 {
3582         return proto->memory_pressure != NULL ?
3583         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3584 }
3585
3586 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3587 {
3588
3589         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3590                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3591                    proto->name,
3592                    proto->obj_size,
3593                    sock_prot_inuse_get(seq_file_net(seq), proto),
3594                    sock_prot_memory_allocated(proto),
3595                    sock_prot_memory_pressure(proto),
3596                    proto->max_header,
3597                    proto->slab == NULL ? "no" : "yes",
3598                    module_name(proto->owner),
3599                    proto_method_implemented(proto->close),
3600                    proto_method_implemented(proto->connect),
3601                    proto_method_implemented(proto->disconnect),
3602                    proto_method_implemented(proto->accept),
3603                    proto_method_implemented(proto->ioctl),
3604                    proto_method_implemented(proto->init),
3605                    proto_method_implemented(proto->destroy),
3606                    proto_method_implemented(proto->shutdown),
3607                    proto_method_implemented(proto->setsockopt),
3608                    proto_method_implemented(proto->getsockopt),
3609                    proto_method_implemented(proto->sendmsg),
3610                    proto_method_implemented(proto->recvmsg),
3611                    proto_method_implemented(proto->sendpage),
3612                    proto_method_implemented(proto->bind),
3613                    proto_method_implemented(proto->backlog_rcv),
3614                    proto_method_implemented(proto->hash),
3615                    proto_method_implemented(proto->unhash),
3616                    proto_method_implemented(proto->get_port),
3617                    proto_method_implemented(proto->enter_memory_pressure));
3618 }
3619
3620 static int proto_seq_show(struct seq_file *seq, void *v)
3621 {
3622         if (v == &proto_list)
3623                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3624                            "protocol",
3625                            "size",
3626                            "sockets",
3627                            "memory",
3628                            "press",
3629                            "maxhdr",
3630                            "slab",
3631                            "module",
3632                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3633         else
3634                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3635         return 0;
3636 }
3637
3638 static const struct seq_operations proto_seq_ops = {
3639         .start  = proto_seq_start,
3640         .next   = proto_seq_next,
3641         .stop   = proto_seq_stop,
3642         .show   = proto_seq_show,
3643 };
3644
3645 static __net_init int proto_init_net(struct net *net)
3646 {
3647         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3648                         sizeof(struct seq_net_private)))
3649                 return -ENOMEM;
3650
3651         return 0;
3652 }
3653
3654 static __net_exit void proto_exit_net(struct net *net)
3655 {
3656         remove_proc_entry("protocols", net->proc_net);
3657 }
3658
3659
3660 static __net_initdata struct pernet_operations proto_net_ops = {
3661         .init = proto_init_net,
3662         .exit = proto_exit_net,
3663 };
3664
3665 static int __init proto_init(void)
3666 {
3667         return register_pernet_subsys(&proto_net_ops);
3668 }
3669
3670 subsys_initcall(proto_init);
3671
3672 #endif /* PROC_FS */
3673
3674 #ifdef CONFIG_NET_RX_BUSY_POLL
3675 bool sk_busy_loop_end(void *p, unsigned long start_time)
3676 {
3677         struct sock *sk = p;
3678
3679         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3680                sk_busy_loop_timeout(sk, start_time);
3681 }
3682 EXPORT_SYMBOL(sk_busy_loop_end);
3683 #endif /* CONFIG_NET_RX_BUSY_POLL */
3684
3685 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3686 {
3687         if (!sk->sk_prot->bind_add)
3688                 return -EOPNOTSUPP;
3689         return sk->sk_prot->bind_add(sk, addr, addr_len);
3690 }
3691 EXPORT_SYMBOL(sock_bind_add);