net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <[email protected]>
  12  *              Florian La Roche, <[email protected]>
  13  *              Alan Cox, <[email protected]>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117
 118 #include <linux/uaccess.h>
 119
 120 #include <linux/netdevice.h>
 121 #include <net/protocol.h>
 122 #include <linux/skbuff.h>
 123 #include <net/net_namespace.h>
 124 #include <net/request_sock.h>
 125 #include <net/sock.h>
 126 #include <linux/net_tstamp.h>
 127 #include <net/xfrm.h>
 128 #include <linux/ipsec.h>
 129 #include <net/cls_cgroup.h>
 130 #include <net/netprio_cgroup.h>
 131 #include <linux/sock_diag.h>
 132
 133 #include <linux/filter.h>
 134 #include <net/sock_reuseport.h>
 135 #include <net/bpf_sk_storage.h>
 136
 137 #include <trace/events/sock.h>
 138
 139 #include <net/tcp.h>
 140 #include <net/busy_poll.h>
 141
 142 #include <linux/ethtool.h>
 143
/*
 * proto_list and the mutex that presumably serializes access to it.
 * NOTE(review): the users of both are outside this excerpt -- confirm
 * the locking rule against them.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declaration; the definition is not in this part of the file. */
static void sock_inuse_add(struct net *net, int val);
 148
 149 /**
 150  * sk_ns_capable - General socket capability test
 151  * @sk: Socket to use a capability on or through
 152  * @user_ns: The user namespace of the capability to use
 153  * @cap: The capability to use
 154  *
 155  * Test to see if the opener of the socket had when the socket was
 156  * created and the current process has the capability @cap in the user
 157  * namespace @user_ns.
 158  */
 159 bool sk_ns_capable(const struct sock *sk,
 160                    struct user_namespace *user_ns, int cap)
 161 {
 162         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 163                 ns_capable(user_ns, cap);
 164 }
 165 EXPORT_SYMBOL(sk_ns_capable);
 166
 167 /**
 168  * sk_capable - Socket global capability test
 169  * @sk: Socket to use a capability on or through
 170  * @cap: The global capability to use
 171  *
 172  * Test to see if the opener of the socket had when the socket was
 173  * created and the current process has the capability @cap in all user
 174  * namespaces.
 175  */
 176 bool sk_capable(const struct sock *sk, int cap)
 177 {
 178         return sk_ns_capable(sk, &init_user_ns, cap);
 179 }
 180 EXPORT_SYMBOL(sk_capable);
 181
 182 /**
 183  * sk_net_capable - Network namespace socket capability test
 184  * @sk: Socket to use a capability on or through
 185  * @cap: The capability to use
 186  *
 187  * Test to see if the opener of the socket had when the socket was created
 188  * and the current process has the capability @cap over the network namespace
 189  * the socket is a member of.
 190  */
 191 bool sk_net_capable(const struct sock *sk, int cap)
 192 {
 193         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 194 }
 195 EXPORT_SYMBOL(sk_net_capable);
 196
/*
 * Each address family might have different locking rules, so there is
 * one lock class key and one slock key per address family (arrays are
 * indexed up to AF_MAX), with separate arrays for internal ("kern") and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 206
/*
 * Make lock validator output more readable. (These strings are
 * pre-constructed at build time, so that runtime initialization of
 * socket locks is fast.)
 *
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * each being the prefix @x concatenated with one family name, ending
 * with an "AF_MAX" entry.  It is meant to initialise arrays of AF_MAX+1
 * strings.
 * NOTE(review): entries are presumably ordered by AF_* value, with "27"
 * and "28" standing in for families that have no name here -- confirm
 * against the AF_* definitions when adding a family.
 */

#define _sock_locks(x)                                            \
  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
  x "AF_MCTP"  , \
  x "AF_MAX"
 231
/*
 * Per-family name tables built from _sock_locks(), one table per lock
 * kind.  Each table has AF_MAX+1 entries and differs only in the prefix
 * placed in front of the family name.
 */

/* Names for userspace sockets: sk_lock, slock and clock. */
static const char *const af_family_key_strings[AF_MAX+1] = {
        _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
        _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
        _sock_locks("clock-")
};

/* The same three tables with a "k-" prefix (the "kern" variants). */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
        _sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
        _sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
        _sock_locks("k-clock-")
};
/* Names with the rlock-, wlock- and elock- prefixes. */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
        _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
        _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
        _sock_locks("elock-")
};
 260
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key.  Only the callback
 * lock has a separate "kern" array in this group.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
 270
/*
 * Run time adjustable parameters.
 * The *_max and *_default buffer sizes all start out at SK_WMEM_MAX /
 * SK_RMEM_MAX respectively.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Defaults to 1.  NOTE(review): consumers are outside this excerpt. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/*
 * Static key, initially false; incremented by sk_set_memalloc() and
 * decremented by sk_clear_memalloc().
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
 287
/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 *
 * Also adds __GFP_MEMALLOC to the socket's allocation mask and bumps
 * memalloc_socks_key.
 */
void sk_set_memalloc(struct sock *sk)
{
        sock_set_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation |= __GFP_MEMALLOC;
        /* Paired with the static_branch_dec() in sk_clear_memalloc(). */
        static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
 303
/**
 * sk_clear_memalloc - clears %SOCK_MEMALLOC
 * @sk: socket to clear it on
 *
 * Reverses sk_set_memalloc(): resets the flag, removes __GFP_MEMALLOC
 * from the socket's allocation mask, drops memalloc_socks_key and then
 * calls sk_mem_reclaim().
 */
void sk_clear_memalloc(struct sock *sk)
{
        sock_reset_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation &= ~__GFP_MEMALLOC;
        static_branch_dec(&memalloc_socks_key);

        /*
         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
         * progress of swapping. SOCK_MEMALLOC may be cleared while
         * it has rmem allocations due to the last swapfile being deactivated
         * but there is a risk that the socket is unusable due to exceeding
         * the rmem limits. Reclaim the reserves and obey rmem limits again.
         */
        sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 320
 321 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 322 {
 323         int ret;
 324         unsigned int noreclaim_flag;
 325
 326         /* these should have been dropped before queueing */
 327         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 328
 329         noreclaim_flag = memalloc_noreclaim_save();
 330         ret = sk->sk_backlog_rcv(sk, skb);
 331         memalloc_noreclaim_restore(noreclaim_flag);
 332
 333         return ret;
 334 }
 335 EXPORT_SYMBOL(__sk_backlog_rcv);
 336
 337 void sk_error_report(struct sock *sk)
 338 {
 339         sk->sk_error_report(sk);
 340
 341         switch (sk->sk_family) {
 342         case AF_INET:
 343                 fallthrough;
 344         case AF_INET6:
 345                 trace_inet_sk_error_report(sk);
 346                 break;
 347         default:
 348                 break;
 349         }
 350 }
 351 EXPORT_SYMBOL(sk_error_report);
 352
 353 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 354 {
 355         struct __kernel_sock_timeval tv;
 356
 357         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 358                 tv.tv_sec = 0;
 359                 tv.tv_usec = 0;
 360         } else {
 361                 tv.tv_sec = timeo / HZ;
 362                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 363         }
 364
 365         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 366                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 367                 *(struct old_timeval32 *)optval = tv32;
 368                 return sizeof(tv32);
 369         }
 370
 371         if (old_timeval) {
 372                 struct __kernel_old_timeval old_tv;
 373                 old_tv.tv_sec = tv.tv_sec;
 374                 old_tv.tv_usec = tv.tv_usec;
 375                 *(struct __kernel_old_timeval *)optval = old_tv;
 376                 return sizeof(old_tv);
 377         }
 378
 379         *(struct __kernel_sock_timeval *)optval = tv;
 380         return sizeof(tv);
 381 }
 382
 383 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 384                             bool old_timeval)
 385 {
 386         struct __kernel_sock_timeval tv;
 387
 388         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 389                 struct old_timeval32 tv32;
 390
 391                 if (optlen < sizeof(tv32))
 392                         return -EINVAL;
 393
 394                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 395                         return -EFAULT;
 396                 tv.tv_sec = tv32.tv_sec;
 397                 tv.tv_usec = tv32.tv_usec;
 398         } else if (old_timeval) {
 399                 struct __kernel_old_timeval old_tv;
 400
 401                 if (optlen < sizeof(old_tv))
 402                         return -EINVAL;
 403                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 404                         return -EFAULT;
 405                 tv.tv_sec = old_tv.tv_sec;
 406                 tv.tv_usec = old_tv.tv_usec;
 407         } else {
 408                 if (optlen < sizeof(tv))
 409                         return -EINVAL;
 410                 if (copy_from_sockptr(&tv, optval, sizeof(tv)))
 411                         return -EFAULT;
 412         }
 413         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 414                 return -EDOM;
 415
 416         if (tv.tv_sec < 0) {
 417                 static int warned __read_mostly;
 418
 419                 *timeo_p = 0;
 420                 if (warned < 10 && net_ratelimit()) {
 421                         warned++;
 422                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 423                                 __func__, current->comm, task_pid_nr(current));
 424                 }
 425                 return 0;
 426         }
 427         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 428         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 429                 return 0;
 430         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 431                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 432         return 0;
 433 }
 434
 435 static bool sock_needs_netstamp(const struct sock *sk)
 436 {
 437         switch (sk->sk_family) {
 438         case AF_UNSPEC:
 439         case AF_UNIX:
 440                 return false;
 441         default:
 442                 return true;
 443         }
 444 }
 445
 446 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 447 {
 448         if (sk->sk_flags & flags) {
 449                 sk->sk_flags &= ~flags;
 450                 if (sock_needs_netstamp(sk) &&
 451                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 452                         net_disable_timestamp();
 453         }
 454 }
 455
 456
 457 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 458 {
 459         unsigned long flags;
 460         struct sk_buff_head *list = &sk->sk_receive_queue;
 461
 462         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 463                 atomic_inc(&sk->sk_drops);
 464                 trace_sock_rcvqueue_full(sk, skb);
 465                 return -ENOMEM;
 466         }
 467
 468         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 469                 atomic_inc(&sk->sk_drops);
 470                 return -ENOBUFS;
 471         }
 472
 473         skb->dev = NULL;
 474         skb_set_owner_r(skb, sk);
 475
 476         /* we escape from rcu protected region, make sure we dont leak
 477          * a norefcounted dst
 478          */
 479         skb_dst_force(skb);
 480
 481         spin_lock_irqsave(&list->lock, flags);
 482         sock_skb_set_dropcount(sk, skb);
 483         __skb_queue_tail(list, skb);
 484         spin_unlock_irqrestore(&list->lock, flags);
 485
 486         if (!sock_flag(sk, SOCK_DEAD))
 487                 sk->sk_data_ready(sk);
 488         return 0;
 489 }
 490 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 491
 492 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 493 {
 494         int err;
 495
 496         err = sk_filter(sk, skb);
 497         if (err)
 498                 return err;
 499
 500         return __sock_queue_rcv_skb(sk, skb);
 501 }
 502 EXPORT_SYMBOL(sock_queue_rcv_skb);
 503
 504 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 505                      const int nested, unsigned int trim_cap, bool refcounted)
 506 {
 507         int rc = NET_RX_SUCCESS;
 508
 509         if (sk_filter_trim_cap(sk, skb, trim_cap))
 510                 goto discard_and_relse;
 511
 512         skb->dev = NULL;
 513
 514         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 515                 atomic_inc(&sk->sk_drops);
 516                 goto discard_and_relse;
 517         }
 518         if (nested)
 519                 bh_lock_sock_nested(sk);
 520         else
 521                 bh_lock_sock(sk);
 522         if (!sock_owned_by_user(sk)) {
 523                 /*
 524                  * trylock + unlock semantics:
 525                  */
 526                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 527
 528                 rc = sk_backlog_rcv(sk, skb);
 529
 530                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 531         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 532                 bh_unlock_sock(sk);
 533                 atomic_inc(&sk->sk_drops);
 534                 goto discard_and_relse;
 535         }
 536
 537         bh_unlock_sock(sk);
 538 out:
 539         if (refcounted)
 540                 sock_put(sk);
 541         return rc;
 542 discard_and_relse:
 543         kfree_skb(skb);
 544         goto out;
 545 }
 546 EXPORT_SYMBOL(__sk_receive_skb);
 547
 548 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 549                                                           u32));
 550 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 551                                                            u32));
 552 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 553 {
 554         struct dst_entry *dst = __sk_dst_get(sk);
 555
 556         if (dst && dst->obsolete &&
 557             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 558                                dst, cookie) == NULL) {
 559                 sk_tx_queue_clear(sk);
 560                 sk->sk_dst_pending_confirm = 0;
 561                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 562                 dst_release(dst);
 563                 return NULL;
 564         }
 565
 566         return dst;
 567 }
 568 EXPORT_SYMBOL(__sk_dst_check);
 569
 570 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 571 {
 572         struct dst_entry *dst = sk_dst_get(sk);
 573
 574         if (dst && dst->obsolete &&
 575             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 576                                dst, cookie) == NULL) {
 577                 sk_dst_reset(sk);
 578                 dst_release(dst);
 579                 return NULL;
 580         }
 581
 582         return dst;
 583 }
 584 EXPORT_SYMBOL(sk_dst_check);
 585
 586 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 587 {
 588         int ret = -ENOPROTOOPT;
 589 #ifdef CONFIG_NETDEVICES
 590         struct net *net = sock_net(sk);
 591
 592         /* Sorry... */
 593         ret = -EPERM;
 594         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 595                 goto out;
 596
 597         ret = -EINVAL;
 598         if (ifindex < 0)
 599                 goto out;
 600
 601         sk->sk_bound_dev_if = ifindex;
 602         if (sk->sk_prot->rehash)
 603                 sk->sk_prot->rehash(sk);
 604         sk_dst_reset(sk);
 605
 606         ret = 0;
 607
 608 out:
 609 #endif
 610
 611         return ret;
 612 }
 613
 614 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 615 {
 616         int ret;
 617
 618         if (lock_sk)
 619                 lock_sock(sk);
 620         ret = sock_bindtoindex_locked(sk, ifindex);
 621         if (lock_sk)
 622                 release_sock(sk);
 623
 624         return ret;
 625 }
 626 EXPORT_SYMBOL(sock_bindtoindex);
 627
 628 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 629 {
 630         int ret = -ENOPROTOOPT;
 631 #ifdef CONFIG_NETDEVICES
 632         struct net *net = sock_net(sk);
 633         char devname[IFNAMSIZ];
 634         int index;
 635
 636         ret = -EINVAL;
 637         if (optlen < 0)
 638                 goto out;
 639
 640         /* Bind this socket to a particular device like "eth0",
 641          * as specified in the passed interface name. If the
 642          * name is "" or the option length is zero the socket
 643          * is not bound.
 644          */
 645         if (optlen > IFNAMSIZ - 1)
 646                 optlen = IFNAMSIZ - 1;
 647         memset(devname, 0, sizeof(devname));
 648
 649         ret = -EFAULT;
 650         if (copy_from_sockptr(devname, optval, optlen))
 651                 goto out;
 652
 653         index = 0;
 654         if (devname[0] != '\0') {
 655                 struct net_device *dev;
 656
 657                 rcu_read_lock();
 658                 dev = dev_get_by_name_rcu(net, devname);
 659                 if (dev)
 660                         index = dev->ifindex;
 661                 rcu_read_unlock();
 662                 ret = -ENODEV;
 663                 if (!dev)
 664                         goto out;
 665         }
 666
 667         return sock_bindtoindex(sk, index, true);
 668 out:
 669 #endif
 670
 671         return ret;
 672 }
 673
 674 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 675                                 int __user *optlen, int len)
 676 {
 677         int ret = -ENOPROTOOPT;
 678 #ifdef CONFIG_NETDEVICES
 679         struct net *net = sock_net(sk);
 680         char devname[IFNAMSIZ];
 681
 682         if (sk->sk_bound_dev_if == 0) {
 683                 len = 0;
 684                 goto zero;
 685         }
 686
 687         ret = -EINVAL;
 688         if (len < IFNAMSIZ)
 689                 goto out;
 690
 691         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 692         if (ret)
 693                 goto out;
 694
 695         len = strlen(devname) + 1;
 696
 697         ret = -EFAULT;
 698         if (copy_to_user(optval, devname, len))
 699                 goto out;
 700
 701 zero:
 702         ret = -EFAULT;
 703         if (put_user(len, optlen))
 704                 goto out;
 705
 706         ret = 0;
 707
 708 out:
 709 #endif
 710
 711         return ret;
 712 }
 713
 714 bool sk_mc_loop(struct sock *sk)
 715 {
 716         if (dev_recursion_level())
 717                 return false;
 718         if (!sk)
 719                 return true;
 720         switch (sk->sk_family) {
 721         case AF_INET:
 722                 return inet_sk(sk)->mc_loop;
 723 #if IS_ENABLED(CONFIG_IPV6)
 724         case AF_INET6:
 725                 return inet6_sk(sk)->mc_loop;
 726 #endif
 727         }
 728         WARN_ON_ONCE(1);
 729         return true;
 730 }
 731 EXPORT_SYMBOL(sk_mc_loop);
 732
/* Set sk_reuse to SK_CAN_REUSE on @sk, under the socket lock. */
void sock_set_reuseaddr(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuse = SK_CAN_REUSE;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);
 740
/* Set sk_reuseport to true on @sk, under the socket lock. */
void sock_set_reuseport(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuseport = true;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);
 748
/*
 * Enable SOCK_LINGER on @sk with a linger time of zero, under the
 * socket lock.
 */
void sock_no_linger(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_lingertime = 0;
        sock_set_flag(sk, SOCK_LINGER);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);
 757
/* Set sk_priority of @sk to @priority, under the socket lock. */
void sock_set_priority(struct sock *sk, u32 priority)
{
        lock_sock(sk);
        sk->sk_priority = priority;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);
 765
 766 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 767 {
 768         lock_sock(sk);
 769         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 770                 sk->sk_sndtimeo = secs * HZ;
 771         else
 772                 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 773         release_sock(sk);
 774 }
 775 EXPORT_SYMBOL(sock_set_sndtimeo);
 776
 777 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 778 {
 779         if (val)  {
 780                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 781                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 782                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 783                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 784         } else {
 785                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 786                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 787         }
 788 }
 789
/*
 * Enable receive timestamps on @sk, under the socket lock, with
 * SOCK_TSTAMP_NEW cleared and SOCK_RCVTSTAMPNS set.
 */
void sock_enable_timestamps(struct sock *sk)
{
        lock_sock(sk);
        __sock_set_timestamps(sk, true, false, true);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);
 797
 798 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 799 {
 800         switch (optname) {
 801         case SO_TIMESTAMP_OLD:
 802                 __sock_set_timestamps(sk, valbool, false, false);
 803                 break;
 804         case SO_TIMESTAMP_NEW:
 805                 __sock_set_timestamps(sk, valbool, true, false);
 806                 break;
 807         case SO_TIMESTAMPNS_OLD:
 808                 __sock_set_timestamps(sk, valbool, false, true);
 809                 break;
 810         case SO_TIMESTAMPNS_NEW:
 811                 __sock_set_timestamps(sk, valbool, true, true);
 812                 break;
 813         }
 814 }
 815
 816 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 817 {
 818         struct net *net = sock_net(sk);
 819         struct net_device *dev = NULL;
 820         bool match = false;
 821         int *vclock_index;
 822         int i, num;
 823
 824         if (sk->sk_bound_dev_if)
 825                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 826
 827         if (!dev) {
 828                 pr_err("%s: sock not bind to device\n", __func__);
 829                 return -EOPNOTSUPP;
 830         }
 831
 832         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 833         for (i = 0; i < num; i++) {
 834                 if (*(vclock_index + i) == phc_index) {
 835                         match = true;
 836                         break;
 837                 }
 838         }
 839
 840         if (num > 0)
 841                 kfree(vclock_index);
 842
 843         if (!match)
 844                 return -EINVAL;
 845
 846         sk->sk_bind_phc = phc_index;
 847
 848         return 0;
 849 }
 850
 851 int sock_set_timestamping(struct sock *sk, int optname,
 852                           struct so_timestamping timestamping)
 853 {
 854         int val = timestamping.flags;
 855         int ret;
 856
 857         if (val & ~SOF_TIMESTAMPING_MASK)
 858                 return -EINVAL;
 859
 860         if (val & SOF_TIMESTAMPING_OPT_ID &&
 861             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 862                 if (sk->sk_protocol == IPPROTO_TCP &&
 863                     sk->sk_type == SOCK_STREAM) {
 864                         if ((1 << sk->sk_state) &
 865                             (TCPF_CLOSE | TCPF_LISTEN))
 866                                 return -EINVAL;
 867                         sk->sk_tskey = tcp_sk(sk)->snd_una;
 868                 } else {
 869                         sk->sk_tskey = 0;
 870                 }
 871         }
 872
 873         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 874             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 875                 return -EINVAL;
 876
 877         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 878                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 879                 if (ret)
 880                         return ret;
 881         }
 882
 883         sk->sk_tsflags = val;
 884         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 885
 886         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 887                 sock_enable_timestamp(sk,
 888                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 889         else
 890                 sock_disable_timestamp(sk,
 891                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 892         return 0;
 893 }
 894
 895 void sock_set_keepalive(struct sock *sk)
 896 {
 897         lock_sock(sk);
 898         if (sk->sk_prot->keepalive)
 899                 sk->sk_prot->keepalive(sk, true);
 900         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 901         release_sock(sk);
 902 }
 903 EXPORT_SYMBOL(sock_set_keepalive);
 904
 905 static void __sock_set_rcvbuf(struct sock *sk, int val)
 906 {
 907         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 908          * as a negative value.
 909          */
 910         val = min_t(int, val, INT_MAX / 2);
 911         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 912
 913         /* We double it on the way in to account for "struct sk_buff" etc.
 914          * overhead.   Applications assume that the SO_RCVBUF setting they make
 915          * will allow that much actual data to be received on that socket.
 916          *
 917          * Applications are unaware that "struct sk_buff" and other overheads
 918          * allocate from the receive buffer during socket buffer allocation.
 919          *
 920          * And after considering the possible alternatives, returning the value
 921          * we actually used in getsockopt is the most desirable behavior.
 922          */
 923         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 924 }
 925
 926 void sock_set_rcvbuf(struct sock *sk, int val)
 927 {
 928         lock_sock(sk);
 929         __sock_set_rcvbuf(sk, val);
 930         release_sock(sk);
 931 }
 932 EXPORT_SYMBOL(sock_set_rcvbuf);
 933
 934 static void __sock_set_mark(struct sock *sk, u32 val)
 935 {
 936         if (val != sk->sk_mark) {
 937                 sk->sk_mark = val;
 938                 sk_dst_reset(sk);
 939         }
 940 }
 941
 942 void sock_set_mark(struct sock *sk, u32 val)
 943 {
 944         lock_sock(sk);
 945         __sock_set_mark(sk, val);
 946         release_sock(sk);
 947 }
 948 EXPORT_SYMBOL(sock_set_mark);
 949
 950 static void sock_release_reserved_memory(struct sock *sk, int bytes)
 951 {
 952         /* Round down bytes to multiple of pages */
 953         bytes &= ~(SK_MEM_QUANTUM - 1);
 954
 955         WARN_ON(bytes > sk->sk_reserved_mem);
 956         sk->sk_reserved_mem -= bytes;
 957         sk_mem_reclaim(sk);
 958 }
 959
 960 static int sock_reserve_memory(struct sock *sk, int bytes)
 961 {
 962         long allocated;
 963         bool charged;
 964         int pages;
 965
 966         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg)
 967                 return -EOPNOTSUPP;
 968
 969         if (!bytes)
 970                 return 0;
 971
 972         pages = sk_mem_pages(bytes);
 973
 974         /* pre-charge to memcg */
 975         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
 976                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 977         if (!charged)
 978                 return -ENOMEM;
 979
 980         /* pre-charge to forward_alloc */
 981         allocated = sk_memory_allocated_add(sk, pages);
 982         /* If the system goes into memory pressure with this
 983          * precharge, give up and return error.
 984          */
 985         if (allocated > sk_prot_mem_limits(sk, 1)) {
 986                 sk_memory_allocated_sub(sk, pages);
 987                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
 988                 return -ENOMEM;
 989         }
 990         sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
 991
 992         sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
 993
 994         return 0;
 995 }
 996
 997 /*
 998  *      This is meant for all protocols to use and covers goings on
 999  *      at the socket level. Everything here is generic.
1000  */
1001
1002 int sock_setsockopt(struct socket *sock, int level, int optname,
1003                     sockptr_t optval, unsigned int optlen)
1004 {
1005         struct so_timestamping timestamping;
1006         struct sock_txtime sk_txtime;
1007         struct sock *sk = sock->sk;
1008         int val;
1009         int valbool;
1010         struct linger ling;
1011         int ret = 0;
1012
1013         /*
1014          *      Options without arguments
1015          */
1016
1017         if (optname == SO_BINDTODEVICE)
1018                 return sock_setbindtodevice(sk, optval, optlen);
1019
1020         if (optlen < sizeof(int))
1021                 return -EINVAL;
1022
1023         if (copy_from_sockptr(&val, optval, sizeof(val)))
1024                 return -EFAULT;
1025
1026         valbool = val ? 1 : 0;
1027
1028         lock_sock(sk);
1029
1030         switch (optname) {
1031         case SO_DEBUG:
1032                 if (val && !capable(CAP_NET_ADMIN))
1033                         ret = -EACCES;
1034                 else
1035                         sock_valbool_flag(sk, SOCK_DBG, valbool);
1036                 break;
1037         case SO_REUSEADDR:
1038                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1039                 break;
1040         case SO_REUSEPORT:
1041                 sk->sk_reuseport = valbool;
1042                 break;
1043         case SO_TYPE:
1044         case SO_PROTOCOL:
1045         case SO_DOMAIN:
1046         case SO_ERROR:
1047                 ret = -ENOPROTOOPT;
1048                 break;
1049         case SO_DONTROUTE:
1050                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1051                 sk_dst_reset(sk);
1052                 break;
1053         case SO_BROADCAST:
1054                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1055                 break;
1056         case SO_SNDBUF:
1057                 /* Don't error on this BSD doesn't and if you think
1058                  * about it this is right. Otherwise apps have to
1059                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1060                  * are treated in BSD as hints
1061                  */
1062                 val = min_t(u32, val, sysctl_wmem_max);
1063 set_sndbuf:
1064                 /* Ensure val * 2 fits into an int, to prevent max_t()
1065                  * from treating it as a negative value.
1066                  */
1067                 val = min_t(int, val, INT_MAX / 2);
1068                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1069                 WRITE_ONCE(sk->sk_sndbuf,
1070                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1071                 /* Wake up sending tasks if we upped the value. */
1072                 sk->sk_write_space(sk);
1073                 break;
1074
1075         case SO_SNDBUFFORCE:
1076                 if (!capable(CAP_NET_ADMIN)) {
1077                         ret = -EPERM;
1078                         break;
1079                 }
1080
1081                 /* No negative values (to prevent underflow, as val will be
1082                  * multiplied by 2).
1083                  */
1084                 if (val < 0)
1085                         val = 0;
1086                 goto set_sndbuf;
1087
1088         case SO_RCVBUF:
1089                 /* Don't error on this BSD doesn't and if you think
1090                  * about it this is right. Otherwise apps have to
1091                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1092                  * are treated in BSD as hints
1093                  */
1094                 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1095                 break;
1096
1097         case SO_RCVBUFFORCE:
1098                 if (!capable(CAP_NET_ADMIN)) {
1099                         ret = -EPERM;
1100                         break;
1101                 }
1102
1103                 /* No negative values (to prevent underflow, as val will be
1104                  * multiplied by 2).
1105                  */
1106                 __sock_set_rcvbuf(sk, max(val, 0));
1107                 break;
1108
1109         case SO_KEEPALIVE:
1110                 if (sk->sk_prot->keepalive)
1111                         sk->sk_prot->keepalive(sk, valbool);
1112                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1113                 break;
1114
1115         case SO_OOBINLINE:
1116                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1117                 break;
1118
1119         case SO_NO_CHECK:
1120                 sk->sk_no_check_tx = valbool;
1121                 break;
1122
1123         case SO_PRIORITY:
1124                 if ((val >= 0 && val <= 6) ||
1125                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1126                         sk->sk_priority = val;
1127                 else
1128                         ret = -EPERM;
1129                 break;
1130
1131         case SO_LINGER:
1132                 if (optlen < sizeof(ling)) {
1133                         ret = -EINVAL;  /* 1003.1g */
1134                         break;
1135                 }
1136                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1137                         ret = -EFAULT;
1138                         break;
1139                 }
1140                 if (!ling.l_onoff)
1141                         sock_reset_flag(sk, SOCK_LINGER);
1142                 else {
1143 #if (BITS_PER_LONG == 32)
1144                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1145                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1146                         else
1147 #endif
1148                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1149                         sock_set_flag(sk, SOCK_LINGER);
1150                 }
1151                 break;
1152
1153         case SO_BSDCOMPAT:
1154                 break;
1155
1156         case SO_PASSCRED:
1157                 if (valbool)
1158                         set_bit(SOCK_PASSCRED, &sock->flags);
1159                 else
1160                         clear_bit(SOCK_PASSCRED, &sock->flags);
1161                 break;
1162
1163         case SO_TIMESTAMP_OLD:
1164         case SO_TIMESTAMP_NEW:
1165         case SO_TIMESTAMPNS_OLD:
1166         case SO_TIMESTAMPNS_NEW:
1167                 sock_set_timestamp(sk, optname, valbool);
1168                 break;
1169
1170         case SO_TIMESTAMPING_NEW:
1171         case SO_TIMESTAMPING_OLD:
1172                 if (optlen == sizeof(timestamping)) {
1173                         if (copy_from_sockptr(&timestamping, optval,
1174                                               sizeof(timestamping))) {
1175                                 ret = -EFAULT;
1176                                 break;
1177                         }
1178                 } else {
1179                         memset(&timestamping, 0, sizeof(timestamping));
1180                         timestamping.flags = val;
1181                 }
1182                 ret = sock_set_timestamping(sk, optname, timestamping);
1183                 break;
1184
1185         case SO_RCVLOWAT:
1186                 if (val < 0)
1187                         val = INT_MAX;
1188                 if (sock->ops->set_rcvlowat)
1189                         ret = sock->ops->set_rcvlowat(sk, val);
1190                 else
1191                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1192                 break;
1193
1194         case SO_RCVTIMEO_OLD:
1195         case SO_RCVTIMEO_NEW:
1196                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1197                                        optlen, optname == SO_RCVTIMEO_OLD);
1198                 break;
1199
1200         case SO_SNDTIMEO_OLD:
1201         case SO_SNDTIMEO_NEW:
1202                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1203                                        optlen, optname == SO_SNDTIMEO_OLD);
1204                 break;
1205
1206         case SO_ATTACH_FILTER: {
1207                 struct sock_fprog fprog;
1208
1209                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1210                 if (!ret)
1211                         ret = sk_attach_filter(&fprog, sk);
1212                 break;
1213         }
1214         case SO_ATTACH_BPF:
1215                 ret = -EINVAL;
1216                 if (optlen == sizeof(u32)) {
1217                         u32 ufd;
1218
1219                         ret = -EFAULT;
1220                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1221                                 break;
1222
1223                         ret = sk_attach_bpf(ufd, sk);
1224                 }
1225                 break;
1226
1227         case SO_ATTACH_REUSEPORT_CBPF: {
1228                 struct sock_fprog fprog;
1229
1230                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1231                 if (!ret)
1232                         ret = sk_reuseport_attach_filter(&fprog, sk);
1233                 break;
1234         }
1235         case SO_ATTACH_REUSEPORT_EBPF:
1236                 ret = -EINVAL;
1237                 if (optlen == sizeof(u32)) {
1238                         u32 ufd;
1239
1240                         ret = -EFAULT;
1241                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1242                                 break;
1243
1244                         ret = sk_reuseport_attach_bpf(ufd, sk);
1245                 }
1246                 break;
1247
1248         case SO_DETACH_REUSEPORT_BPF:
1249                 ret = reuseport_detach_prog(sk);
1250                 break;
1251
1252         case SO_DETACH_FILTER:
1253                 ret = sk_detach_filter(sk);
1254                 break;
1255
1256         case SO_LOCK_FILTER:
1257                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1258                         ret = -EPERM;
1259                 else
1260                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1261                 break;
1262
1263         case SO_PASSSEC:
1264                 if (valbool)
1265                         set_bit(SOCK_PASSSEC, &sock->flags);
1266                 else
1267                         clear_bit(SOCK_PASSSEC, &sock->flags);
1268                 break;
1269         case SO_MARK:
1270                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1271                         ret = -EPERM;
1272                         break;
1273                 }
1274
1275                 __sock_set_mark(sk, val);
1276                 break;
1277
1278         case SO_RXQ_OVFL:
1279                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1280                 break;
1281
1282         case SO_WIFI_STATUS:
1283                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1284                 break;
1285
1286         case SO_PEEK_OFF:
1287                 if (sock->ops->set_peek_off)
1288                         ret = sock->ops->set_peek_off(sk, val);
1289                 else
1290                         ret = -EOPNOTSUPP;
1291                 break;
1292
1293         case SO_NOFCS:
1294                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1295                 break;
1296
1297         case SO_SELECT_ERR_QUEUE:
1298                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1299                 break;
1300
1301 #ifdef CONFIG_NET_RX_BUSY_POLL
1302         case SO_BUSY_POLL:
1303                 /* allow unprivileged users to decrease the value */
1304                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1305                         ret = -EPERM;
1306                 else {
1307                         if (val < 0)
1308                                 ret = -EINVAL;
1309                         else
1310                                 WRITE_ONCE(sk->sk_ll_usec, val);
1311                 }
1312                 break;
1313         case SO_PREFER_BUSY_POLL:
1314                 if (valbool && !capable(CAP_NET_ADMIN))
1315                         ret = -EPERM;
1316                 else
1317                         WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1318                 break;
1319         case SO_BUSY_POLL_BUDGET:
1320                 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1321                         ret = -EPERM;
1322                 } else {
1323                         if (val < 0 || val > U16_MAX)
1324                                 ret = -EINVAL;
1325                         else
1326                                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1327                 }
1328                 break;
1329 #endif
1330
1331         case SO_MAX_PACING_RATE:
1332                 {
1333                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1334
1335                 if (sizeof(ulval) != sizeof(val) &&
1336                     optlen >= sizeof(ulval) &&
1337                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1338                         ret = -EFAULT;
1339                         break;
1340                 }
1341                 if (ulval != ~0UL)
1342                         cmpxchg(&sk->sk_pacing_status,
1343                                 SK_PACING_NONE,
1344                                 SK_PACING_NEEDED);
1345                 sk->sk_max_pacing_rate = ulval;
1346                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1347                 break;
1348                 }
1349         case SO_INCOMING_CPU:
1350                 WRITE_ONCE(sk->sk_incoming_cpu, val);
1351                 break;
1352
1353         case SO_CNX_ADVICE:
1354                 if (val == 1)
1355                         dst_negative_advice(sk);
1356                 break;
1357
1358         case SO_ZEROCOPY:
1359                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1360                         if (!((sk->sk_type == SOCK_STREAM &&
1361                                sk->sk_protocol == IPPROTO_TCP) ||
1362                               (sk->sk_type == SOCK_DGRAM &&
1363                                sk->sk_protocol == IPPROTO_UDP)))
1364                                 ret = -ENOTSUPP;
1365                 } else if (sk->sk_family != PF_RDS) {
1366                         ret = -ENOTSUPP;
1367                 }
1368                 if (!ret) {
1369                         if (val < 0 || val > 1)
1370                                 ret = -EINVAL;
1371                         else
1372                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1373                 }
1374                 break;
1375
1376         case SO_TXTIME:
1377                 if (optlen != sizeof(struct sock_txtime)) {
1378                         ret = -EINVAL;
1379                         break;
1380                 } else if (copy_from_sockptr(&sk_txtime, optval,
1381                            sizeof(struct sock_txtime))) {
1382                         ret = -EFAULT;
1383                         break;
1384                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1385                         ret = -EINVAL;
1386                         break;
1387                 }
1388                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1389                  * scheduler has enough safe guards.
1390                  */
1391                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1392                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1393                         ret = -EPERM;
1394                         break;
1395                 }
1396                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1397                 sk->sk_clockid = sk_txtime.clockid;
1398                 sk->sk_txtime_deadline_mode =
1399                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1400                 sk->sk_txtime_report_errors =
1401                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1402                 break;
1403
1404         case SO_BINDTOIFINDEX:
1405                 ret = sock_bindtoindex_locked(sk, val);
1406                 break;
1407
1408         case SO_BUF_LOCK:
1409                 if (val & ~SOCK_BUF_LOCK_MASK) {
1410                         ret = -EINVAL;
1411                         break;
1412                 }
1413                 sk->sk_userlocks = val | (sk->sk_userlocks &
1414                                           ~SOCK_BUF_LOCK_MASK);
1415                 break;
1416
1417         case SO_RESERVE_MEM:
1418         {
1419                 int delta;
1420
1421                 if (val < 0) {
1422                         ret = -EINVAL;
1423                         break;
1424                 }
1425
1426                 delta = val - sk->sk_reserved_mem;
1427                 if (delta < 0)
1428                         sock_release_reserved_memory(sk, -delta);
1429                 else
1430                         ret = sock_reserve_memory(sk, delta);
1431                 break;
1432         }
1433
1434         default:
1435                 ret = -ENOPROTOOPT;
1436                 break;
1437         }
1438         release_sock(sk);
1439         return ret;
1440 }
1441 EXPORT_SYMBOL(sock_setsockopt);
1442
1443 static const struct cred *sk_get_peer_cred(struct sock *sk)
1444 {
1445         const struct cred *cred;
1446
1447         spin_lock(&sk->sk_peer_lock);
1448         cred = get_cred(sk->sk_peer_cred);
1449         spin_unlock(&sk->sk_peer_lock);
1450
1451         return cred;
1452 }
1453
1454 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1455                           struct ucred *ucred)
1456 {
1457         ucred->pid = pid_vnr(pid);
1458         ucred->uid = ucred->gid = -1;
1459         if (cred) {
1460                 struct user_namespace *current_ns = current_user_ns();
1461
1462                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1463                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1464         }
1465 }
1466
1467 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1468 {
1469         struct user_namespace *user_ns = current_user_ns();
1470         int i;
1471
1472         for (i = 0; i < src->ngroups; i++)
1473                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1474                         return -EFAULT;
1475
1476         return 0;
1477 }
1478
1479 int sock_getsockopt(struct socket *sock, int level, int optname,
1480                     char __user *optval, int __user *optlen)
1481 {
1482         struct sock *sk = sock->sk;
1483
1484         union {
1485                 int val;
1486                 u64 val64;
1487                 unsigned long ulval;
1488                 struct linger ling;
1489                 struct old_timeval32 tm32;
1490                 struct __kernel_old_timeval tm;
1491                 struct  __kernel_sock_timeval stm;
1492                 struct sock_txtime txtime;
1493                 struct so_timestamping timestamping;
1494         } v;
1495
1496         int lv = sizeof(int);
1497         int len;
1498
1499         if (get_user(len, optlen))
1500                 return -EFAULT;
1501         if (len < 0)
1502                 return -EINVAL;
1503
1504         memset(&v, 0, sizeof(v));
1505
1506         switch (optname) {
1507         case SO_DEBUG:
1508                 v.val = sock_flag(sk, SOCK_DBG);
1509                 break;
1510
1511         case SO_DONTROUTE:
1512                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1513                 break;
1514
1515         case SO_BROADCAST:
1516                 v.val = sock_flag(sk, SOCK_BROADCAST);
1517                 break;
1518
1519         case SO_SNDBUF:
1520                 v.val = sk->sk_sndbuf;
1521                 break;
1522
1523         case SO_RCVBUF:
1524                 v.val = sk->sk_rcvbuf;
1525                 break;
1526
1527         case SO_REUSEADDR:
1528                 v.val = sk->sk_reuse;
1529                 break;
1530
1531         case SO_REUSEPORT:
1532                 v.val = sk->sk_reuseport;
1533                 break;
1534
1535         case SO_KEEPALIVE:
1536                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1537                 break;
1538
1539         case SO_TYPE:
1540                 v.val = sk->sk_type;
1541                 break;
1542
1543         case SO_PROTOCOL:
1544                 v.val = sk->sk_protocol;
1545                 break;
1546
1547         case SO_DOMAIN:
1548                 v.val = sk->sk_family;
1549                 break;
1550
1551         case SO_ERROR:
1552                 v.val = -sock_error(sk);
1553                 if (v.val == 0)
1554                         v.val = xchg(&sk->sk_err_soft, 0);
1555                 break;
1556
1557         case SO_OOBINLINE:
1558                 v.val = sock_flag(sk, SOCK_URGINLINE);
1559                 break;
1560
1561         case SO_NO_CHECK:
1562                 v.val = sk->sk_no_check_tx;
1563                 break;
1564
1565         case SO_PRIORITY:
1566                 v.val = sk->sk_priority;
1567                 break;
1568
1569         case SO_LINGER:
1570                 lv              = sizeof(v.ling);
1571                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1572                 v.ling.l_linger = sk->sk_lingertime / HZ;
1573                 break;
1574
1575         case SO_BSDCOMPAT:
1576                 break;
1577
1578         case SO_TIMESTAMP_OLD:
1579                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1580                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1581                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1582                 break;
1583
1584         case SO_TIMESTAMPNS_OLD:
1585                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1586                 break;
1587
1588         case SO_TIMESTAMP_NEW:
1589                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1590                 break;
1591
1592         case SO_TIMESTAMPNS_NEW:
1593                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1594                 break;
1595
1596         case SO_TIMESTAMPING_OLD:
1597                 lv = sizeof(v.timestamping);
1598                 v.timestamping.flags = sk->sk_tsflags;
1599                 v.timestamping.bind_phc = sk->sk_bind_phc;
1600                 break;
1601
1602         case SO_RCVTIMEO_OLD:
1603         case SO_RCVTIMEO_NEW:
1604                 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1605                 break;
1606
1607         case SO_SNDTIMEO_OLD:
1608         case SO_SNDTIMEO_NEW:
1609                 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1610                 break;
1611
1612         case SO_RCVLOWAT:
1613                 v.val = sk->sk_rcvlowat;
1614                 break;
1615
1616         case SO_SNDLOWAT:
1617                 v.val = 1;
1618                 break;
1619
1620         case SO_PASSCRED:
1621                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1622                 break;
1623
1624         case SO_PEERCRED:
1625         {
1626                 struct ucred peercred;
1627                 if (len > sizeof(peercred))
1628                         len = sizeof(peercred);
1629
1630                 spin_lock(&sk->sk_peer_lock);
1631                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1632                 spin_unlock(&sk->sk_peer_lock);
1633
1634                 if (copy_to_user(optval, &peercred, len))
1635                         return -EFAULT;
1636                 goto lenout;
1637         }
1638
1639         case SO_PEERGROUPS:
1640         {
1641                 const struct cred *cred;
1642                 int ret, n;
1643
1644                 cred = sk_get_peer_cred(sk);
1645                 if (!cred)
1646                         return -ENODATA;
1647
1648                 n = cred->group_info->ngroups;
1649                 if (len < n * sizeof(gid_t)) {
1650                         len = n * sizeof(gid_t);
1651                         put_cred(cred);
1652                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1653                 }
1654                 len = n * sizeof(gid_t);
1655
1656                 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1657                 put_cred(cred);
1658                 if (ret)
1659                         return ret;
1660                 goto lenout;
1661         }
1662
1663         case SO_PEERNAME:
1664         {
1665                 char address[128];
1666
1667                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1668                 if (lv < 0)
1669                         return -ENOTCONN;
1670                 if (lv < len)
1671                         return -EINVAL;
1672                 if (copy_to_user(optval, address, len))
1673                         return -EFAULT;
1674                 goto lenout;
1675         }
1676
1677         /* Dubious BSD thing... Probably nobody even uses it, but
1678          * the UNIX standard wants it for whatever reason... -DaveM
1679          */
1680         case SO_ACCEPTCONN:
1681                 v.val = sk->sk_state == TCP_LISTEN;
1682                 break;
1683
1684         case SO_PASSSEC:
1685                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1686                 break;
1687
1688         case SO_PEERSEC:
1689                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1690
1691         case SO_MARK:
1692                 v.val = sk->sk_mark;
1693                 break;
1694
1695         case SO_RXQ_OVFL:
1696                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1697                 break;
1698
1699         case SO_WIFI_STATUS:
1700                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1701                 break;
1702
1703         case SO_PEEK_OFF:
1704                 if (!sock->ops->set_peek_off)
1705                         return -EOPNOTSUPP;
1706
1707                 v.val = sk->sk_peek_off;
1708                 break;
1709         case SO_NOFCS:
1710                 v.val = sock_flag(sk, SOCK_NOFCS);
1711                 break;
1712
1713         case SO_BINDTODEVICE:
1714                 return sock_getbindtodevice(sk, optval, optlen, len);
1715
1716         case SO_GET_FILTER:
1717                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1718                 if (len < 0)
1719                         return len;
1720
1721                 goto lenout;
1722
1723         case SO_LOCK_FILTER:
1724                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1725                 break;
1726
1727         case SO_BPF_EXTENSIONS:
1728                 v.val = bpf_tell_extensions();
1729                 break;
1730
1731         case SO_SELECT_ERR_QUEUE:
1732                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1733                 break;
1734
1735 #ifdef CONFIG_NET_RX_BUSY_POLL
1736         case SO_BUSY_POLL:
1737                 v.val = sk->sk_ll_usec;
1738                 break;
1739         case SO_PREFER_BUSY_POLL:
1740                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1741                 break;
1742 #endif
1743
1744         case SO_MAX_PACING_RATE:
1745                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1746                         lv = sizeof(v.ulval);
1747                         v.ulval = sk->sk_max_pacing_rate;
1748                 } else {
1749                         /* 32bit version */
1750                         v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1751                 }
1752                 break;
1753
1754         case SO_INCOMING_CPU:
1755                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1756                 break;
1757
1758         case SO_MEMINFO:
1759         {
1760                 u32 meminfo[SK_MEMINFO_VARS];
1761
1762                 sk_get_meminfo(sk, meminfo);
1763
1764                 len = min_t(unsigned int, len, sizeof(meminfo));
1765                 if (copy_to_user(optval, &meminfo, len))
1766                         return -EFAULT;
1767
1768                 goto lenout;
1769         }
1770
1771 #ifdef CONFIG_NET_RX_BUSY_POLL
1772         case SO_INCOMING_NAPI_ID:
1773                 v.val = READ_ONCE(sk->sk_napi_id);
1774
1775                 /* aggregate non-NAPI IDs down to 0 */
1776                 if (v.val < MIN_NAPI_ID)
1777                         v.val = 0;
1778
1779                 break;
1780 #endif
1781
1782         case SO_COOKIE:
1783                 lv = sizeof(u64);
1784                 if (len < lv)
1785                         return -EINVAL;
1786                 v.val64 = sock_gen_cookie(sk);
1787                 break;
1788
1789         case SO_ZEROCOPY:
1790                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1791                 break;
1792
1793         case SO_TXTIME:
1794                 lv = sizeof(v.txtime);
1795                 v.txtime.clockid = sk->sk_clockid;
1796                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1797                                   SOF_TXTIME_DEADLINE_MODE : 0;
1798                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1799                                   SOF_TXTIME_REPORT_ERRORS : 0;
1800                 break;
1801
1802         case SO_BINDTOIFINDEX:
1803                 v.val = sk->sk_bound_dev_if;
1804                 break;
1805
1806         case SO_NETNS_COOKIE:
1807                 lv = sizeof(u64);
1808                 if (len != lv)
1809                         return -EINVAL;
1810                 v.val64 = sock_net(sk)->net_cookie;
1811                 break;
1812
1813         case SO_BUF_LOCK:
1814                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1815                 break;
1816
1817         case SO_RESERVE_MEM:
1818                 v.val = sk->sk_reserved_mem;
1819                 break;
1820
1821         default:
1822                 /* We implement the SO_SNDLOWAT etc to not be settable
1823                  * (1003.1g 7).
1824                  */
1825                 return -ENOPROTOOPT;
1826         }
1827
1828         if (len > lv)
1829                 len = lv;
1830         if (copy_to_user(optval, &v, len))
1831                 return -EFAULT;
1832 lenout:
1833         if (put_user(len, optlen))
1834                 return -EFAULT;
1835         return 0;
1836 }
1837
1838 /*
1839  * Initialize an sk_lock.
1840  *
1841  * (We also register the sk_lock with the lock validator.)
1842  */
1843 static inline void sock_lock_init(struct sock *sk)
1844 {
1845         if (sk->sk_kern_sock)
1846                 sock_lock_init_class_and_name(
1847                         sk,
1848                         af_family_kern_slock_key_strings[sk->sk_family],
1849                         af_family_kern_slock_keys + sk->sk_family,
1850                         af_family_kern_key_strings[sk->sk_family],
1851                         af_family_kern_keys + sk->sk_family);
1852         else
1853                 sock_lock_init_class_and_name(
1854                         sk,
1855                         af_family_slock_key_strings[sk->sk_family],
1856                         af_family_slock_keys + sk->sk_family,
1857                         af_family_key_strings[sk->sk_family],
1858                         af_family_keys + sk->sk_family);
1859 }
1860
1861 /*
1862  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1863  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1864  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1865  */
1866 static void sock_copy(struct sock *nsk, const struct sock *osk)
1867 {
1868         const struct proto *prot = READ_ONCE(osk->sk_prot);
1869 #ifdef CONFIG_SECURITY_NETWORK
1870         void *sptr = nsk->sk_security;
1871 #endif
1872
1873         /* If we move sk_tx_queue_mapping out of the private section,
1874          * we must check if sk_tx_queue_clear() is called after
1875          * sock_copy() in sk_clone_lock().
1876          */
1877         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1878                      offsetof(struct sock, sk_dontcopy_begin) ||
1879                      offsetof(struct sock, sk_tx_queue_mapping) >=
1880                      offsetof(struct sock, sk_dontcopy_end));
1881
1882         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1883
1884         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1885                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1886
1887 #ifdef CONFIG_SECURITY_NETWORK
1888         nsk->sk_security = sptr;
1889         security_sk_clone(osk, nsk);
1890 #endif
1891 }
1892
1893 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1894                 int family)
1895 {
1896         struct sock *sk;
1897         struct kmem_cache *slab;
1898
1899         slab = prot->slab;
1900         if (slab != NULL) {
1901                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1902                 if (!sk)
1903                         return sk;
1904                 if (want_init_on_alloc(priority))
1905                         sk_prot_clear_nulls(sk, prot->obj_size);
1906         } else
1907                 sk = kmalloc(prot->obj_size, priority);
1908
1909         if (sk != NULL) {
1910                 if (security_sk_alloc(sk, family, priority))
1911                         goto out_free;
1912
1913                 if (!try_module_get(prot->owner))
1914                         goto out_free_sec;
1915         }
1916
1917         return sk;
1918
1919 out_free_sec:
1920         security_sk_free(sk);
1921 out_free:
1922         if (slab != NULL)
1923                 kmem_cache_free(slab, sk);
1924         else
1925                 kfree(sk);
1926         return NULL;
1927 }
1928
1929 static void sk_prot_free(struct proto *prot, struct sock *sk)
1930 {
1931         struct kmem_cache *slab;
1932         struct module *owner;
1933
1934         owner = prot->owner;
1935         slab = prot->slab;
1936
1937         cgroup_sk_free(&sk->sk_cgrp_data);
1938         mem_cgroup_sk_free(sk);
1939         security_sk_free(sk);
1940         if (slab != NULL)
1941                 kmem_cache_free(slab, sk);
1942         else
1943                 kfree(sk);
1944         module_put(owner);
1945 }
1946
1947 /**
1948  *      sk_alloc - All socket objects are allocated here
1949  *      @net: the applicable net namespace
1950  *      @family: protocol family
1951  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1952  *      @prot: struct proto associated with this new sock instance
1953  *      @kern: is this to be a kernel socket?
1954  */
1955 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1956                       struct proto *prot, int kern)
1957 {
1958         struct sock *sk;
1959
1960         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1961         if (sk) {
1962                 sk->sk_family = family;
1963                 /*
1964                  * See comment in struct sock definition to understand
1965                  * why we need sk_prot_creator -acme
1966                  */
1967                 sk->sk_prot = sk->sk_prot_creator = prot;
1968                 sk->sk_kern_sock = kern;
1969                 sock_lock_init(sk);
1970                 sk->sk_net_refcnt = kern ? 0 : 1;
1971                 if (likely(sk->sk_net_refcnt)) {
1972                         get_net(net);
1973                         sock_inuse_add(net, 1);
1974                 }
1975
1976                 sock_net_set(sk, net);
1977                 refcount_set(&sk->sk_wmem_alloc, 1);
1978
1979                 mem_cgroup_sk_alloc(sk);
1980                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1981                 sock_update_classid(&sk->sk_cgrp_data);
1982                 sock_update_netprioidx(&sk->sk_cgrp_data);
1983                 sk_tx_queue_clear(sk);
1984         }
1985
1986         return sk;
1987 }
1988 EXPORT_SYMBOL(sk_alloc);
1989
1990 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1991  * grace period. This is the case for UDP sockets and TCP listeners.
1992  */
1993 static void __sk_destruct(struct rcu_head *head)
1994 {
1995         struct sock *sk = container_of(head, struct sock, sk_rcu);
1996         struct sk_filter *filter;
1997
1998         if (sk->sk_destruct)
1999                 sk->sk_destruct(sk);
2000
2001         filter = rcu_dereference_check(sk->sk_filter,
2002                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2003         if (filter) {
2004                 sk_filter_uncharge(sk, filter);
2005                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2006         }
2007
2008         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2009
2010 #ifdef CONFIG_BPF_SYSCALL
2011         bpf_sk_storage_free(sk);
2012 #endif
2013
2014         if (atomic_read(&sk->sk_omem_alloc))
2015                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2016                          __func__, atomic_read(&sk->sk_omem_alloc));
2017
2018         if (sk->sk_frag.page) {
2019                 put_page(sk->sk_frag.page);
2020                 sk->sk_frag.page = NULL;
2021         }
2022
2023         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2024         put_cred(sk->sk_peer_cred);
2025         put_pid(sk->sk_peer_pid);
2026
2027         if (likely(sk->sk_net_refcnt))
2028                 put_net(sock_net(sk));
2029         sk_prot_free(sk->sk_prot_creator, sk);
2030 }
2031
2032 void sk_destruct(struct sock *sk)
2033 {
2034         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2035
2036         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2037                 reuseport_detach_sock(sk);
2038                 use_call_rcu = true;
2039         }
2040
2041         if (use_call_rcu)
2042                 call_rcu(&sk->sk_rcu, __sk_destruct);
2043         else
2044                 __sk_destruct(&sk->sk_rcu);
2045 }
2046
2047 static void __sk_free(struct sock *sk)
2048 {
2049         if (likely(sk->sk_net_refcnt))
2050                 sock_inuse_add(sock_net(sk), -1);
2051
2052         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2053                 sock_diag_broadcast_destroy(sk);
2054         else
2055                 sk_destruct(sk);
2056 }
2057
2058 void sk_free(struct sock *sk)
2059 {
2060         /*
2061          * We subtract one from sk_wmem_alloc and can know if
2062          * some packets are still in some tx queue.
2063          * If not null, sock_wfree() will call __sk_free(sk) later
2064          */
2065         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2066                 __sk_free(sk);
2067 }
2068 EXPORT_SYMBOL(sk_free);
2069
2070 static void sk_init_common(struct sock *sk)
2071 {
2072         skb_queue_head_init(&sk->sk_receive_queue);
2073         skb_queue_head_init(&sk->sk_write_queue);
2074         skb_queue_head_init(&sk->sk_error_queue);
2075
2076         rwlock_init(&sk->sk_callback_lock);
2077         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2078                         af_rlock_keys + sk->sk_family,
2079                         af_family_rlock_key_strings[sk->sk_family]);
2080         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2081                         af_wlock_keys + sk->sk_family,
2082                         af_family_wlock_key_strings[sk->sk_family]);
2083         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2084                         af_elock_keys + sk->sk_family,
2085                         af_family_elock_key_strings[sk->sk_family]);
2086         lockdep_set_class_and_name(&sk->sk_callback_lock,
2087                         af_callback_keys + sk->sk_family,
2088                         af_family_clock_key_strings[sk->sk_family]);
2089 }
2090
2091 /**
2092  *      sk_clone_lock - clone a socket, and lock its clone
2093  *      @sk: the socket to clone
2094  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2095  *
2096  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2097  */
2098 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2099 {
2100         struct proto *prot = READ_ONCE(sk->sk_prot);
2101         struct sk_filter *filter;
2102         bool is_charged = true;
2103         struct sock *newsk;
2104
2105         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2106         if (!newsk)
2107                 goto out;
2108
2109         sock_copy(newsk, sk);
2110
2111         newsk->sk_prot_creator = prot;
2112
2113         /* SANITY */
2114         if (likely(newsk->sk_net_refcnt))
2115                 get_net(sock_net(newsk));
2116         sk_node_init(&newsk->sk_node);
2117         sock_lock_init(newsk);
2118         bh_lock_sock(newsk);
2119         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2120         newsk->sk_backlog.len = 0;
2121
2122         atomic_set(&newsk->sk_rmem_alloc, 0);
2123
2124         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2125         refcount_set(&newsk->sk_wmem_alloc, 1);
2126
2127         atomic_set(&newsk->sk_omem_alloc, 0);
2128         sk_init_common(newsk);
2129
2130         newsk->sk_dst_cache     = NULL;
2131         newsk->sk_dst_pending_confirm = 0;
2132         newsk->sk_wmem_queued   = 0;
2133         newsk->sk_forward_alloc = 0;
2134         newsk->sk_reserved_mem  = 0;
2135         atomic_set(&newsk->sk_drops, 0);
2136         newsk->sk_send_head     = NULL;
2137         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2138         atomic_set(&newsk->sk_zckey, 0);
2139
2140         sock_reset_flag(newsk, SOCK_DONE);
2141
2142         /* sk->sk_memcg will be populated at accept() time */
2143         newsk->sk_memcg = NULL;
2144
2145         cgroup_sk_clone(&newsk->sk_cgrp_data);
2146
2147         rcu_read_lock();
2148         filter = rcu_dereference(sk->sk_filter);
2149         if (filter != NULL)
2150                 /* though it's an empty new sock, the charging may fail
2151                  * if sysctl_optmem_max was changed between creation of
2152                  * original socket and cloning
2153                  */
2154                 is_charged = sk_filter_charge(newsk, filter);
2155         RCU_INIT_POINTER(newsk->sk_filter, filter);
2156         rcu_read_unlock();
2157
2158         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2159                 /* We need to make sure that we don't uncharge the new
2160                  * socket if we couldn't charge it in the first place
2161                  * as otherwise we uncharge the parent's filter.
2162                  */
2163                 if (!is_charged)
2164                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2165                 sk_free_unlock_clone(newsk);
2166                 newsk = NULL;
2167                 goto out;
2168         }
2169         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2170
2171         if (bpf_sk_storage_clone(sk, newsk)) {
2172                 sk_free_unlock_clone(newsk);
2173                 newsk = NULL;
2174                 goto out;
2175         }
2176
2177         /* Clear sk_user_data if parent had the pointer tagged
2178          * as not suitable for copying when cloning.
2179          */
2180         if (sk_user_data_is_nocopy(newsk))
2181                 newsk->sk_user_data = NULL;
2182
2183         newsk->sk_err      = 0;
2184         newsk->sk_err_soft = 0;
2185         newsk->sk_priority = 0;
2186         newsk->sk_incoming_cpu = raw_smp_processor_id();
2187         if (likely(newsk->sk_net_refcnt))
2188                 sock_inuse_add(sock_net(newsk), 1);
2189
2190         /* Before updating sk_refcnt, we must commit prior changes to memory
2191          * (Documentation/RCU/rculist_nulls.rst for details)
2192          */
2193         smp_wmb();
2194         refcount_set(&newsk->sk_refcnt, 2);
2195
2196         /* Increment the counter in the same struct proto as the master
2197          * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2198          * is the same as sk->sk_prot->socks, as this field was copied
2199          * with memcpy).
2200          *
2201          * This _changes_ the previous behaviour, where
2202          * tcp_create_openreq_child always was incrementing the
2203          * equivalent to tcp_prot->socks (inet_sock_nr), so this have
2204          * to be taken into account in all callers. -acme
2205          */
2206         sk_refcnt_debug_inc(newsk);
2207         sk_set_socket(newsk, NULL);
2208         sk_tx_queue_clear(newsk);
2209         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2210
2211         if (newsk->sk_prot->sockets_allocated)
2212                 sk_sockets_allocated_inc(newsk);
2213
2214         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2215                 net_enable_timestamp();
2216 out:
2217         return newsk;
2218 }
2219 EXPORT_SYMBOL_GPL(sk_clone_lock);
2220
2221 void sk_free_unlock_clone(struct sock *sk)
2222 {
2223         /* It is still raw copy of parent, so invalidate
2224          * destructor and make plain sk_free() */
2225         sk->sk_destruct = NULL;
2226         bh_unlock_sock(sk);
2227         sk_free(sk);
2228 }
2229 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2230
2231 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2232 {
2233         u32 max_segs = 1;
2234
2235         sk_dst_set(sk, dst);
2236         sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2237         if (sk->sk_route_caps & NETIF_F_GSO)
2238                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2239         sk->sk_route_caps &= ~sk->sk_route_nocaps;
2240         if (sk_can_gso(sk)) {
2241                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2242                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2243                 } else {
2244                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2245                         sk->sk_gso_max_size = dst->dev->gso_max_size;
2246                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2247                 }
2248         }
2249         sk->sk_gso_max_segs = max_segs;
2250 }
2251 EXPORT_SYMBOL_GPL(sk_setup_caps);
2252
2253 /*
2254  *      Simple resource managers for sockets.
2255  */
2256
2257
2258 /*
2259  * Write buffer destructor automatically called from kfree_skb.
2260  */
2261 void sock_wfree(struct sk_buff *skb)
2262 {
2263         struct sock *sk = skb->sk;
2264         unsigned int len = skb->truesize;
2265
2266         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2267                 /*
2268                  * Keep a reference on sk_wmem_alloc, this will be released
2269                  * after sk_write_space() call
2270                  */
2271                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2272                 sk->sk_write_space(sk);
2273                 len = 1;
2274         }
2275         /*
2276          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2277          * could not do because of in-flight packets
2278          */
2279         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2280                 __sk_free(sk);
2281 }
2282 EXPORT_SYMBOL(sock_wfree);
2283
2284 /* This variant of sock_wfree() is used by TCP,
2285  * since it sets SOCK_USE_WRITE_QUEUE.
2286  */
2287 void __sock_wfree(struct sk_buff *skb)
2288 {
2289         struct sock *sk = skb->sk;
2290
2291         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2292                 __sk_free(sk);
2293 }
2294
2295 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2296 {
2297         skb_orphan(skb);
2298         skb->sk = sk;
2299 #ifdef CONFIG_INET
2300         if (unlikely(!sk_fullsock(sk))) {
2301                 skb->destructor = sock_edemux;
2302                 sock_hold(sk);
2303                 return;
2304         }
2305 #endif
2306         skb->destructor = sock_wfree;
2307         skb_set_hash_from_sk(skb, sk);
2308         /*
2309          * We used to take a refcount on sk, but following operation
2310          * is enough to guarantee sk_free() wont free this sock until
2311          * all in-flight packets are completed
2312          */
2313         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2314 }
2315 EXPORT_SYMBOL(skb_set_owner_w);
2316
2317 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2318 {
2319 #ifdef CONFIG_TLS_DEVICE
2320         /* Drivers depend on in-order delivery for crypto offload,
2321          * partial orphan breaks out-of-order-OK logic.
2322          */
2323         if (skb->decrypted)
2324                 return false;
2325 #endif
2326         return (skb->destructor == sock_wfree ||
2327                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2328 }
2329
2330 /* This helper is used by netem, as it can hold packets in its
2331  * delay queue. We want to allow the owner socket to send more
2332  * packets, as if they were already TX completed by a typical driver.
2333  * But we also want to keep skb->sk set because some packet schedulers
2334  * rely on it (sch_fq for example).
2335  */
2336 void skb_orphan_partial(struct sk_buff *skb)
2337 {
2338         if (skb_is_tcp_pure_ack(skb))
2339                 return;
2340
2341         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2342                 return;
2343
2344         skb_orphan(skb);
2345 }
2346 EXPORT_SYMBOL(skb_orphan_partial);
2347
2348 /*
2349  * Read buffer destructor automatically called from kfree_skb.
2350  */
2351 void sock_rfree(struct sk_buff *skb)
2352 {
2353         struct sock *sk = skb->sk;
2354         unsigned int len = skb->truesize;
2355
2356         atomic_sub(len, &sk->sk_rmem_alloc);
2357         sk_mem_uncharge(sk, len);
2358 }
2359 EXPORT_SYMBOL(sock_rfree);
2360
2361 /*
2362  * Buffer destructor for skbs that are not used directly in read or write
2363  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2364  */
2365 void sock_efree(struct sk_buff *skb)
2366 {
2367         sock_put(skb->sk);
2368 }
2369 EXPORT_SYMBOL(sock_efree);
2370
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The socket reference is only dropped when sk_is_refcounted() says one
 * was taken.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        if (!sk_is_refcounted(sk))
                return;

        sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2382
2383 kuid_t sock_i_uid(struct sock *sk)
2384 {
2385         kuid_t uid;
2386
2387         read_lock_bh(&sk->sk_callback_lock);
2388         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2389         read_unlock_bh(&sk->sk_callback_lock);
2390         return uid;
2391 }
2392 EXPORT_SYMBOL(sock_i_uid);
2393
2394 unsigned long sock_i_ino(struct sock *sk)
2395 {
2396         unsigned long ino;
2397
2398         read_lock_bh(&sk->sk_callback_lock);
2399         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2400         read_unlock_bh(&sk->sk_callback_lock);
2401         return ino;
2402 }
2403 EXPORT_SYMBOL(sock_i_ino);
2404
2405 /*
2406  * Allocate a skb from the socket's send buffer.
2407  */
2408 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2409                              gfp_t priority)
2410 {
2411         if (force ||
2412             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2413                 struct sk_buff *skb = alloc_skb(size, priority);
2414
2415                 if (skb) {
2416                         skb_set_owner_w(skb, sk);
2417                         return skb;
2418                 }
2419         }
2420         return NULL;
2421 }
2422 EXPORT_SYMBOL(sock_wmalloc);
2423
2424 static void sock_ofree(struct sk_buff *skb)
2425 {
2426         struct sock *sk = skb->sk;
2427
2428         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2429 }
2430
2431 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2432                              gfp_t priority)
2433 {
2434         struct sk_buff *skb;
2435
2436         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2437         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2438             sysctl_optmem_max)
2439                 return NULL;
2440
2441         skb = alloc_skb(size, priority);
2442         if (!skb)
2443                 return NULL;
2444
2445         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2446         skb->sk = sk;
2447         skb->destructor = sock_ofree;
2448         return skb;
2449 }
2450
2451 /*
2452  * Allocate a memory block from the socket's option memory buffer.
2453  */
2454 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2455 {
2456         if ((unsigned int)size <= sysctl_optmem_max &&
2457             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2458                 void *mem;
2459                 /* First do the add, to avoid the race if kmalloc
2460                  * might sleep.
2461                  */
2462                 atomic_add(size, &sk->sk_omem_alloc);
2463                 mem = kmalloc(size, priority);
2464                 if (mem)
2465                         return mem;
2466                 atomic_sub(size, &sk->sk_omem_alloc);
2467         }
2468         return NULL;
2469 }
2470 EXPORT_SYMBOL(sock_kmalloc);
2471
2472 /* Free an option memory block. Note, we actually want the inline
2473  * here as this allows gcc to detect the nullify and fold away the
2474  * condition entirely.
2475  */
2476 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2477                                   const bool nullify)
2478 {
2479         if (WARN_ON_ONCE(!mem))
2480                 return;
2481         if (nullify)
2482                 kfree_sensitive(mem);
2483         else
2484                 kfree(mem);
2485         atomic_sub(size, &sk->sk_omem_alloc);
2486 }
2487
2488 void sock_kfree_s(struct sock *sk, void *mem, int size)
2489 {
2490         __sock_kfree_s(sk, mem, size, false);
2491 }
2492 EXPORT_SYMBOL(sock_kfree_s);
2493
2494 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2495 {
2496         __sock_kfree_s(sk, mem, size, true);
2497 }
2498 EXPORT_SYMBOL(sock_kzfree_s);
2499
2500 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2501    I think, these locks should be removed for datagram sockets.
2502  */
2503 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2504 {
2505         DEFINE_WAIT(wait);
2506
2507         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2508         for (;;) {
2509                 if (!timeo)
2510                         break;
2511                 if (signal_pending(current))
2512                         break;
2513                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2514                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2515                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2516                         break;
2517                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2518                         break;
2519                 if (sk->sk_err)
2520                         break;
2521                 timeo = schedule_timeout(timeo);
2522         }
2523         finish_wait(sk_sleep(sk), &wait);
2524         return timeo;
2525 }
2526
2527
2528 /*
2529  *      Generic send/receive buffer handlers
2530  */
2531
2532 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2533                                      unsigned long data_len, int noblock,
2534                                      int *errcode, int max_page_order)
2535 {
2536         struct sk_buff *skb;
2537         long timeo;
2538         int err;
2539
2540         timeo = sock_sndtimeo(sk, noblock);
2541         for (;;) {
2542                 err = sock_error(sk);
2543                 if (err != 0)
2544                         goto failure;
2545
2546                 err = -EPIPE;
2547                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2548                         goto failure;
2549
2550                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2551                         break;
2552
2553                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2554                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2555                 err = -EAGAIN;
2556                 if (!timeo)
2557                         goto failure;
2558                 if (signal_pending(current))
2559                         goto interrupted;
2560                 timeo = sock_wait_for_wmem(sk, timeo);
2561         }
2562         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2563                                    errcode, sk->sk_allocation);
2564         if (skb)
2565                 skb_set_owner_w(skb, sk);
2566         return skb;
2567
2568 interrupted:
2569         err = sock_intr_errno(timeo);
2570 failure:
2571         *errcode = err;
2572         return NULL;
2573 }
2574 EXPORT_SYMBOL(sock_alloc_send_pskb);
2575
/*
 * Linear-only variant of sock_alloc_send_pskb(): @size bytes of header
 * area, no paged data and a maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        const unsigned long data_len = 0;
        const int max_page_order = 0;

        return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
                                    max_page_order);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2582
2583 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2584                      struct sockcm_cookie *sockc)
2585 {
2586         u32 tsflags;
2587
2588         switch (cmsg->cmsg_type) {
2589         case SO_MARK:
2590                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2591                         return -EPERM;
2592                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2593                         return -EINVAL;
2594                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2595                 break;
2596         case SO_TIMESTAMPING_OLD:
2597                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2598                         return -EINVAL;
2599
2600                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2601                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2602                         return -EINVAL;
2603
2604                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2605                 sockc->tsflags |= tsflags;
2606                 break;
2607         case SCM_TXTIME:
2608                 if (!sock_flag(sk, SOCK_TXTIME))
2609                         return -EINVAL;
2610                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2611                         return -EINVAL;
2612                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2613                 break;
2614         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2615         case SCM_RIGHTS:
2616         case SCM_CREDENTIALS:
2617                 break;
2618         default:
2619                 return -EINVAL;
2620         }
2621         return 0;
2622 }
2623 EXPORT_SYMBOL(__sock_cmsg_send);
2624
2625 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2626                    struct sockcm_cookie *sockc)
2627 {
2628         struct cmsghdr *cmsg;
2629         int ret;
2630
2631         for_each_cmsghdr(cmsg, msg) {
2632                 if (!CMSG_OK(msg, cmsg))
2633                         return -EINVAL;
2634                 if (cmsg->cmsg_level != SOL_SOCKET)
2635                         continue;
2636                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2637                 if (ret)
2638                         return ret;
2639         }
2640         return 0;
2641 }
2642 EXPORT_SYMBOL(sock_cmsg_send);
2643
2644 static void sk_enter_memory_pressure(struct sock *sk)
2645 {
2646         if (!sk->sk_prot->enter_memory_pressure)
2647                 return;
2648
2649         sk->sk_prot->enter_memory_pressure(sk);
2650 }
2651
2652 static void sk_leave_memory_pressure(struct sock *sk)
2653 {
2654         if (sk->sk_prot->leave_memory_pressure) {
2655                 sk->sk_prot->leave_memory_pressure(sk);
2656         } else {
2657                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2658
2659                 if (memory_pressure && READ_ONCE(*memory_pressure))
2660                         WRITE_ONCE(*memory_pressure, 0);
2661         }
2662 }
2663
2664 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2665
2666 /**
2667  * skb_page_frag_refill - check that a page_frag contains enough room
2668  * @sz: minimum size of the fragment we want to get
2669  * @pfrag: pointer to page_frag
2670  * @gfp: priority for memory allocation
2671  *
2672  * Note: While this allocator tries to use high order pages, there is
2673  * no guarantee that allocations succeed. Therefore, @sz MUST be
2674  * less or equal than PAGE_SIZE.
2675  */
2676 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2677 {
2678         if (pfrag->page) {
2679                 if (page_ref_count(pfrag->page) == 1) {
2680                         pfrag->offset = 0;
2681                         return true;
2682                 }
2683                 if (pfrag->offset + sz <= pfrag->size)
2684                         return true;
2685                 put_page(pfrag->page);
2686         }
2687
2688         pfrag->offset = 0;
2689         if (SKB_FRAG_PAGE_ORDER &&
2690             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2691                 /* Avoid direct reclaim but allow kswapd to wake */
2692                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2693                                           __GFP_COMP | __GFP_NOWARN |
2694                                           __GFP_NORETRY,
2695                                           SKB_FRAG_PAGE_ORDER);
2696                 if (likely(pfrag->page)) {
2697                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2698                         return true;
2699                 }
2700         }
2701         pfrag->page = alloc_page(gfp);
2702         if (likely(pfrag->page)) {
2703                 pfrag->size = PAGE_SIZE;
2704                 return true;
2705         }
2706         return false;
2707 }
2708 EXPORT_SYMBOL(skb_page_frag_refill);
2709
2710 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2711 {
2712         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2713                 return true;
2714
2715         sk_enter_memory_pressure(sk);
2716         sk_stream_moderate_sndbuf(sk);
2717         return false;
2718 }
2719 EXPORT_SYMBOL(sk_page_frag_refill);
2720
2721 void __lock_sock(struct sock *sk)
2722         __releases(&sk->sk_lock.slock)
2723         __acquires(&sk->sk_lock.slock)
2724 {
2725         DEFINE_WAIT(wait);
2726
2727         for (;;) {
2728                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2729                                         TASK_UNINTERRUPTIBLE);
2730                 spin_unlock_bh(&sk->sk_lock.slock);
2731                 schedule();
2732                 spin_lock_bh(&sk->sk_lock.slock);
2733                 if (!sock_owned_by_user(sk))
2734                         break;
2735         }
2736         finish_wait(&sk->sk_lock.wq, &wait);
2737 }
2738
2739 void __release_sock(struct sock *sk)
2740         __releases(&sk->sk_lock.slock)
2741         __acquires(&sk->sk_lock.slock)
2742 {
2743         struct sk_buff *skb, *next;
2744
2745         while ((skb = sk->sk_backlog.head) != NULL) {
2746                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2747
2748                 spin_unlock_bh(&sk->sk_lock.slock);
2749
2750                 do {
2751                         next = skb->next;
2752                         prefetch(next);
2753                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2754                         skb_mark_not_on_list(skb);
2755                         sk_backlog_rcv(sk, skb);
2756
2757                         cond_resched();
2758
2759                         skb = next;
2760                 } while (skb != NULL);
2761
2762                 spin_lock_bh(&sk->sk_lock.slock);
2763         }
2764
2765         /*
2766          * Doing the zeroing here guarantee we can not loop forever
2767          * while a wild producer attempts to flood us.
2768          */
2769         sk->sk_backlog.len = 0;
2770 }
2771
2772 void __sk_flush_backlog(struct sock *sk)
2773 {
2774         spin_lock_bh(&sk->sk_lock.slock);
2775         __release_sock(sk);
2776         spin_unlock_bh(&sk->sk_lock.slock);
2777 }
2778
2779 /**
2780  * sk_wait_data - wait for data to arrive at sk_receive_queue
2781  * @sk:    sock to wait on
2782  * @timeo: for how long
2783  * @skb:   last skb seen on sk_receive_queue
2784  *
2785  * Now socket state including sk->sk_err is changed only under lock,
2786  * hence we may omit checks after joining wait queue.
2787  * We check receive queue before schedule() only as optimization;
2788  * it is very likely that release_sock() added new data.
2789  */
2790 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2791 {
2792         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2793         int rc;
2794
2795         add_wait_queue(sk_sleep(sk), &wait);
2796         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2797         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2798         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2799         remove_wait_queue(sk_sleep(sk), &wait);
2800         return rc;
2801 }
2802 EXPORT_SYMBOL(sk_wait_data);
2803
2804 /**
2805  *      __sk_mem_raise_allocated - increase memory_allocated
2806  *      @sk: socket
2807  *      @size: memory size to allocate
2808  *      @amt: pages to allocate
2809  *      @kind: allocation type
2810  *
2811  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2812  */
2813 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2814 {
2815         struct proto *prot = sk->sk_prot;
2816         long allocated = sk_memory_allocated_add(sk, amt);
2817         bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2818         bool charged = true;
2819
2820         if (memcg_charge &&
2821             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2822                                                 gfp_memcg_charge())))
2823                 goto suppress_allocation;
2824
2825         /* Under limit. */
2826         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2827                 sk_leave_memory_pressure(sk);
2828                 return 1;
2829         }
2830
2831         /* Under pressure. */
2832         if (allocated > sk_prot_mem_limits(sk, 1))
2833                 sk_enter_memory_pressure(sk);
2834
2835         /* Over hard limit. */
2836         if (allocated > sk_prot_mem_limits(sk, 2))
2837                 goto suppress_allocation;
2838
2839         /* guarantee minimum buffer size under pressure */
2840         if (kind == SK_MEM_RECV) {
2841                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2842                         return 1;
2843
2844         } else { /* SK_MEM_SEND */
2845                 int wmem0 = sk_get_wmem0(sk, prot);
2846
2847                 if (sk->sk_type == SOCK_STREAM) {
2848                         if (sk->sk_wmem_queued < wmem0)
2849                                 return 1;
2850                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2851                                 return 1;
2852                 }
2853         }
2854
2855         if (sk_has_memory_pressure(sk)) {
2856                 u64 alloc;
2857
2858                 if (!sk_under_memory_pressure(sk))
2859                         return 1;
2860                 alloc = sk_sockets_allocated_read_positive(sk);
2861                 if (sk_prot_mem_limits(sk, 2) > alloc *
2862                     sk_mem_pages(sk->sk_wmem_queued +
2863                                  atomic_read(&sk->sk_rmem_alloc) +
2864                                  sk->sk_forward_alloc))
2865                         return 1;
2866         }
2867
2868 suppress_allocation:
2869
2870         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2871                 sk_stream_moderate_sndbuf(sk);
2872
2873                 /* Fail only if socket is _under_ its sndbuf.
2874                  * In this case we cannot block, so that we have to fail.
2875                  */
2876                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2877                         /* Force charge with __GFP_NOFAIL */
2878                         if (memcg_charge && !charged) {
2879                                 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2880                                         gfp_memcg_charge() | __GFP_NOFAIL);
2881                         }
2882                         return 1;
2883                 }
2884         }
2885
2886         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2887                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2888
2889         sk_memory_allocated_sub(sk, amt);
2890
2891         if (memcg_charge && charged)
2892                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2893
2894         return 0;
2895 }
2896 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2897
2898 /**
2899  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2900  *      @sk: socket
2901  *      @size: memory size to allocate
2902  *      @kind: allocation type
2903  *
2904  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2905  *      rmem allocation. This function assumes that protocols which have
2906  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2907  */
2908 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2909 {
2910         int ret, amt = sk_mem_pages(size);
2911
2912         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2913         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2914         if (!ret)
2915                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2916         return ret;
2917 }
2918 EXPORT_SYMBOL(__sk_mem_schedule);
2919
2920 /**
2921  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2922  *      @sk: socket
2923  *      @amount: number of quanta
2924  *
2925  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2926  */
2927 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2928 {
2929         sk_memory_allocated_sub(sk, amount);
2930
2931         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2932                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2933
2934         if (sk_under_memory_pressure(sk) &&
2935             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2936                 sk_leave_memory_pressure(sk);
2937 }
2938 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2939
2940 /**
2941  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2942  *      @sk: socket
2943  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2944  */
2945 void __sk_mem_reclaim(struct sock *sk, int amount)
2946 {
2947         amount >>= SK_MEM_QUANTUM_SHIFT;
2948         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2949         __sk_mem_reduce_allocated(sk, amount);
2950 }
2951 EXPORT_SYMBOL(__sk_mem_reclaim);
2952
2953 int sk_set_peek_off(struct sock *sk, int val)
2954 {
2955         sk->sk_peek_off = val;
2956         return 0;
2957 }
2958 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2959
2960 /*
2961  * Set of default routines for initialising struct proto_ops when
2962  * the protocol does not support a particular function. In certain
2963  * cases where it makes no sense for a protocol to have a "do nothing"
2964  * function, some default processing is provided.
2965  */
2966
/**
 * sock_no_bind - default bind handler for protocols without bind support
 * @sock: socket (unused)
 * @saddr: address (unused)
 * @len: address length (unused)
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2971 EXPORT_SYMBOL(sock_no_bind);
2972
/**
 * sock_no_connect - default connect handler for protocols without connect
 * @sock: socket (unused)
 * @saddr: peer address (unused)
 * @len: address length (unused)
 * @flags: connect flags (unused)
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len,
		    int flags)
{
	return -EOPNOTSUPP;
}
2978 EXPORT_SYMBOL(sock_no_connect);
2979
/**
 * sock_no_socketpair - default socketpair handler for unsupporting protocols
 * @sock1: first socket (unused)
 * @sock2: second socket (unused)
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2984 EXPORT_SYMBOL(sock_no_socketpair);
2985
/**
 * sock_no_accept - default accept handler for protocols without accept
 * @sock: listening socket (unused)
 * @newsock: socket for the new connection (unused)
 * @flags: accept flags (unused)
 * @kern: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock,
		   int flags, bool kern)
{
	return -EOPNOTSUPP;
}
2991 EXPORT_SYMBOL(sock_no_accept);
2992
/**
 * sock_no_getname - default getname handler for unsupporting protocols
 * @sock: socket (unused)
 * @saddr: address buffer (unused)
 * @peer: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer)
{
	return -EOPNOTSUPP;
}
2998 EXPORT_SYMBOL(sock_no_getname);
2999
/**
 * sock_no_ioctl - default ioctl handler for protocols without ioctl support
 * @sock: socket (unused)
 * @cmd: ioctl command (unused)
 * @arg: ioctl argument (unused)
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
3004 EXPORT_SYMBOL(sock_no_ioctl);
3005
/**
 * sock_no_listen - default listen handler for protocols without listen
 * @sock: socket (unused)
 * @backlog: requested backlog (unused)
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
3010 EXPORT_SYMBOL(sock_no_listen);
3011
/**
 * sock_no_shutdown - default shutdown handler for unsupporting protocols
 * @sock: socket (unused)
 * @how: shutdown mode (unused)
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
3016 EXPORT_SYMBOL(sock_no_shutdown);
3017
/**
 * sock_no_sendmsg - default sendmsg handler for unsupporting protocols
 * @sock: socket (unused)
 * @m: message header (unused)
 * @len: message length (unused)
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
3022 EXPORT_SYMBOL(sock_no_sendmsg);
3023
/**
 * sock_no_sendmsg_locked - default sendmsg_locked handler
 * @sk: sock (unused)
 * @m: message header (unused)
 * @len: message length (unused)
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
3028 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3029
/**
 * sock_no_recvmsg - default recvmsg handler for unsupporting protocols
 * @sock: socket (unused)
 * @m: message header (unused)
 * @len: buffer length (unused)
 * @flags: receive flags (unused)
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
3035 EXPORT_SYMBOL(sock_no_recvmsg);
3036
/**
 * sock_no_mmap - default mmap handler for protocols without mmap support
 * @file: file being mapped (unused)
 * @sock: socket (unused)
 * @vma: target VMA (unused)
 *
 * Return: always -ENODEV, mirroring the error code of a missing mmap
 * method.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	return -ENODEV;
}
3042 EXPORT_SYMBOL(sock_no_mmap);
3043
3044 /*
3045  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3046  * various sock-based usage counts.
3047  */
3048 void __receive_sock(struct file *file)
3049 {
3050         struct socket *sock;
3051
3052         sock = sock_from_file(file);
3053         if (sock) {
3054                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3055                 sock_update_classid(&sock->sk->sk_cgrp_data);
3056         }
3057 }
3058
3059 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3060 {
3061         ssize_t res;
3062         struct msghdr msg = {.msg_flags = flags};
3063         struct kvec iov;
3064         char *kaddr = kmap(page);
3065         iov.iov_base = kaddr + offset;
3066         iov.iov_len = size;
3067         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3068         kunmap(page);
3069         return res;
3070 }
3071 EXPORT_SYMBOL(sock_no_sendpage);
3072
3073 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3074                                 int offset, size_t size, int flags)
3075 {
3076         ssize_t res;
3077         struct msghdr msg = {.msg_flags = flags};
3078         struct kvec iov;
3079         char *kaddr = kmap(page);
3080
3081         iov.iov_base = kaddr + offset;
3082         iov.iov_len = size;
3083         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3084         kunmap(page);
3085         return res;
3086 }
3087 EXPORT_SYMBOL(sock_no_sendpage_locked);
3088
3089 /*
3090  *      Default Socket Callbacks
3091  */
3092
3093 static void sock_def_wakeup(struct sock *sk)
3094 {
3095         struct socket_wq *wq;
3096
3097         rcu_read_lock();
3098         wq = rcu_dereference(sk->sk_wq);
3099         if (skwq_has_sleeper(wq))
3100                 wake_up_interruptible_all(&wq->wait);
3101         rcu_read_unlock();
3102 }
3103
3104 static void sock_def_error_report(struct sock *sk)
3105 {
3106         struct socket_wq *wq;
3107
3108         rcu_read_lock();
3109         wq = rcu_dereference(sk->sk_wq);
3110         if (skwq_has_sleeper(wq))
3111                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3112         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3113         rcu_read_unlock();
3114 }
3115
3116 void sock_def_readable(struct sock *sk)
3117 {
3118         struct socket_wq *wq;
3119
3120         rcu_read_lock();
3121         wq = rcu_dereference(sk->sk_wq);
3122         if (skwq_has_sleeper(wq))
3123                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3124                                                 EPOLLRDNORM | EPOLLRDBAND);
3125         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3126         rcu_read_unlock();
3127 }
3128
3129 static void sock_def_write_space(struct sock *sk)
3130 {
3131         struct socket_wq *wq;
3132
3133         rcu_read_lock();
3134
3135         /* Do not wake up a writer until he can make "significant"
3136          * progress.  --DaveM
3137          */
3138         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3139                 wq = rcu_dereference(sk->sk_wq);
3140                 if (skwq_has_sleeper(wq))
3141                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3142                                                 EPOLLWRNORM | EPOLLWRBAND);
3143
3144                 /* Should agree with poll, otherwise some programs break */
3145                 if (sock_writeable(sk))
3146                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3147         }
3148
3149         rcu_read_unlock();
3150 }
3151
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3155
3156 void sk_send_sigurg(struct sock *sk)
3157 {
3158         if (sk->sk_socket && sk->sk_socket->file)
3159                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3160                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3161 }
3162 EXPORT_SYMBOL(sk_send_sigurg);
3163
/**
 * sk_reset_timer - (re)arm a socket timer
 * @sk: socket the timer belongs to
 * @timer: timer to modify
 * @expires: new expiry time passed to mod_timer()
 *
 * A socket reference is taken when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int ret = mod_timer(timer, expires);

	if (ret == 0)
		sock_hold(sk);
}
3170 EXPORT_SYMBOL(sk_reset_timer);
3171
/**
 * sk_stop_timer - cancel a socket timer
 * @sk: socket the timer belongs to
 * @timer: timer to delete
 *
 * A socket reference is dropped with __sock_put() when del_timer()
 * returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int ret = del_timer(timer);

	if (ret != 0)
		__sock_put(sk);
}
3177 EXPORT_SYMBOL(sk_stop_timer);
3178
/**
 * sk_stop_timer_sync - cancel a socket timer using del_timer_sync()
 * @sk: socket the timer belongs to
 * @timer: timer to delete
 *
 * A socket reference is dropped with __sock_put() when del_timer_sync()
 * returns non-zero.
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	int ret = del_timer_sync(timer);

	if (ret != 0)
		__sock_put(sk);
}
3184 EXPORT_SYMBOL(sk_stop_timer_sync);
3185
3186 void sock_init_data(struct socket *sock, struct sock *sk)
3187 {
3188         sk_init_common(sk);
3189         sk->sk_send_head        =       NULL;
3190
3191         timer_setup(&sk->sk_timer, NULL, 0);
3192
3193         sk->sk_allocation       =       GFP_KERNEL;
3194         sk->sk_rcvbuf           =       sysctl_rmem_default;
3195         sk->sk_sndbuf           =       sysctl_wmem_default;
3196         sk->sk_state            =       TCP_CLOSE;
3197         sk_set_socket(sk, sock);
3198
3199         sock_set_flag(sk, SOCK_ZAPPED);
3200
3201         if (sock) {
3202                 sk->sk_type     =       sock->type;
3203                 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3204                 sock->sk        =       sk;
3205                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
3206         } else {
3207                 RCU_INIT_POINTER(sk->sk_wq, NULL);
3208                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
3209         }
3210
3211         rwlock_init(&sk->sk_callback_lock);
3212         if (sk->sk_kern_sock)
3213                 lockdep_set_class_and_name(
3214                         &sk->sk_callback_lock,
3215                         af_kern_callback_keys + sk->sk_family,
3216                         af_family_kern_clock_key_strings[sk->sk_family]);
3217         else
3218                 lockdep_set_class_and_name(
3219                         &sk->sk_callback_lock,
3220                         af_callback_keys + sk->sk_family,
3221                         af_family_clock_key_strings[sk->sk_family]);
3222
3223         sk->sk_state_change     =       sock_def_wakeup;
3224         sk->sk_data_ready       =       sock_def_readable;
3225         sk->sk_write_space      =       sock_def_write_space;
3226         sk->sk_error_report     =       sock_def_error_report;
3227         sk->sk_destruct         =       sock_def_destruct;
3228
3229         sk->sk_frag.page        =       NULL;
3230         sk->sk_frag.offset      =       0;
3231         sk->sk_peek_off         =       -1;
3232
3233         sk->sk_peer_pid         =       NULL;
3234         sk->sk_peer_cred        =       NULL;
3235         spin_lock_init(&sk->sk_peer_lock);
3236
3237         sk->sk_write_pending    =       0;
3238         sk->sk_rcvlowat         =       1;
3239         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3240         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3241
3242         sk->sk_stamp = SK_DEFAULT_STAMP;
3243 #if BITS_PER_LONG==32
3244         seqlock_init(&sk->sk_stamp_seq);
3245 #endif
3246         atomic_set(&sk->sk_zckey, 0);
3247
3248 #ifdef CONFIG_NET_RX_BUSY_POLL
3249         sk->sk_napi_id          =       0;
3250         sk->sk_ll_usec          =       sysctl_net_busy_read;
3251 #endif
3252
3253         sk->sk_max_pacing_rate = ~0UL;
3254         sk->sk_pacing_rate = ~0UL;
3255         WRITE_ONCE(sk->sk_pacing_shift, 10);
3256         sk->sk_incoming_cpu = -1;
3257
3258         sk_rx_queue_clear(sk);
3259         /*
3260          * Before updating sk_refcnt, we must commit prior changes to memory
3261          * (Documentation/RCU/rculist_nulls.rst for details)
3262          */
3263         smp_wmb();
3264         refcount_set(&sk->sk_refcnt, 1);
3265         atomic_set(&sk->sk_drops, 0);
3266 }
3267 EXPORT_SYMBOL(sock_init_data);
3268
3269 void lock_sock_nested(struct sock *sk, int subclass)
3270 {
3271         /* The sk_lock has mutex_lock() semantics here. */
3272         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3273
3274         might_sleep();
3275         spin_lock_bh(&sk->sk_lock.slock);
3276         if (sk->sk_lock.owned)
3277                 __lock_sock(sk);
3278         sk->sk_lock.owned = 1;
3279         spin_unlock_bh(&sk->sk_lock.slock);
3280 }
3281 EXPORT_SYMBOL(lock_sock_nested);
3282
3283 void release_sock(struct sock *sk)
3284 {
3285         spin_lock_bh(&sk->sk_lock.slock);
3286         if (sk->sk_backlog.tail)
3287                 __release_sock(sk);
3288
3289         /* Warning : release_cb() might need to release sk ownership,
3290          * ie call sock_release_ownership(sk) before us.
3291          */
3292         if (sk->sk_prot->release_cb)
3293                 sk->sk_prot->release_cb(sk);
3294
3295         sock_release_ownership(sk);
3296         if (waitqueue_active(&sk->sk_lock.wq))
3297                 wake_up(&sk->sk_lock.wq);
3298         spin_unlock_bh(&sk->sk_lock.slock);
3299 }
3300 EXPORT_SYMBOL(release_sock);
3301
3302 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3303 {
3304         might_sleep();
3305         spin_lock_bh(&sk->sk_lock.slock);
3306
3307         if (!sk->sk_lock.owned) {
3308                 /*
3309                  * Fast path return with bottom halves disabled and
3310                  * sock::sk_lock.slock held.
3311                  *
3312                  * The 'mutex' is not contended and holding
3313                  * sock::sk_lock.slock prevents all other lockers to
3314                  * proceed so the corresponding unlock_sock_fast() can
3315                  * avoid the slow path of release_sock() completely and
3316                  * just release slock.
3317                  *
3318                  * From a semantical POV this is equivalent to 'acquiring'
3319                  * the 'mutex', hence the corresponding lockdep
3320                  * mutex_release() has to happen in the fast path of
3321                  * unlock_sock_fast().
3322                  */
3323                 return false;
3324         }
3325
3326         __lock_sock(sk);
3327         sk->sk_lock.owned = 1;
3328         __acquire(&sk->sk_lock.slock);
3329         spin_unlock_bh(&sk->sk_lock.slock);
3330         return true;
3331 }
3332 EXPORT_SYMBOL(__lock_sock_fast);
3333
3334 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3335                    bool timeval, bool time32)
3336 {
3337         struct sock *sk = sock->sk;
3338         struct timespec64 ts;
3339
3340         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3341         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3342         if (ts.tv_sec == -1)
3343                 return -ENOENT;
3344         if (ts.tv_sec == 0) {
3345                 ktime_t kt = ktime_get_real();
3346                 sock_write_timestamp(sk, kt);
3347                 ts = ktime_to_timespec64(kt);
3348         }
3349
3350         if (timeval)
3351                 ts.tv_nsec /= 1000;
3352
3353 #ifdef CONFIG_COMPAT_32BIT_TIME
3354         if (time32)
3355                 return put_old_timespec32(&ts, userstamp);
3356 #endif
3357 #ifdef CONFIG_SPARC64
3358         /* beware of padding in sparc64 timeval */
3359         if (timeval && !in_compat_syscall()) {
3360                 struct __kernel_old_timeval __user tv = {
3361                         .tv_sec = ts.tv_sec,
3362                         .tv_usec = ts.tv_nsec,
3363                 };
3364                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3365                         return -EFAULT;
3366                 return 0;
3367         }
3368 #endif
3369         return put_timespec64(&ts, userstamp);
3370 }
3371 EXPORT_SYMBOL(sock_gettstamp);
3372
3373 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3374 {
3375         if (!sock_flag(sk, flag)) {
3376                 unsigned long previous_flags = sk->sk_flags;
3377
3378                 sock_set_flag(sk, flag);
3379                 /*
3380                  * we just set one of the two flags which require net
3381                  * time stamping, but time stamping might have been on
3382                  * already because of the other one
3383                  */
3384                 if (sock_needs_netstamp(sk) &&
3385                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3386                         net_enable_timestamp();
3387         }
3388 }
3389
3390 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3391                        int level, int type)
3392 {
3393         struct sock_exterr_skb *serr;
3394         struct sk_buff *skb;
3395         int copied, err;
3396
3397         err = -EAGAIN;
3398         skb = sock_dequeue_err_skb(sk);
3399         if (skb == NULL)
3400                 goto out;
3401
3402         copied = skb->len;
3403         if (copied > len) {
3404                 msg->msg_flags |= MSG_TRUNC;
3405                 copied = len;
3406         }
3407         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3408         if (err)
3409                 goto out_free_skb;
3410
3411         sock_recv_timestamp(msg, sk, skb);
3412
3413         serr = SKB_EXT_ERR(skb);
3414         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3415
3416         msg->msg_flags |= MSG_ERRQUEUE;
3417         err = copied;
3418
3419 out_free_skb:
3420         kfree_skb(skb);
3421 out:
3422         return err;
3423 }
3424 EXPORT_SYMBOL(sock_recv_errqueue);
3425
3426 /*
3427  *      Get a socket option on an socket.
3428  *
3429  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3430  *      asynchronous errors should be reported by getsockopt. We assume
3431  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3432  */
3433 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3434                            char __user *optval, int __user *optlen)
3435 {
3436         struct sock *sk = sock->sk;
3437
3438         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3439 }
3440 EXPORT_SYMBOL(sock_common_getsockopt);
3441
3442 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3443                         int flags)
3444 {
3445         struct sock *sk = sock->sk;
3446         int addr_len = 0;
3447         int err;
3448
3449         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3450                                    flags & ~MSG_DONTWAIT, &addr_len);
3451         if (err >= 0)
3452                 msg->msg_namelen = addr_len;
3453         return err;
3454 }
3455 EXPORT_SYMBOL(sock_common_recvmsg);
3456
3457 /*
3458  *      Set socket options on an inet socket.
3459  */
3460 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3461                            sockptr_t optval, unsigned int optlen)
3462 {
3463         struct sock *sk = sock->sk;
3464
3465         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3466 }
3467 EXPORT_SYMBOL(sock_common_setsockopt);
3468
3469 void sk_common_release(struct sock *sk)
3470 {
3471         if (sk->sk_prot->destroy)
3472                 sk->sk_prot->destroy(sk);
3473
3474         /*
3475          * Observation: when sk_common_release is called, processes have
3476          * no access to socket. But net still has.
3477          * Step one, detach it from networking:
3478          *
3479          * A. Remove from hash tables.
3480          */
3481
3482         sk->sk_prot->unhash(sk);
3483
3484         /*
3485          * In this point socket cannot receive new packets, but it is possible
3486          * that some packets are in flight because some CPU runs receiver and
3487          * did hash table lookup before we unhashed socket. They will achieve
3488          * receive queue and will be purged by socket destructor.
3489          *
3490          * Also we still have packets pending on receive queue and probably,
3491          * our own packets waiting in device queues. sock_destroy will drain
3492          * receive queue, but transmitted packets will delay socket destruction
3493          * until the last reference will be released.
3494          */
3495
3496         sock_orphan(sk);
3497
3498         xfrm_sk_free_policy(sk);
3499
3500         sk_refcnt_debug_release(sk);
3501
3502         sock_put(sk);
3503 }
3504 EXPORT_SYMBOL(sk_common_release);
3505
3506 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3507 {
3508         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3509
3510         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3511         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3512         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3513         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3514         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3515         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3516         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3517         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3518         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3519 }
3520
3521 #ifdef CONFIG_PROC_FS
3522 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
3523 struct prot_inuse {
3524         int val[PROTO_INUSE_NR];
3525 };
3526
3527 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3528
3529 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3530 {
3531         __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3532 }
3533 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3534
3535 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3536 {
3537         int cpu, idx = prot->inuse_idx;
3538         int res = 0;
3539
3540         for_each_possible_cpu(cpu)
3541                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3542
3543         return res >= 0 ? res : 0;
3544 }
3545 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3546
3547 static void sock_inuse_add(struct net *net, int val)
3548 {
3549         this_cpu_add(*net->core.sock_inuse, val);
3550 }
3551
3552 int sock_inuse_get(struct net *net)
3553 {
3554         int cpu, res = 0;
3555
3556         for_each_possible_cpu(cpu)
3557                 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3558
3559         return res;
3560 }
3561
3562 EXPORT_SYMBOL_GPL(sock_inuse_get);
3563
3564 static int __net_init sock_inuse_init_net(struct net *net)
3565 {
3566         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3567         if (net->core.prot_inuse == NULL)
3568                 return -ENOMEM;
3569
3570         net->core.sock_inuse = alloc_percpu(int);
3571         if (net->core.sock_inuse == NULL)
3572                 goto out;
3573
3574         return 0;
3575
3576 out:
3577         free_percpu(net->core.prot_inuse);
3578         return -ENOMEM;
3579 }
3580
3581 static void __net_exit sock_inuse_exit_net(struct net *net)
3582 {
3583         free_percpu(net->core.prot_inuse);
3584         free_percpu(net->core.sock_inuse);
3585 }
3586
3587 static struct pernet_operations net_inuse_ops = {
3588         .init = sock_inuse_init_net,
3589         .exit = sock_inuse_exit_net,
3590 };
3591
3592 static __init int net_inuse_init(void)
3593 {
3594         if (register_pernet_subsys(&net_inuse_ops))
3595                 panic("Cannot initialize net inuse counters");
3596
3597         return 0;
3598 }
3599
3600 core_initcall(net_inuse_init);
3601
3602 static int assign_proto_idx(struct proto *prot)
3603 {
3604         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3605
3606         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3607                 pr_err("PROTO_INUSE_NR exhausted\n");
3608                 return -ENOSPC;
3609         }
3610
3611         set_bit(prot->inuse_idx, proto_inuse_idx);
3612         return 0;
3613 }
3614
3615 static void release_proto_idx(struct proto *prot)
3616 {
3617         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3618                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3619 }
3620 #else
/* Without CONFIG_PROC_FS there are no in-use counters: these are no-ops. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
}

static void sock_inuse_add(struct net *net, int val)
{
}
3633 #endif
3634
3635 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3636 {
3637         if (!twsk_prot)
3638                 return;
3639         kfree(twsk_prot->twsk_slab_name);
3640         twsk_prot->twsk_slab_name = NULL;
3641         kmem_cache_destroy(twsk_prot->twsk_slab);
3642         twsk_prot->twsk_slab = NULL;
3643 }
3644
3645 static int tw_prot_init(const struct proto *prot)
3646 {
3647         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3648
3649         if (!twsk_prot)
3650                 return 0;
3651
3652         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3653                                               prot->name);
3654         if (!twsk_prot->twsk_slab_name)
3655                 return -ENOMEM;
3656
3657         twsk_prot->twsk_slab =
3658                 kmem_cache_create(twsk_prot->twsk_slab_name,
3659                                   twsk_prot->twsk_obj_size, 0,
3660                                   SLAB_ACCOUNT | prot->slab_flags,
3661                                   NULL);
3662         if (!twsk_prot->twsk_slab) {
3663                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3664                         prot->name);
3665                 return -ENOMEM;
3666         }
3667
3668         return 0;
3669 }
3670
3671 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3672 {
3673         if (!rsk_prot)
3674                 return;
3675         kfree(rsk_prot->slab_name);
3676         rsk_prot->slab_name = NULL;
3677         kmem_cache_destroy(rsk_prot->slab);
3678         rsk_prot->slab = NULL;
3679 }
3680
3681 static int req_prot_init(const struct proto *prot)
3682 {
3683         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3684
3685         if (!rsk_prot)
3686                 return 0;
3687
3688         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3689                                         prot->name);
3690         if (!rsk_prot->slab_name)
3691                 return -ENOMEM;
3692
3693         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3694                                            rsk_prot->obj_size, 0,
3695                                            SLAB_ACCOUNT | prot->slab_flags,
3696                                            NULL);
3697
3698         if (!rsk_prot->slab) {
3699                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3700                         prot->name);
3701                 return -ENOMEM;
3702         }
3703         return 0;
3704 }
3705
3706 int proto_register(struct proto *prot, int alloc_slab)
3707 {
3708         int ret = -ENOBUFS;
3709
3710         if (alloc_slab) {
3711                 prot->slab = kmem_cache_create_usercopy(prot->name,
3712                                         prot->obj_size, 0,
3713                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3714                                         prot->slab_flags,
3715                                         prot->useroffset, prot->usersize,
3716                                         NULL);
3717
3718                 if (prot->slab == NULL) {
3719                         pr_crit("%s: Can't create sock SLAB cache!\n",
3720                                 prot->name);
3721                         goto out;
3722                 }
3723
3724                 if (req_prot_init(prot))
3725                         goto out_free_request_sock_slab;
3726
3727                 if (tw_prot_init(prot))
3728                         goto out_free_timewait_sock_slab;
3729         }
3730
3731         mutex_lock(&proto_list_mutex);
3732         ret = assign_proto_idx(prot);
3733         if (ret) {
3734                 mutex_unlock(&proto_list_mutex);
3735                 goto out_free_timewait_sock_slab;
3736         }
3737         list_add(&prot->node, &proto_list);
3738         mutex_unlock(&proto_list_mutex);
3739         return ret;
3740
3741 out_free_timewait_sock_slab:
3742         if (alloc_slab)
3743                 tw_prot_cleanup(prot->twsk_prot);
3744 out_free_request_sock_slab:
3745         if (alloc_slab) {
3746                 req_prot_cleanup(prot->rsk_prot);
3747
3748                 kmem_cache_destroy(prot->slab);
3749                 prot->slab = NULL;
3750         }
3751 out:
3752         return ret;
3753 }
3754 EXPORT_SYMBOL(proto_register);
3755
3756 void proto_unregister(struct proto *prot)
3757 {
3758         mutex_lock(&proto_list_mutex);
3759         release_proto_idx(prot);
3760         list_del(&prot->node);
3761         mutex_unlock(&proto_list_mutex);
3762
3763         kmem_cache_destroy(prot->slab);
3764         prot->slab = NULL;
3765
3766         req_prot_cleanup(prot->rsk_prot);
3767         tw_prot_cleanup(prot->twsk_prot);
3768 }
3769 EXPORT_SYMBOL(proto_unregister);
3770
3771 int sock_load_diag_module(int family, int protocol)
3772 {
3773         if (!protocol) {
3774                 if (!sock_is_registered(family))
3775                         return -ENOENT;
3776
3777                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3778                                       NETLINK_SOCK_DIAG, family);
3779         }
3780
3781 #ifdef CONFIG_INET
3782         if (family == AF_INET &&
3783             protocol != IPPROTO_RAW &&
3784             protocol < MAX_INET_PROTOS &&
3785             !rcu_access_pointer(inet_protos[protocol]))
3786                 return -ENOENT;
3787 #endif
3788
3789         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3790                               NETLINK_SOCK_DIAG, family, protocol);
3791 }
3792 EXPORT_SYMBOL(sock_load_diag_module);
3793
3794 #ifdef CONFIG_PROC_FS
3795 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3796         __acquires(proto_list_mutex)
3797 {
3798         mutex_lock(&proto_list_mutex);
3799         return seq_list_start_head(&proto_list, *pos);
3800 }
3801
3802 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3803 {
3804         return seq_list_next(v, &proto_list, pos);
3805 }
3806
3807 static void proto_seq_stop(struct seq_file *seq, void *v)
3808         __releases(proto_list_mutex)
3809 {
3810         mutex_unlock(&proto_list_mutex);
3811 }
3812
/* Map a method pointer to 'y' when it is set and 'n' when it is NULL. */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
3817 static long sock_prot_memory_allocated(struct proto *proto)
3818 {
3819         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3820 }
3821
3822 static const char *sock_prot_memory_pressure(struct proto *proto)
3823 {
3824         return proto->memory_pressure != NULL ?
3825         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3826 }
3827
3828 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3829 {
3830
3831         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3832                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3833                    proto->name,
3834                    proto->obj_size,
3835                    sock_prot_inuse_get(seq_file_net(seq), proto),
3836                    sock_prot_memory_allocated(proto),
3837                    sock_prot_memory_pressure(proto),
3838                    proto->max_header,
3839                    proto->slab == NULL ? "no" : "yes",
3840                    module_name(proto->owner),
3841                    proto_method_implemented(proto->close),
3842                    proto_method_implemented(proto->connect),
3843                    proto_method_implemented(proto->disconnect),
3844                    proto_method_implemented(proto->accept),
3845                    proto_method_implemented(proto->ioctl),
3846                    proto_method_implemented(proto->init),
3847                    proto_method_implemented(proto->destroy),
3848                    proto_method_implemented(proto->shutdown),
3849                    proto_method_implemented(proto->setsockopt),
3850                    proto_method_implemented(proto->getsockopt),
3851                    proto_method_implemented(proto->sendmsg),
3852                    proto_method_implemented(proto->recvmsg),
3853                    proto_method_implemented(proto->sendpage),
3854                    proto_method_implemented(proto->bind),
3855                    proto_method_implemented(proto->backlog_rcv),
3856                    proto_method_implemented(proto->hash),
3857                    proto_method_implemented(proto->unhash),
3858                    proto_method_implemented(proto->get_port),
3859                    proto_method_implemented(proto->enter_memory_pressure));
3860 }
3861
3862 static int proto_seq_show(struct seq_file *seq, void *v)
3863 {
3864         if (v == &proto_list)
3865                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3866                            "protocol",
3867                            "size",
3868                            "sockets",
3869                            "memory",
3870                            "press",
3871                            "maxhdr",
3872                            "slab",
3873                            "module",
3874                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3875         else
3876                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3877         return 0;
3878 }
3879
3880 static const struct seq_operations proto_seq_ops = {
3881         .start  = proto_seq_start,
3882         .next   = proto_seq_next,
3883         .stop   = proto_seq_stop,
3884         .show   = proto_seq_show,
3885 };
3886
3887 static __net_init int proto_init_net(struct net *net)
3888 {
3889         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3890                         sizeof(struct seq_net_private)))
3891                 return -ENOMEM;
3892
3893         return 0;
3894 }
3895
3896 static __net_exit void proto_exit_net(struct net *net)
3897 {
3898         remove_proc_entry("protocols", net->proc_net);
3899 }
3900
3901
3902 static __net_initdata struct pernet_operations proto_net_ops = {
3903         .init = proto_init_net,
3904         .exit = proto_exit_net,
3905 };
3906
3907 static int __init proto_init(void)
3908 {
3909         return register_pernet_subsys(&proto_net_ops);
3910 }
3911
3912 subsys_initcall(proto_init);
3913
3914 #endif /* PROC_FS */
3915
3916 #ifdef CONFIG_NET_RX_BUSY_POLL
3917 bool sk_busy_loop_end(void *p, unsigned long start_time)
3918 {
3919         struct sock *sk = p;
3920
3921         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3922                sk_busy_loop_timeout(sk, start_time);
3923 }
3924 EXPORT_SYMBOL(sk_busy_loop_end);
3925 #endif /* CONFIG_NET_RX_BUSY_POLL */
3926
3927 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3928 {
3929         if (!sk->sk_prot->bind_add)
3930                 return -EOPNOTSUPP;
3931         return sk->sk_prot->bind_add(sk, addr, addr_len);
3932 }
3933 EXPORT_SYMBOL(sock_bind_add);