net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <[email protected]>
  12  *              Florian La Roche, <[email protected]>
  13  *              Alan Cox, <[email protected]>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/timer.h>
 106 #include <linux/string.h>
 107 #include <linux/sockios.h>
 108 #include <linux/net.h>
 109 #include <linux/mm.h>
 110 #include <linux/slab.h>
 111 #include <linux/interrupt.h>
 112 #include <linux/poll.h>
 113 #include <linux/tcp.h>
 114 #include <linux/init.h>
 115 #include <linux/highmem.h>
 116 #include <linux/user_namespace.h>
 117 #include <linux/static_key.h>
 118 #include <linux/memcontrol.h>
 119 #include <linux/prefetch.h>
 120
 121 #include <linux/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138
 139 #include <trace/events/sock.h>
 140
 141 #ifdef CONFIG_INET
 142 #include <net/tcp.h>
 143 #endif
 144
 145 #include <net/busy_poll.h>
 146
 147 static DEFINE_MUTEX(proto_list_mutex);
 148 static LIST_HEAD(proto_list);
 149
 150 /**
 151  * sk_ns_capable - General socket capability test
 152  * @sk: Socket to use a capability on or through
 153  * @user_ns: The user namespace of the capability to use
 154  * @cap: The capability to use
 155  *
 156  * Test to see if the opener of the socket had when the socket was
 157  * created and the current process has the capability @cap in the user
 158  * namespace @user_ns.
 159  */
 160 bool sk_ns_capable(const struct sock *sk,
 161                    struct user_namespace *user_ns, int cap)
 162 {
 163         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                 ns_capable(user_ns, cap);
 165 }
 166 EXPORT_SYMBOL(sk_ns_capable);
 167
 168 /**
 169  * sk_capable - Socket global capability test
 170  * @sk: Socket to use a capability on or through
 171  * @cap: The global capability to use
 172  *
 173  * Test to see if the opener of the socket had when the socket was
 174  * created and the current process has the capability @cap in all user
 175  * namespaces.
 176  */
 177 bool sk_capable(const struct sock *sk, int cap)
 178 {
 179         return sk_ns_capable(sk, &init_user_ns, cap);
 180 }
 181 EXPORT_SYMBOL(sk_capable);
 182
 183 /**
 184  * sk_net_capable - Network namespace socket capability test
 185  * @sk: Socket to use a capability on or through
 186  * @cap: The capability to use
 187  *
 188  * Test to see if the opener of the socket had when the socket was created
 189  * and the current process has the capability @cap over the network namespace
 190  * the socket is a member of.
 191  */
 192 bool sk_net_capable(const struct sock *sk, int cap)
 193 {
 194         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195 }
 196 EXPORT_SYMBOL(sk_net_capable);
 197
 198 /*
 199  * Each address family might have different locking rules, so we have
 200  * one slock key per address family and separate keys for internal and
 201  * userspace sockets.
 202  */
 203 static struct lock_class_key af_family_keys[AF_MAX];
 204 static struct lock_class_key af_family_kern_keys[AF_MAX];
 205 static struct lock_class_key af_family_slock_keys[AF_MAX];
 206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 207
 208 /*
 209  * Make lock validator output more readable. (we pre-construct these
 210  * strings build-time, so that runtime initialization of socket
 211  * locks is fast):
 212  */
 213
 214 #define _sock_locks(x)                                            \
 215   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 216   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 217   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 218   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 219   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 220   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 221   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 222   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 223   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 224   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 225   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 226   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 227   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 228   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 229   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_MAX"
 230
 231 static const char *const af_family_key_strings[AF_MAX+1] = {
 232         _sock_locks("sk_lock-")
 233 };
 234 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 235         _sock_locks("slock-")
 236 };
 237 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 238         _sock_locks("clock-")
 239 };
 240
 241 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 242         _sock_locks("k-sk_lock-")
 243 };
 244 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 245         _sock_locks("k-slock-")
 246 };
 247 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 248         _sock_locks("k-clock-")
 249 };
 250
 251 /*
 252  * sk_callback_lock locking rules are per-address-family,
 253  * so split the lock classes by using a per-AF key:
 254  */
 255 static struct lock_class_key af_callback_keys[AF_MAX];
 256 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 257
 258 /* Take into consideration the size of the struct sk_buff overhead in the
 259  * determination of these values, since that is non-constant across
 260  * platforms.  This makes socket queueing behavior and performance
 261  * not depend upon such differences.
 262  */
 263 #define _SK_MEM_PACKETS         256
 264 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 265 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 266 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 267
 268 /* Run time adjustable parameters. */
 269 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 270 EXPORT_SYMBOL(sysctl_wmem_max);
 271 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 272 EXPORT_SYMBOL(sysctl_rmem_max);
 273 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 274 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 275
 276 /* Maximal space eaten by iovec or ancillary data plus some space */
 277 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 278 EXPORT_SYMBOL(sysctl_optmem_max);
 279
 280 int sysctl_tstamp_allow_data __read_mostly = 1;
 281
 282 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 283 EXPORT_SYMBOL_GPL(memalloc_socks);
 284
 285 /**
 286  * sk_set_memalloc - sets %SOCK_MEMALLOC
 287  * @sk: socket to set it on
 288  *
 289  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 290  * It's the responsibility of the admin to adjust min_free_kbytes
 291  * to meet the requirements
 292  */
 293 void sk_set_memalloc(struct sock *sk)
 294 {
 295         sock_set_flag(sk, SOCK_MEMALLOC);
 296         sk->sk_allocation |= __GFP_MEMALLOC;
 297         static_key_slow_inc(&memalloc_socks);
 298 }
 299 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 300
 301 void sk_clear_memalloc(struct sock *sk)
 302 {
 303         sock_reset_flag(sk, SOCK_MEMALLOC);
 304         sk->sk_allocation &= ~__GFP_MEMALLOC;
 305         static_key_slow_dec(&memalloc_socks);
 306
 307         /*
 308          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 309          * progress of swapping. SOCK_MEMALLOC may be cleared while
 310          * it has rmem allocations due to the last swapfile being deactivated
 311          * but there is a risk that the socket is unusable due to exceeding
 312          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 313          */
 314         sk_mem_reclaim(sk);
 315 }
 316 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 317
 318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 319 {
 320         int ret;
 321         unsigned long pflags = current->flags;
 322
 323         /* these should have been dropped before queueing */
 324         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 325
 326         current->flags |= PF_MEMALLOC;
 327         ret = sk->sk_backlog_rcv(sk, skb);
 328         tsk_restore_flags(current, pflags, PF_MEMALLOC);
 329
 330         return ret;
 331 }
 332 EXPORT_SYMBOL(__sk_backlog_rcv);
 333
 334 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 335 {
 336         struct timeval tv;
 337
 338         if (optlen < sizeof(tv))
 339                 return -EINVAL;
 340         if (copy_from_user(&tv, optval, sizeof(tv)))
 341                 return -EFAULT;
 342         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 343                 return -EDOM;
 344
 345         if (tv.tv_sec < 0) {
 346                 static int warned __read_mostly;
 347
 348                 *timeo_p = 0;
 349                 if (warned < 10 && net_ratelimit()) {
 350                         warned++;
 351                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 352                                 __func__, current->comm, task_pid_nr(current));
 353                 }
 354                 return 0;
 355         }
 356         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 357         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 358                 return 0;
 359         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 360                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 361         return 0;
 362 }
 363
 364 static void sock_warn_obsolete_bsdism(const char *name)
 365 {
 366         static int warned;
 367         static char warncomm[TASK_COMM_LEN];
 368         if (strcmp(warncomm, current->comm) && warned < 5) {
 369                 strcpy(warncomm,  current->comm);
 370                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 371                         warncomm, name);
 372                 warned++;
 373         }
 374 }
 375
 376 static bool sock_needs_netstamp(const struct sock *sk)
 377 {
 378         switch (sk->sk_family) {
 379         case AF_UNSPEC:
 380         case AF_UNIX:
 381                 return false;
 382         default:
 383                 return true;
 384         }
 385 }
 386
 387 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 388 {
 389         if (sk->sk_flags & flags) {
 390                 sk->sk_flags &= ~flags;
 391                 if (sock_needs_netstamp(sk) &&
 392                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 393                         net_disable_timestamp();
 394         }
 395 }
 396
 397
 398 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 399 {
 400         unsigned long flags;
 401         struct sk_buff_head *list = &sk->sk_receive_queue;
 402
 403         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 404                 atomic_inc(&sk->sk_drops);
 405                 trace_sock_rcvqueue_full(sk, skb);
 406                 return -ENOMEM;
 407         }
 408
 409         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 410                 atomic_inc(&sk->sk_drops);
 411                 return -ENOBUFS;
 412         }
 413
 414         skb->dev = NULL;
 415         skb_set_owner_r(skb, sk);
 416
 417         /* we escape from rcu protected region, make sure we dont leak
 418          * a norefcounted dst
 419          */
 420         skb_dst_force(skb);
 421
 422         spin_lock_irqsave(&list->lock, flags);
 423         sock_skb_set_dropcount(sk, skb);
 424         __skb_queue_tail(list, skb);
 425         spin_unlock_irqrestore(&list->lock, flags);
 426
 427         if (!sock_flag(sk, SOCK_DEAD))
 428                 sk->sk_data_ready(sk);
 429         return 0;
 430 }
 431 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 432
 433 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 434 {
 435         int err;
 436
 437         err = sk_filter(sk, skb);
 438         if (err)
 439                 return err;
 440
 441         return __sock_queue_rcv_skb(sk, skb);
 442 }
 443 EXPORT_SYMBOL(sock_queue_rcv_skb);
 444
 445 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 446                      const int nested, unsigned int trim_cap, bool refcounted)
 447 {
 448         int rc = NET_RX_SUCCESS;
 449
 450         if (sk_filter_trim_cap(sk, skb, trim_cap))
 451                 goto discard_and_relse;
 452
 453         skb->dev = NULL;
 454
 455         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 456                 atomic_inc(&sk->sk_drops);
 457                 goto discard_and_relse;
 458         }
 459         if (nested)
 460                 bh_lock_sock_nested(sk);
 461         else
 462                 bh_lock_sock(sk);
 463         if (!sock_owned_by_user(sk)) {
 464                 /*
 465                  * trylock + unlock semantics:
 466                  */
 467                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 468
 469                 rc = sk_backlog_rcv(sk, skb);
 470
 471                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 472         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 473                 bh_unlock_sock(sk);
 474                 atomic_inc(&sk->sk_drops);
 475                 goto discard_and_relse;
 476         }
 477
 478         bh_unlock_sock(sk);
 479 out:
 480         if (refcounted)
 481                 sock_put(sk);
 482         return rc;
 483 discard_and_relse:
 484         kfree_skb(skb);
 485         goto out;
 486 }
 487 EXPORT_SYMBOL(__sk_receive_skb);
 488
 489 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 490 {
 491         struct dst_entry *dst = __sk_dst_get(sk);
 492
 493         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 494                 sk_tx_queue_clear(sk);
 495                 sk->sk_dst_pending_confirm = 0;
 496                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 497                 dst_release(dst);
 498                 return NULL;
 499         }
 500
 501         return dst;
 502 }
 503 EXPORT_SYMBOL(__sk_dst_check);
 504
 505 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 506 {
 507         struct dst_entry *dst = sk_dst_get(sk);
 508
 509         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 510                 sk_dst_reset(sk);
 511                 dst_release(dst);
 512                 return NULL;
 513         }
 514
 515         return dst;
 516 }
 517 EXPORT_SYMBOL(sk_dst_check);
 518
 519 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 520                                 int optlen)
 521 {
 522         int ret = -ENOPROTOOPT;
 523 #ifdef CONFIG_NETDEVICES
 524         struct net *net = sock_net(sk);
 525         char devname[IFNAMSIZ];
 526         int index;
 527
 528         /* Sorry... */
 529         ret = -EPERM;
 530         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 531                 goto out;
 532
 533         ret = -EINVAL;
 534         if (optlen < 0)
 535                 goto out;
 536
 537         /* Bind this socket to a particular device like "eth0",
 538          * as specified in the passed interface name. If the
 539          * name is "" or the option length is zero the socket
 540          * is not bound.
 541          */
 542         if (optlen > IFNAMSIZ - 1)
 543                 optlen = IFNAMSIZ - 1;
 544         memset(devname, 0, sizeof(devname));
 545
 546         ret = -EFAULT;
 547         if (copy_from_user(devname, optval, optlen))
 548                 goto out;
 549
 550         index = 0;
 551         if (devname[0] != '\0') {
 552                 struct net_device *dev;
 553
 554                 rcu_read_lock();
 555                 dev = dev_get_by_name_rcu(net, devname);
 556                 if (dev)
 557                         index = dev->ifindex;
 558                 rcu_read_unlock();
 559                 ret = -ENODEV;
 560                 if (!dev)
 561                         goto out;
 562         }
 563
 564         lock_sock(sk);
 565         sk->sk_bound_dev_if = index;
 566         sk_dst_reset(sk);
 567         release_sock(sk);
 568
 569         ret = 0;
 570
 571 out:
 572 #endif
 573
 574         return ret;
 575 }
 576
 577 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 578                                 int __user *optlen, int len)
 579 {
 580         int ret = -ENOPROTOOPT;
 581 #ifdef CONFIG_NETDEVICES
 582         struct net *net = sock_net(sk);
 583         char devname[IFNAMSIZ];
 584
 585         if (sk->sk_bound_dev_if == 0) {
 586                 len = 0;
 587                 goto zero;
 588         }
 589
 590         ret = -EINVAL;
 591         if (len < IFNAMSIZ)
 592                 goto out;
 593
 594         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 595         if (ret)
 596                 goto out;
 597
 598         len = strlen(devname) + 1;
 599
 600         ret = -EFAULT;
 601         if (copy_to_user(optval, devname, len))
 602                 goto out;
 603
 604 zero:
 605         ret = -EFAULT;
 606         if (put_user(len, optlen))
 607                 goto out;
 608
 609         ret = 0;
 610
 611 out:
 612 #endif
 613
 614         return ret;
 615 }
 616
 617 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 618 {
 619         if (valbool)
 620                 sock_set_flag(sk, bit);
 621         else
 622                 sock_reset_flag(sk, bit);
 623 }
 624
 625 bool sk_mc_loop(struct sock *sk)
 626 {
 627         if (dev_recursion_level())
 628                 return false;
 629         if (!sk)
 630                 return true;
 631         switch (sk->sk_family) {
 632         case AF_INET:
 633                 return inet_sk(sk)->mc_loop;
 634 #if IS_ENABLED(CONFIG_IPV6)
 635         case AF_INET6:
 636                 return inet6_sk(sk)->mc_loop;
 637 #endif
 638         }
 639         WARN_ON(1);
 640         return true;
 641 }
 642 EXPORT_SYMBOL(sk_mc_loop);
 643
 644 /*
 645  *      This is meant for all protocols to use and covers goings on
 646  *      at the socket level. Everything here is generic.
 647  */
 648
 649 int sock_setsockopt(struct socket *sock, int level, int optname,
 650                     char __user *optval, unsigned int optlen)
 651 {
 652         struct sock *sk = sock->sk;
 653         int val;
 654         int valbool;
 655         struct linger ling;
 656         int ret = 0;
 657
 658         /*
 659          *      Options without arguments
 660          */
 661
 662         if (optname == SO_BINDTODEVICE)
 663                 return sock_setbindtodevice(sk, optval, optlen);
 664
 665         if (optlen < sizeof(int))
 666                 return -EINVAL;
 667
 668         if (get_user(val, (int __user *)optval))
 669                 return -EFAULT;
 670
 671         valbool = val ? 1 : 0;
 672
 673         lock_sock(sk);
 674
 675         switch (optname) {
 676         case SO_DEBUG:
 677                 if (val && !capable(CAP_NET_ADMIN))
 678                         ret = -EACCES;
 679                 else
 680                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 681                 break;
 682         case SO_REUSEADDR:
 683                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 684                 break;
 685         case SO_REUSEPORT:
 686                 sk->sk_reuseport = valbool;
 687                 break;
 688         case SO_TYPE:
 689         case SO_PROTOCOL:
 690         case SO_DOMAIN:
 691         case SO_ERROR:
 692                 ret = -ENOPROTOOPT;
 693                 break;
 694         case SO_DONTROUTE:
 695                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 696                 break;
 697         case SO_BROADCAST:
 698                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 699                 break;
 700         case SO_SNDBUF:
 701                 /* Don't error on this BSD doesn't and if you think
 702                  * about it this is right. Otherwise apps have to
 703                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 704                  * are treated in BSD as hints
 705                  */
 706                 val = min_t(u32, val, sysctl_wmem_max);
 707 set_sndbuf:
 708                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 709                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 710                 /* Wake up sending tasks if we upped the value. */
 711                 sk->sk_write_space(sk);
 712                 break;
 713
 714         case SO_SNDBUFFORCE:
 715                 if (!capable(CAP_NET_ADMIN)) {
 716                         ret = -EPERM;
 717                         break;
 718                 }
 719                 goto set_sndbuf;
 720
 721         case SO_RCVBUF:
 722                 /* Don't error on this BSD doesn't and if you think
 723                  * about it this is right. Otherwise apps have to
 724                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 725                  * are treated in BSD as hints
 726                  */
 727                 val = min_t(u32, val, sysctl_rmem_max);
 728 set_rcvbuf:
 729                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 730                 /*
 731                  * We double it on the way in to account for
 732                  * "struct sk_buff" etc. overhead.   Applications
 733                  * assume that the SO_RCVBUF setting they make will
 734                  * allow that much actual data to be received on that
 735                  * socket.
 736                  *
 737                  * Applications are unaware that "struct sk_buff" and
 738                  * other overheads allocate from the receive buffer
 739                  * during socket buffer allocation.
 740                  *
 741                  * And after considering the possible alternatives,
 742                  * returning the value we actually used in getsockopt
 743                  * is the most desirable behavior.
 744                  */
 745                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 746                 break;
 747
 748         case SO_RCVBUFFORCE:
 749                 if (!capable(CAP_NET_ADMIN)) {
 750                         ret = -EPERM;
 751                         break;
 752                 }
 753                 goto set_rcvbuf;
 754
 755         case SO_KEEPALIVE:
 756                 if (sk->sk_prot->keepalive)
 757                         sk->sk_prot->keepalive(sk, valbool);
 758                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 759                 break;
 760
 761         case SO_OOBINLINE:
 762                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 763                 break;
 764
 765         case SO_NO_CHECK:
 766                 sk->sk_no_check_tx = valbool;
 767                 break;
 768
 769         case SO_PRIORITY:
 770                 if ((val >= 0 && val <= 6) ||
 771                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 772                         sk->sk_priority = val;
 773                 else
 774                         ret = -EPERM;
 775                 break;
 776
 777         case SO_LINGER:
 778                 if (optlen < sizeof(ling)) {
 779                         ret = -EINVAL;  /* 1003.1g */
 780                         break;
 781                 }
 782                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 783                         ret = -EFAULT;
 784                         break;
 785                 }
 786                 if (!ling.l_onoff)
 787                         sock_reset_flag(sk, SOCK_LINGER);
 788                 else {
 789 #if (BITS_PER_LONG == 32)
 790                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 791                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 792                         else
 793 #endif
 794                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 795                         sock_set_flag(sk, SOCK_LINGER);
 796                 }
 797                 break;
 798
 799         case SO_BSDCOMPAT:
 800                 sock_warn_obsolete_bsdism("setsockopt");
 801                 break;
 802
 803         case SO_PASSCRED:
 804                 if (valbool)
 805                         set_bit(SOCK_PASSCRED, &sock->flags);
 806                 else
 807                         clear_bit(SOCK_PASSCRED, &sock->flags);
 808                 break;
 809
 810         case SO_TIMESTAMP:
 811         case SO_TIMESTAMPNS:
 812                 if (valbool)  {
 813                         if (optname == SO_TIMESTAMP)
 814                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 815                         else
 816                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 817                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 818                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 819                 } else {
 820                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 821                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 822                 }
 823                 break;
 824
 825         case SO_TIMESTAMPING:
 826                 if (val & ~SOF_TIMESTAMPING_MASK) {
 827                         ret = -EINVAL;
 828                         break;
 829                 }
 830
 831                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 832                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 833                         if (sk->sk_protocol == IPPROTO_TCP &&
 834                             sk->sk_type == SOCK_STREAM) {
 835                                 if ((1 << sk->sk_state) &
 836                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 837                                         ret = -EINVAL;
 838                                         break;
 839                                 }
 840                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 841                         } else {
 842                                 sk->sk_tskey = 0;
 843                         }
 844                 }
 845
 846                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 847                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 848                         ret = -EINVAL;
 849                         break;
 850                 }
 851
 852                 sk->sk_tsflags = val;
 853                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 854                         sock_enable_timestamp(sk,
 855                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 856                 else
 857                         sock_disable_timestamp(sk,
 858                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 859                 break;
 860
 861         case SO_RCVLOWAT:
 862                 if (val < 0)
 863                         val = INT_MAX;
 864                 sk->sk_rcvlowat = val ? : 1;
 865                 break;
 866
 867         case SO_RCVTIMEO:
 868                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 869                 break;
 870
 871         case SO_SNDTIMEO:
 872                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 873                 break;
 874
 875         case SO_ATTACH_FILTER:
 876                 ret = -EINVAL;
 877                 if (optlen == sizeof(struct sock_fprog)) {
 878                         struct sock_fprog fprog;
 879
 880                         ret = -EFAULT;
 881                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 882                                 break;
 883
 884                         ret = sk_attach_filter(&fprog, sk);
 885                 }
 886                 break;
 887
 888         case SO_ATTACH_BPF:
 889                 ret = -EINVAL;
 890                 if (optlen == sizeof(u32)) {
 891                         u32 ufd;
 892
 893                         ret = -EFAULT;
 894                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 895                                 break;
 896
 897                         ret = sk_attach_bpf(ufd, sk);
 898                 }
 899                 break;
 900
 901         case SO_ATTACH_REUSEPORT_CBPF:
 902                 ret = -EINVAL;
 903                 if (optlen == sizeof(struct sock_fprog)) {
 904                         struct sock_fprog fprog;
 905
 906                         ret = -EFAULT;
 907                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 908                                 break;
 909
 910                         ret = sk_reuseport_attach_filter(&fprog, sk);
 911                 }
 912                 break;
 913
 914         case SO_ATTACH_REUSEPORT_EBPF:
 915                 ret = -EINVAL;
 916                 if (optlen == sizeof(u32)) {
 917                         u32 ufd;
 918
 919                         ret = -EFAULT;
 920                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 921                                 break;
 922
 923                         ret = sk_reuseport_attach_bpf(ufd, sk);
 924                 }
 925                 break;
 926
 927         case SO_DETACH_FILTER:
 928                 ret = sk_detach_filter(sk);
 929                 break;
 930
 931         case SO_LOCK_FILTER:
 932                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 933                         ret = -EPERM;
 934                 else
 935                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 936                 break;
 937
 938         case SO_PASSSEC:
 939                 if (valbool)
 940                         set_bit(SOCK_PASSSEC, &sock->flags);
 941                 else
 942                         clear_bit(SOCK_PASSSEC, &sock->flags);
 943                 break;
 944         case SO_MARK:
 945                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 946                         ret = -EPERM;
 947                 else
 948                         sk->sk_mark = val;
 949                 break;
 950
 951         case SO_RXQ_OVFL:
 952                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 953                 break;
 954
 955         case SO_WIFI_STATUS:
 956                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 957                 break;
 958
 959         case SO_PEEK_OFF:
 960                 if (sock->ops->set_peek_off)
 961                         ret = sock->ops->set_peek_off(sk, val);
 962                 else
 963                         ret = -EOPNOTSUPP;
 964                 break;
 965
 966         case SO_NOFCS:
 967                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 968                 break;
 969
 970         case SO_SELECT_ERR_QUEUE:
 971                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 972                 break;
 973
 974 #ifdef CONFIG_NET_RX_BUSY_POLL
 975         case SO_BUSY_POLL:
 976                 /* allow unprivileged users to decrease the value */
 977                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 978                         ret = -EPERM;
 979                 else {
 980                         if (val < 0)
 981                                 ret = -EINVAL;
 982                         else
 983                                 sk->sk_ll_usec = val;
 984                 }
 985                 break;
 986 #endif
 987
 988         case SO_MAX_PACING_RATE:
 989                 sk->sk_max_pacing_rate = val;
 990                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 991                                          sk->sk_max_pacing_rate);
 992                 break;
 993
 994         case SO_INCOMING_CPU:
 995                 sk->sk_incoming_cpu = val;
 996                 break;
 997
 998         case SO_CNX_ADVICE:
 999                 if (val == 1)
1000                         dst_negative_advice(sk);
1001                 break;
1002         default:
1003                 ret = -ENOPROTOOPT;
1004                 break;
1005         }
1006         release_sock(sk);
1007         return ret;
1008 }
1009 EXPORT_SYMBOL(sock_setsockopt);
1010
1011
1012 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1013                           struct ucred *ucred)
1014 {
1015         ucred->pid = pid_vnr(pid);
1016         ucred->uid = ucred->gid = -1;
1017         if (cred) {
1018                 struct user_namespace *current_ns = current_user_ns();
1019
1020                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1021                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1022         }
1023 }
1024
1025 int sock_getsockopt(struct socket *sock, int level, int optname,
1026                     char __user *optval, int __user *optlen)
1027 {
1028         struct sock *sk = sock->sk;
1029
1030         union {
1031                 int val;
1032                 struct linger ling;
1033                 struct timeval tm;
1034         } v;
1035
1036         int lv = sizeof(int);
1037         int len;
1038
1039         if (get_user(len, optlen))
1040                 return -EFAULT;
1041         if (len < 0)
1042                 return -EINVAL;
1043
1044         memset(&v, 0, sizeof(v));
1045
1046         switch (optname) {
1047         case SO_DEBUG:
1048                 v.val = sock_flag(sk, SOCK_DBG);
1049                 break;
1050
1051         case SO_DONTROUTE:
1052                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1053                 break;
1054
1055         case SO_BROADCAST:
1056                 v.val = sock_flag(sk, SOCK_BROADCAST);
1057                 break;
1058
1059         case SO_SNDBUF:
1060                 v.val = sk->sk_sndbuf;
1061                 break;
1062
1063         case SO_RCVBUF:
1064                 v.val = sk->sk_rcvbuf;
1065                 break;
1066
1067         case SO_REUSEADDR:
1068                 v.val = sk->sk_reuse;
1069                 break;
1070
1071         case SO_REUSEPORT:
1072                 v.val = sk->sk_reuseport;
1073                 break;
1074
1075         case SO_KEEPALIVE:
1076                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1077                 break;
1078
1079         case SO_TYPE:
1080                 v.val = sk->sk_type;
1081                 break;
1082
1083         case SO_PROTOCOL:
1084                 v.val = sk->sk_protocol;
1085                 break;
1086
1087         case SO_DOMAIN:
1088                 v.val = sk->sk_family;
1089                 break;
1090
1091         case SO_ERROR:
1092                 v.val = -sock_error(sk);
1093                 if (v.val == 0)
1094                         v.val = xchg(&sk->sk_err_soft, 0);
1095                 break;
1096
1097         case SO_OOBINLINE:
1098                 v.val = sock_flag(sk, SOCK_URGINLINE);
1099                 break;
1100
1101         case SO_NO_CHECK:
1102                 v.val = sk->sk_no_check_tx;
1103                 break;
1104
1105         case SO_PRIORITY:
1106                 v.val = sk->sk_priority;
1107                 break;
1108
1109         case SO_LINGER:
1110                 lv              = sizeof(v.ling);
1111                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1112                 v.ling.l_linger = sk->sk_lingertime / HZ;
1113                 break;
1114
1115         case SO_BSDCOMPAT:
1116                 sock_warn_obsolete_bsdism("getsockopt");
1117                 break;
1118
1119         case SO_TIMESTAMP:
1120                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1121                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1122                 break;
1123
1124         case SO_TIMESTAMPNS:
1125                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1126                 break;
1127
1128         case SO_TIMESTAMPING:
1129                 v.val = sk->sk_tsflags;
1130                 break;
1131
1132         case SO_RCVTIMEO:
1133                 lv = sizeof(struct timeval);
1134                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1135                         v.tm.tv_sec = 0;
1136                         v.tm.tv_usec = 0;
1137                 } else {
1138                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1139                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1140                 }
1141                 break;
1142
1143         case SO_SNDTIMEO:
1144                 lv = sizeof(struct timeval);
1145                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1146                         v.tm.tv_sec = 0;
1147                         v.tm.tv_usec = 0;
1148                 } else {
1149                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1150                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1151                 }
1152                 break;
1153
1154         case SO_RCVLOWAT:
1155                 v.val = sk->sk_rcvlowat;
1156                 break;
1157
1158         case SO_SNDLOWAT:
1159                 v.val = 1;
1160                 break;
1161
1162         case SO_PASSCRED:
1163                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1164                 break;
1165
1166         case SO_PEERCRED:
1167         {
1168                 struct ucred peercred;
1169                 if (len > sizeof(peercred))
1170                         len = sizeof(peercred);
1171                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1172                 if (copy_to_user(optval, &peercred, len))
1173                         return -EFAULT;
1174                 goto lenout;
1175         }
1176
1177         case SO_PEERNAME:
1178         {
1179                 char address[128];
1180
1181                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1182                         return -ENOTCONN;
1183                 if (lv < len)
1184                         return -EINVAL;
1185                 if (copy_to_user(optval, address, len))
1186                         return -EFAULT;
1187                 goto lenout;
1188         }
1189
1190         /* Dubious BSD thing... Probably nobody even uses it, but
1191          * the UNIX standard wants it for whatever reason... -DaveM
1192          */
1193         case SO_ACCEPTCONN:
1194                 v.val = sk->sk_state == TCP_LISTEN;
1195                 break;
1196
1197         case SO_PASSSEC:
1198                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1199                 break;
1200
1201         case SO_PEERSEC:
1202                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1203
1204         case SO_MARK:
1205                 v.val = sk->sk_mark;
1206                 break;
1207
1208         case SO_RXQ_OVFL:
1209                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1210                 break;
1211
1212         case SO_WIFI_STATUS:
1213                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1214                 break;
1215
1216         case SO_PEEK_OFF:
1217                 if (!sock->ops->set_peek_off)
1218                         return -EOPNOTSUPP;
1219
1220                 v.val = sk->sk_peek_off;
1221                 break;
1222         case SO_NOFCS:
1223                 v.val = sock_flag(sk, SOCK_NOFCS);
1224                 break;
1225
1226         case SO_BINDTODEVICE:
1227                 return sock_getbindtodevice(sk, optval, optlen, len);
1228
1229         case SO_GET_FILTER:
1230                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1231                 if (len < 0)
1232                         return len;
1233
1234                 goto lenout;
1235
1236         case SO_LOCK_FILTER:
1237                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1238                 break;
1239
1240         case SO_BPF_EXTENSIONS:
1241                 v.val = bpf_tell_extensions();
1242                 break;
1243
1244         case SO_SELECT_ERR_QUEUE:
1245                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1246                 break;
1247
1248 #ifdef CONFIG_NET_RX_BUSY_POLL
1249         case SO_BUSY_POLL:
1250                 v.val = sk->sk_ll_usec;
1251                 break;
1252 #endif
1253
1254         case SO_MAX_PACING_RATE:
1255                 v.val = sk->sk_max_pacing_rate;
1256                 break;
1257
1258         case SO_INCOMING_CPU:
1259                 v.val = sk->sk_incoming_cpu;
1260                 break;
1261
1262         default:
1263                 /* We implement the SO_SNDLOWAT etc to not be settable
1264                  * (1003.1g 7).
1265                  */
1266                 return -ENOPROTOOPT;
1267         }
1268
1269         if (len > lv)
1270                 len = lv;
1271         if (copy_to_user(optval, &v, len))
1272                 return -EFAULT;
1273 lenout:
1274         if (put_user(len, optlen))
1275                 return -EFAULT;
1276         return 0;
1277 }
1278
1279 /*
1280  * Initialize an sk_lock.
1281  *
1282  * (We also register the sk_lock with the lock validator.)
1283  */
1284 static inline void sock_lock_init(struct sock *sk)
1285 {
1286         if (sk->sk_kern_sock)
1287                 sock_lock_init_class_and_name(
1288                         sk,
1289                         af_family_kern_slock_key_strings[sk->sk_family],
1290                         af_family_kern_slock_keys + sk->sk_family,
1291                         af_family_kern_key_strings[sk->sk_family],
1292                         af_family_kern_keys + sk->sk_family);
1293         else
1294                 sock_lock_init_class_and_name(
1295                         sk,
1296                         af_family_slock_key_strings[sk->sk_family],
1297                         af_family_slock_keys + sk->sk_family,
1298                         af_family_key_strings[sk->sk_family],
1299                         af_family_keys + sk->sk_family);
1300 }
1301
1302 /*
1303  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1304  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1305  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1306  */
1307 static void sock_copy(struct sock *nsk, const struct sock *osk)
1308 {
1309 #ifdef CONFIG_SECURITY_NETWORK
1310         void *sptr = nsk->sk_security;
1311 #endif
1312         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1313
1314         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1315                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1316
1317 #ifdef CONFIG_SECURITY_NETWORK
1318         nsk->sk_security = sptr;
1319         security_sk_clone(osk, nsk);
1320 #endif
1321 }
1322
1323 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1324                 int family)
1325 {
1326         struct sock *sk;
1327         struct kmem_cache *slab;
1328
1329         slab = prot->slab;
1330         if (slab != NULL) {
1331                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1332                 if (!sk)
1333                         return sk;
1334                 if (priority & __GFP_ZERO)
1335                         sk_prot_clear_nulls(sk, prot->obj_size);
1336         } else
1337                 sk = kmalloc(prot->obj_size, priority);
1338
1339         if (sk != NULL) {
1340                 kmemcheck_annotate_bitfield(sk, flags);
1341
1342                 if (security_sk_alloc(sk, family, priority))
1343                         goto out_free;
1344
1345                 if (!try_module_get(prot->owner))
1346                         goto out_free_sec;
1347                 sk_tx_queue_clear(sk);
1348         }
1349
1350         return sk;
1351
1352 out_free_sec:
1353         security_sk_free(sk);
1354 out_free:
1355         if (slab != NULL)
1356                 kmem_cache_free(slab, sk);
1357         else
1358                 kfree(sk);
1359         return NULL;
1360 }
1361
1362 static void sk_prot_free(struct proto *prot, struct sock *sk)
1363 {
1364         struct kmem_cache *slab;
1365         struct module *owner;
1366
1367         owner = prot->owner;
1368         slab = prot->slab;
1369
1370         cgroup_sk_free(&sk->sk_cgrp_data);
1371         mem_cgroup_sk_free(sk);
1372         security_sk_free(sk);
1373         if (slab != NULL)
1374                 kmem_cache_free(slab, sk);
1375         else
1376                 kfree(sk);
1377         module_put(owner);
1378 }
1379
1380 /**
1381  *      sk_alloc - All socket objects are allocated here
1382  *      @net: the applicable net namespace
1383  *      @family: protocol family
1384  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1385  *      @prot: struct proto associated with this new sock instance
1386  *      @kern: is this to be a kernel socket?
1387  */
1388 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1389                       struct proto *prot, int kern)
1390 {
1391         struct sock *sk;
1392
1393         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1394         if (sk) {
1395                 sk->sk_family = family;
1396                 /*
1397                  * See comment in struct sock definition to understand
1398                  * why we need sk_prot_creator -acme
1399                  */
1400                 sk->sk_prot = sk->sk_prot_creator = prot;
1401                 sk->sk_kern_sock = kern;
1402                 sock_lock_init(sk);
1403                 sk->sk_net_refcnt = kern ? 0 : 1;
1404                 if (likely(sk->sk_net_refcnt))
1405                         get_net(net);
1406                 sock_net_set(sk, net);
1407                 atomic_set(&sk->sk_wmem_alloc, 1);
1408
1409                 mem_cgroup_sk_alloc(sk);
1410                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1411                 sock_update_classid(&sk->sk_cgrp_data);
1412                 sock_update_netprioidx(&sk->sk_cgrp_data);
1413         }
1414
1415         return sk;
1416 }
1417 EXPORT_SYMBOL(sk_alloc);
1418
1419 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1420  * grace period. This is the case for UDP sockets and TCP listeners.
1421  */
1422 static void __sk_destruct(struct rcu_head *head)
1423 {
1424         struct sock *sk = container_of(head, struct sock, sk_rcu);
1425         struct sk_filter *filter;
1426
1427         if (sk->sk_destruct)
1428                 sk->sk_destruct(sk);
1429
1430         filter = rcu_dereference_check(sk->sk_filter,
1431                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1432         if (filter) {
1433                 sk_filter_uncharge(sk, filter);
1434                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1435         }
1436         if (rcu_access_pointer(sk->sk_reuseport_cb))
1437                 reuseport_detach_sock(sk);
1438
1439         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1440
1441         if (atomic_read(&sk->sk_omem_alloc))
1442                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1443                          __func__, atomic_read(&sk->sk_omem_alloc));
1444
1445         if (sk->sk_frag.page) {
1446                 put_page(sk->sk_frag.page);
1447                 sk->sk_frag.page = NULL;
1448         }
1449
1450         if (sk->sk_peer_cred)
1451                 put_cred(sk->sk_peer_cred);
1452         put_pid(sk->sk_peer_pid);
1453         if (likely(sk->sk_net_refcnt))
1454                 put_net(sock_net(sk));
1455         sk_prot_free(sk->sk_prot_creator, sk);
1456 }
1457
1458 void sk_destruct(struct sock *sk)
1459 {
1460         if (sock_flag(sk, SOCK_RCU_FREE))
1461                 call_rcu(&sk->sk_rcu, __sk_destruct);
1462         else
1463                 __sk_destruct(&sk->sk_rcu);
1464 }
1465
1466 static void __sk_free(struct sock *sk)
1467 {
1468         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1469                 sock_diag_broadcast_destroy(sk);
1470         else
1471                 sk_destruct(sk);
1472 }
1473
1474 void sk_free(struct sock *sk)
1475 {
1476         /*
1477          * We subtract one from sk_wmem_alloc and can know if
1478          * some packets are still in some tx queue.
1479          * If not null, sock_wfree() will call __sk_free(sk) later
1480          */
1481         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1482                 __sk_free(sk);
1483 }
1484 EXPORT_SYMBOL(sk_free);
1485
1486 /**
1487  *      sk_clone_lock - clone a socket, and lock its clone
1488  *      @sk: the socket to clone
1489  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1490  *
1491  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1492  */
1493 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1494 {
1495         struct sock *newsk;
1496         bool is_charged = true;
1497
1498         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1499         if (newsk != NULL) {
1500                 struct sk_filter *filter;
1501
1502                 sock_copy(newsk, sk);
1503
1504                 /* SANITY */
1505                 if (likely(newsk->sk_net_refcnt))
1506                         get_net(sock_net(newsk));
1507                 sk_node_init(&newsk->sk_node);
1508                 sock_lock_init(newsk);
1509                 bh_lock_sock(newsk);
1510                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1511                 newsk->sk_backlog.len = 0;
1512
1513                 atomic_set(&newsk->sk_rmem_alloc, 0);
1514                 /*
1515                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1516                  */
1517                 atomic_set(&newsk->sk_wmem_alloc, 1);
1518                 atomic_set(&newsk->sk_omem_alloc, 0);
1519                 skb_queue_head_init(&newsk->sk_receive_queue);
1520                 skb_queue_head_init(&newsk->sk_write_queue);
1521
1522                 rwlock_init(&newsk->sk_callback_lock);
1523                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1524                                 af_callback_keys + newsk->sk_family,
1525                                 af_family_clock_key_strings[newsk->sk_family]);
1526
1527                 newsk->sk_dst_cache     = NULL;
1528                 newsk->sk_dst_pending_confirm = 0;
1529                 newsk->sk_wmem_queued   = 0;
1530                 newsk->sk_forward_alloc = 0;
1531                 atomic_set(&newsk->sk_drops, 0);
1532                 newsk->sk_send_head     = NULL;
1533                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1534
1535                 sock_reset_flag(newsk, SOCK_DONE);
1536                 skb_queue_head_init(&newsk->sk_error_queue);
1537
1538                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1539                 if (filter != NULL)
1540                         /* though it's an empty new sock, the charging may fail
1541                          * if sysctl_optmem_max was changed between creation of
1542                          * original socket and cloning
1543                          */
1544                         is_charged = sk_filter_charge(newsk, filter);
1545
1546                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1547                         /* We need to make sure that we don't uncharge the new
1548                          * socket if we couldn't charge it in the first place
1549                          * as otherwise we uncharge the parent's filter.
1550                          */
1551                         if (!is_charged)
1552                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1553                         sk_free_unlock_clone(newsk);
1554                         newsk = NULL;
1555                         goto out;
1556                 }
1557                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1558
1559                 newsk->sk_err      = 0;
1560                 newsk->sk_err_soft = 0;
1561                 newsk->sk_priority = 0;
1562                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1563                 atomic64_set(&newsk->sk_cookie, 0);
1564
1565                 mem_cgroup_sk_alloc(newsk);
1566                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1567
1568                 /*
1569                  * Before updating sk_refcnt, we must commit prior changes to memory
1570                  * (Documentation/RCU/rculist_nulls.txt for details)
1571                  */
1572                 smp_wmb();
1573                 atomic_set(&newsk->sk_refcnt, 2);
1574
1575                 /*
1576                  * Increment the counter in the same struct proto as the master
1577                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1578                  * is the same as sk->sk_prot->socks, as this field was copied
1579                  * with memcpy).
1580                  *
1581                  * This _changes_ the previous behaviour, where
1582                  * tcp_create_openreq_child always was incrementing the
1583                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1584                  * to be taken into account in all callers. -acme
1585                  */
1586                 sk_refcnt_debug_inc(newsk);
1587                 sk_set_socket(newsk, NULL);
1588                 newsk->sk_wq = NULL;
1589
1590                 if (newsk->sk_prot->sockets_allocated)
1591                         sk_sockets_allocated_inc(newsk);
1592
1593                 if (sock_needs_netstamp(sk) &&
1594                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1595                         net_enable_timestamp();
1596         }
1597 out:
1598         return newsk;
1599 }
1600 EXPORT_SYMBOL_GPL(sk_clone_lock);
1601
1602 void sk_free_unlock_clone(struct sock *sk)
1603 {
1604         /* It is still raw copy of parent, so invalidate
1605          * destructor and make plain sk_free() */
1606         sk->sk_destruct = NULL;
1607         bh_unlock_sock(sk);
1608         sk_free(sk);
1609 }
1610 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1611
1612 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1613 {
1614         u32 max_segs = 1;
1615
1616         sk_dst_set(sk, dst);
1617         sk->sk_route_caps = dst->dev->features;
1618         if (sk->sk_route_caps & NETIF_F_GSO)
1619                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1620         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1621         if (sk_can_gso(sk)) {
1622                 if (dst->header_len) {
1623                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1624                 } else {
1625                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1626                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1627                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1628                 }
1629         }
1630         sk->sk_gso_max_segs = max_segs;
1631 }
1632 EXPORT_SYMBOL_GPL(sk_setup_caps);
1633
1634 /*
1635  *      Simple resource managers for sockets.
1636  */
1637
1638
1639 /*
1640  * Write buffer destructor automatically called from kfree_skb.
1641  */
1642 void sock_wfree(struct sk_buff *skb)
1643 {
1644         struct sock *sk = skb->sk;
1645         unsigned int len = skb->truesize;
1646
1647         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1648                 /*
1649                  * Keep a reference on sk_wmem_alloc, this will be released
1650                  * after sk_write_space() call
1651                  */
1652                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1653                 sk->sk_write_space(sk);
1654                 len = 1;
1655         }
1656         /*
1657          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1658          * could not do because of in-flight packets
1659          */
1660         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1661                 __sk_free(sk);
1662 }
1663 EXPORT_SYMBOL(sock_wfree);
1664
1665 /* This variant of sock_wfree() is used by TCP,
1666  * since it sets SOCK_USE_WRITE_QUEUE.
1667  */
1668 void __sock_wfree(struct sk_buff *skb)
1669 {
1670         struct sock *sk = skb->sk;
1671
1672         if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1673                 __sk_free(sk);
1674 }
1675
1676 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1677 {
1678         skb_orphan(skb);
1679         skb->sk = sk;
1680 #ifdef CONFIG_INET
1681         if (unlikely(!sk_fullsock(sk))) {
1682                 skb->destructor = sock_edemux;
1683                 sock_hold(sk);
1684                 return;
1685         }
1686 #endif
1687         skb->destructor = sock_wfree;
1688         skb_set_hash_from_sk(skb, sk);
1689         /*
1690          * We used to take a refcount on sk, but following operation
1691          * is enough to guarantee sk_free() wont free this sock until
1692          * all in-flight packets are completed
1693          */
1694         atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1695 }
1696 EXPORT_SYMBOL(skb_set_owner_w);
1697
1698 /* This helper is used by netem, as it can hold packets in its
1699  * delay queue. We want to allow the owner socket to send more
1700  * packets, as if they were already TX completed by a typical driver.
1701  * But we also want to keep skb->sk set because some packet schedulers
1702  * rely on it (sch_fq for example). So we set skb->truesize to a small
1703  * amount (1) and decrease sk_wmem_alloc accordingly.
1704  */
1705 void skb_orphan_partial(struct sk_buff *skb)
1706 {
1707         /* If this skb is a TCP pure ACK or already went here,
1708          * we have nothing to do. 2 is already a very small truesize.
1709          */
1710         if (skb->truesize <= 2)
1711                 return;
1712
1713         /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1714          * so we do not completely orphan skb, but transfert all
1715          * accounted bytes but one, to avoid unexpected reorders.
1716          */
1717         if (skb->destructor == sock_wfree
1718 #ifdef CONFIG_INET
1719             || skb->destructor == tcp_wfree
1720 #endif
1721                 ) {
1722                 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1723                 skb->truesize = 1;
1724         } else {
1725                 skb_orphan(skb);
1726         }
1727 }
1728 EXPORT_SYMBOL(skb_orphan_partial);
1729
1730 /*
1731  * Read buffer destructor automatically called from kfree_skb.
1732  */
1733 void sock_rfree(struct sk_buff *skb)
1734 {
1735         struct sock *sk = skb->sk;
1736         unsigned int len = skb->truesize;
1737
1738         atomic_sub(len, &sk->sk_rmem_alloc);
1739         sk_mem_uncharge(sk, len);
1740 }
1741 EXPORT_SYMBOL(sock_rfree);
1742
1743 /*
1744  * Buffer destructor for skbs that are not used directly in read or write
1745  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1746  */
1747 void sock_efree(struct sk_buff *skb)
1748 {
1749         sock_put(skb->sk);
1750 }
1751 EXPORT_SYMBOL(sock_efree);
1752
1753 kuid_t sock_i_uid(struct sock *sk)
1754 {
1755         kuid_t uid;
1756
1757         read_lock_bh(&sk->sk_callback_lock);
1758         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1759         read_unlock_bh(&sk->sk_callback_lock);
1760         return uid;
1761 }
1762 EXPORT_SYMBOL(sock_i_uid);
1763
1764 unsigned long sock_i_ino(struct sock *sk)
1765 {
1766         unsigned long ino;
1767
1768         read_lock_bh(&sk->sk_callback_lock);
1769         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1770         read_unlock_bh(&sk->sk_callback_lock);
1771         return ino;
1772 }
1773 EXPORT_SYMBOL(sock_i_ino);
1774
1775 /*
1776  * Allocate a skb from the socket's send buffer.
1777  */
1778 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1779                              gfp_t priority)
1780 {
1781         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1782                 struct sk_buff *skb = alloc_skb(size, priority);
1783                 if (skb) {
1784                         skb_set_owner_w(skb, sk);
1785                         return skb;
1786                 }
1787         }
1788         return NULL;
1789 }
1790 EXPORT_SYMBOL(sock_wmalloc);
1791
1792 /*
1793  * Allocate a memory block from the socket's option memory buffer.
1794  */
1795 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1796 {
1797         if ((unsigned int)size <= sysctl_optmem_max &&
1798             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1799                 void *mem;
1800                 /* First do the add, to avoid the race if kmalloc
1801                  * might sleep.
1802                  */
1803                 atomic_add(size, &sk->sk_omem_alloc);
1804                 mem = kmalloc(size, priority);
1805                 if (mem)
1806                         return mem;
1807                 atomic_sub(size, &sk->sk_omem_alloc);
1808         }
1809         return NULL;
1810 }
1811 EXPORT_SYMBOL(sock_kmalloc);
1812
1813 /* Free an option memory block. Note, we actually want the inline
1814  * here as this allows gcc to detect the nullify and fold away the
1815  * condition entirely.
1816  */
1817 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1818                                   const bool nullify)
1819 {
1820         if (WARN_ON_ONCE(!mem))
1821                 return;
1822         if (nullify)
1823                 kzfree(mem);
1824         else
1825                 kfree(mem);
1826         atomic_sub(size, &sk->sk_omem_alloc);
1827 }
1828
1829 void sock_kfree_s(struct sock *sk, void *mem, int size)
1830 {
1831         __sock_kfree_s(sk, mem, size, false);
1832 }
1833 EXPORT_SYMBOL(sock_kfree_s);
1834
1835 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1836 {
1837         __sock_kfree_s(sk, mem, size, true);
1838 }
1839 EXPORT_SYMBOL(sock_kzfree_s);
1840
1841 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1842    I think, these locks should be removed for datagram sockets.
1843  */
1844 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1845 {
1846         DEFINE_WAIT(wait);
1847
1848         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1849         for (;;) {
1850                 if (!timeo)
1851                         break;
1852                 if (signal_pending(current))
1853                         break;
1854                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1855                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1856                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1857                         break;
1858                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1859                         break;
1860                 if (sk->sk_err)
1861                         break;
1862                 timeo = schedule_timeout(timeo);
1863         }
1864         finish_wait(sk_sleep(sk), &wait);
1865         return timeo;
1866 }
1867
1868
1869 /*
1870  *      Generic send/receive buffer handlers
1871  */
1872
1873 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1874                                      unsigned long data_len, int noblock,
1875                                      int *errcode, int max_page_order)
1876 {
1877         struct sk_buff *skb;
1878         long timeo;
1879         int err;
1880
1881         timeo = sock_sndtimeo(sk, noblock);
1882         for (;;) {
1883                 err = sock_error(sk);
1884                 if (err != 0)
1885                         goto failure;
1886
1887                 err = -EPIPE;
1888                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1889                         goto failure;
1890
1891                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1892                         break;
1893
1894                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1895                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1896                 err = -EAGAIN;
1897                 if (!timeo)
1898                         goto failure;
1899                 if (signal_pending(current))
1900                         goto interrupted;
1901                 timeo = sock_wait_for_wmem(sk, timeo);
1902         }
1903         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1904                                    errcode, sk->sk_allocation);
1905         if (skb)
1906                 skb_set_owner_w(skb, sk);
1907         return skb;
1908
1909 interrupted:
1910         err = sock_intr_errno(timeo);
1911 failure:
1912         *errcode = err;
1913         return NULL;
1914 }
1915 EXPORT_SYMBOL(sock_alloc_send_pskb);
1916
/*
 * Convenience wrapper around sock_alloc_send_pskb(): @size is passed as
 * the header length, with a data length of 0 and a max page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
1923
1924 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1925                      struct sockcm_cookie *sockc)
1926 {
1927         u32 tsflags;
1928
1929         switch (cmsg->cmsg_type) {
1930         case SO_MARK:
1931                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1932                         return -EPERM;
1933                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1934                         return -EINVAL;
1935                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1936                 break;
1937         case SO_TIMESTAMPING:
1938                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1939                         return -EINVAL;
1940
1941                 tsflags = *(u32 *)CMSG_DATA(cmsg);
1942                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1943                         return -EINVAL;
1944
1945                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1946                 sockc->tsflags |= tsflags;
1947                 break;
1948         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1949         case SCM_RIGHTS:
1950         case SCM_CREDENTIALS:
1951                 break;
1952         default:
1953                 return -EINVAL;
1954         }
1955         return 0;
1956 }
1957 EXPORT_SYMBOL(__sock_cmsg_send);
1958
1959 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1960                    struct sockcm_cookie *sockc)
1961 {
1962         struct cmsghdr *cmsg;
1963         int ret;
1964
1965         for_each_cmsghdr(cmsg, msg) {
1966                 if (!CMSG_OK(msg, cmsg))
1967                         return -EINVAL;
1968                 if (cmsg->cmsg_level != SOL_SOCKET)
1969                         continue;
1970                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1971                 if (ret)
1972                         return ret;
1973         }
1974         return 0;
1975 }
1976 EXPORT_SYMBOL(sock_cmsg_send);
1977
1978 /* On 32bit arches, an skb frag is limited to 2^15 */
1979 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
1980
1981 /**
1982  * skb_page_frag_refill - check that a page_frag contains enough room
1983  * @sz: minimum size of the fragment we want to get
1984  * @pfrag: pointer to page_frag
1985  * @gfp: priority for memory allocation
1986  *
1987  * Note: While this allocator tries to use high order pages, there is
1988  * no guarantee that allocations succeed. Therefore, @sz MUST be
1989  * less or equal than PAGE_SIZE.
1990  */
1991 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1992 {
1993         if (pfrag->page) {
1994                 if (page_ref_count(pfrag->page) == 1) {
1995                         pfrag->offset = 0;
1996                         return true;
1997                 }
1998                 if (pfrag->offset + sz <= pfrag->size)
1999                         return true;
2000                 put_page(pfrag->page);
2001         }
2002
2003         pfrag->offset = 0;
2004         if (SKB_FRAG_PAGE_ORDER) {
2005                 /* Avoid direct reclaim but allow kswapd to wake */
2006                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2007                                           __GFP_COMP | __GFP_NOWARN |
2008                                           __GFP_NORETRY,
2009                                           SKB_FRAG_PAGE_ORDER);
2010                 if (likely(pfrag->page)) {
2011                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2012                         return true;
2013                 }
2014         }
2015         pfrag->page = alloc_page(gfp);
2016         if (likely(pfrag->page)) {
2017                 pfrag->size = PAGE_SIZE;
2018                 return true;
2019         }
2020         return false;
2021 }
2022 EXPORT_SYMBOL(skb_page_frag_refill);
2023
2024 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2025 {
2026         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2027                 return true;
2028
2029         sk_enter_memory_pressure(sk);
2030         sk_stream_moderate_sndbuf(sk);
2031         return false;
2032 }
2033 EXPORT_SYMBOL(sk_page_frag_refill);
2034
2035 static void __lock_sock(struct sock *sk)
2036         __releases(&sk->sk_lock.slock)
2037         __acquires(&sk->sk_lock.slock)
2038 {
2039         DEFINE_WAIT(wait);
2040
2041         for (;;) {
2042                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2043                                         TASK_UNINTERRUPTIBLE);
2044                 spin_unlock_bh(&sk->sk_lock.slock);
2045                 schedule();
2046                 spin_lock_bh(&sk->sk_lock.slock);
2047                 if (!sock_owned_by_user(sk))
2048                         break;
2049         }
2050         finish_wait(&sk->sk_lock.wq, &wait);
2051 }
2052
2053 static void __release_sock(struct sock *sk)
2054         __releases(&sk->sk_lock.slock)
2055         __acquires(&sk->sk_lock.slock)
2056 {
2057         struct sk_buff *skb, *next;
2058
2059         while ((skb = sk->sk_backlog.head) != NULL) {
2060                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2061
2062                 spin_unlock_bh(&sk->sk_lock.slock);
2063
2064                 do {
2065                         next = skb->next;
2066                         prefetch(next);
2067                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2068                         skb->next = NULL;
2069                         sk_backlog_rcv(sk, skb);
2070
2071                         cond_resched();
2072
2073                         skb = next;
2074                 } while (skb != NULL);
2075
2076                 spin_lock_bh(&sk->sk_lock.slock);
2077         }
2078
2079         /*
2080          * Doing the zeroing here guarantee we can not loop forever
2081          * while a wild producer attempts to flood us.
2082          */
2083         sk->sk_backlog.len = 0;
2084 }
2085
2086 void __sk_flush_backlog(struct sock *sk)
2087 {
2088         spin_lock_bh(&sk->sk_lock.slock);
2089         __release_sock(sk);
2090         spin_unlock_bh(&sk->sk_lock.slock);
2091 }
2092
2093 /**
2094  * sk_wait_data - wait for data to arrive at sk_receive_queue
2095  * @sk:    sock to wait on
2096  * @timeo: for how long
2097  * @skb:   last skb seen on sk_receive_queue
2098  *
2099  * Now socket state including sk->sk_err is changed only under lock,
2100  * hence we may omit checks after joining wait queue.
2101  * We check receive queue before schedule() only as optimization;
2102  * it is very likely that release_sock() added new data.
2103  */
2104 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2105 {
2106         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2107         int rc;
2108
2109         add_wait_queue(sk_sleep(sk), &wait);
2110         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2111         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2112         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2113         remove_wait_queue(sk_sleep(sk), &wait);
2114         return rc;
2115 }
2116 EXPORT_SYMBOL(sk_wait_data);
2117
2118 /**
2119  *      __sk_mem_raise_allocated - increase memory_allocated
2120  *      @sk: socket
2121  *      @size: memory size to allocate
2122  *      @amt: pages to allocate
2123  *      @kind: allocation type
2124  *
2125  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2126  */
2127 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2128 {
2129         struct proto *prot = sk->sk_prot;
2130         long allocated = sk_memory_allocated_add(sk, amt);
2131
2132         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2133             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2134                 goto suppress_allocation;
2135
2136         /* Under limit. */
2137         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2138                 sk_leave_memory_pressure(sk);
2139                 return 1;
2140         }
2141
2142         /* Under pressure. */
2143         if (allocated > sk_prot_mem_limits(sk, 1))
2144                 sk_enter_memory_pressure(sk);
2145
2146         /* Over hard limit. */
2147         if (allocated > sk_prot_mem_limits(sk, 2))
2148                 goto suppress_allocation;
2149
2150         /* guarantee minimum buffer size under pressure */
2151         if (kind == SK_MEM_RECV) {
2152                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2153                         return 1;
2154
2155         } else { /* SK_MEM_SEND */
2156                 if (sk->sk_type == SOCK_STREAM) {
2157                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2158                                 return 1;
2159                 } else if (atomic_read(&sk->sk_wmem_alloc) <
2160                            prot->sysctl_wmem[0])
2161                                 return 1;
2162         }
2163
2164         if (sk_has_memory_pressure(sk)) {
2165                 int alloc;
2166
2167                 if (!sk_under_memory_pressure(sk))
2168                         return 1;
2169                 alloc = sk_sockets_allocated_read_positive(sk);
2170                 if (sk_prot_mem_limits(sk, 2) > alloc *
2171                     sk_mem_pages(sk->sk_wmem_queued +
2172                                  atomic_read(&sk->sk_rmem_alloc) +
2173                                  sk->sk_forward_alloc))
2174                         return 1;
2175         }
2176
2177 suppress_allocation:
2178
2179         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2180                 sk_stream_moderate_sndbuf(sk);
2181
2182                 /* Fail only if socket is _under_ its sndbuf.
2183                  * In this case we cannot block, so that we have to fail.
2184                  */
2185                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2186                         return 1;
2187         }
2188
2189         trace_sock_exceed_buf_limit(sk, prot, allocated);
2190
2191         sk_memory_allocated_sub(sk, amt);
2192
2193         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2194                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2195
2196         return 0;
2197 }
2198 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2199
2200 /**
2201  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2202  *      @sk: socket
2203  *      @size: memory size to allocate
2204  *      @kind: allocation type
2205  *
2206  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2207  *      rmem allocation. This function assumes that protocols which have
2208  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2209  */
2210 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2211 {
2212         int ret, amt = sk_mem_pages(size);
2213
2214         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2215         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2216         if (!ret)
2217                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2218         return ret;
2219 }
2220 EXPORT_SYMBOL(__sk_mem_schedule);
2221
2222 /**
2223  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2224  *      @sk: socket
2225  *      @amount: number of quanta
2226  *
2227  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2228  */
2229 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2230 {
2231         sk_memory_allocated_sub(sk, amount);
2232
2233         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2234                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2235
2236         if (sk_under_memory_pressure(sk) &&
2237             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2238                 sk_leave_memory_pressure(sk);
2239 }
2240 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2241
2242 /**
2243  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2244  *      @sk: socket
2245  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2246  */
2247 void __sk_mem_reclaim(struct sock *sk, int amount)
2248 {
2249         amount >>= SK_MEM_QUANTUM_SHIFT;
2250         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2251         __sk_mem_reduce_allocated(sk, amount);
2252 }
2253 EXPORT_SYMBOL(__sk_mem_reclaim);
2254
2255 int sk_set_peek_off(struct sock *sk, int val)
2256 {
2257         if (val < 0)
2258                 return -EINVAL;
2259
2260         sk->sk_peek_off = val;
2261         return 0;
2262 }
2263 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2264
2265 /*
2266  * Set of default routines for initialising struct proto_ops when
2267  * the protocol does not support a particular function. In certain
2268  * cases where it makes no sense for a protocol to have a "do nothing"
2269  * function, some default processing is provided.
2270  */
2271
2272 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2273 {
2274         return -EOPNOTSUPP;
2275 }
2276 EXPORT_SYMBOL(sock_no_bind);
2277
2278 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2279                     int len, int flags)
2280 {
2281         return -EOPNOTSUPP;
2282 }
2283 EXPORT_SYMBOL(sock_no_connect);
2284
2285 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2286 {
2287         return -EOPNOTSUPP;
2288 }
2289 EXPORT_SYMBOL(sock_no_socketpair);
2290
2291 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2292                    bool kern)
2293 {
2294         return -EOPNOTSUPP;
2295 }
2296 EXPORT_SYMBOL(sock_no_accept);
2297
2298 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2299                     int *len, int peer)
2300 {
2301         return -EOPNOTSUPP;
2302 }
2303 EXPORT_SYMBOL(sock_no_getname);
2304
2305 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2306 {
2307         return 0;
2308 }
2309 EXPORT_SYMBOL(sock_no_poll);
2310
2311 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2312 {
2313         return -EOPNOTSUPP;
2314 }
2315 EXPORT_SYMBOL(sock_no_ioctl);
2316
2317 int sock_no_listen(struct socket *sock, int backlog)
2318 {
2319         return -EOPNOTSUPP;
2320 }
2321 EXPORT_SYMBOL(sock_no_listen);
2322
2323 int sock_no_shutdown(struct socket *sock, int how)
2324 {
2325         return -EOPNOTSUPP;
2326 }
2327 EXPORT_SYMBOL(sock_no_shutdown);
2328
2329 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2330                     char __user *optval, unsigned int optlen)
2331 {
2332         return -EOPNOTSUPP;
2333 }
2334 EXPORT_SYMBOL(sock_no_setsockopt);
2335
2336 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2337                     char __user *optval, int __user *optlen)
2338 {
2339         return -EOPNOTSUPP;
2340 }
2341 EXPORT_SYMBOL(sock_no_getsockopt);
2342
2343 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2344 {
2345         return -EOPNOTSUPP;
2346 }
2347 EXPORT_SYMBOL(sock_no_sendmsg);
2348
2349 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2350                     int flags)
2351 {
2352         return -EOPNOTSUPP;
2353 }
2354 EXPORT_SYMBOL(sock_no_recvmsg);
2355
2356 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2357 {
2358         /* Mirror missing mmap method error code */
2359         return -ENODEV;
2360 }
2361 EXPORT_SYMBOL(sock_no_mmap);
2362
2363 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2364 {
2365         ssize_t res;
2366         struct msghdr msg = {.msg_flags = flags};
2367         struct kvec iov;
2368         char *kaddr = kmap(page);
2369         iov.iov_base = kaddr + offset;
2370         iov.iov_len = size;
2371         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2372         kunmap(page);
2373         return res;
2374 }
2375 EXPORT_SYMBOL(sock_no_sendpage);
2376
2377 /*
2378  *      Default Socket Callbacks
2379  */
2380
2381 static void sock_def_wakeup(struct sock *sk)
2382 {
2383         struct socket_wq *wq;
2384
2385         rcu_read_lock();
2386         wq = rcu_dereference(sk->sk_wq);
2387         if (skwq_has_sleeper(wq))
2388                 wake_up_interruptible_all(&wq->wait);
2389         rcu_read_unlock();
2390 }
2391
2392 static void sock_def_error_report(struct sock *sk)
2393 {
2394         struct socket_wq *wq;
2395
2396         rcu_read_lock();
2397         wq = rcu_dereference(sk->sk_wq);
2398         if (skwq_has_sleeper(wq))
2399                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2400         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2401         rcu_read_unlock();
2402 }
2403
2404 static void sock_def_readable(struct sock *sk)
2405 {
2406         struct socket_wq *wq;
2407
2408         rcu_read_lock();
2409         wq = rcu_dereference(sk->sk_wq);
2410         if (skwq_has_sleeper(wq))
2411                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2412                                                 POLLRDNORM | POLLRDBAND);
2413         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2414         rcu_read_unlock();
2415 }
2416
2417 static void sock_def_write_space(struct sock *sk)
2418 {
2419         struct socket_wq *wq;
2420
2421         rcu_read_lock();
2422
2423         /* Do not wake up a writer until he can make "significant"
2424          * progress.  --DaveM
2425          */
2426         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2427                 wq = rcu_dereference(sk->sk_wq);
2428                 if (skwq_has_sleeper(wq))
2429                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2430                                                 POLLWRNORM | POLLWRBAND);
2431
2432                 /* Should agree with poll, otherwise some programs break */
2433                 if (sock_writeable(sk))
2434                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2435         }
2436
2437         rcu_read_unlock();
2438 }
2439
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
	/* nothing to tear down */
}
2443
2444 void sk_send_sigurg(struct sock *sk)
2445 {
2446         if (sk->sk_socket && sk->sk_socket->file)
2447                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2448                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2449 }
2450 EXPORT_SYMBOL(sk_send_sigurg);
2451
/*
 * (Re)arm @timer to fire at @expires. A socket reference is taken when
 * mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	int modified = mod_timer(timer, expires);

	if (modified)
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2459
/*
 * Delete @timer. A socket reference is dropped with __sock_put() when
 * del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	int deleted = del_timer(timer);

	if (!deleted)
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2466
2467 void sock_init_data(struct socket *sock, struct sock *sk)
2468 {
2469         skb_queue_head_init(&sk->sk_receive_queue);
2470         skb_queue_head_init(&sk->sk_write_queue);
2471         skb_queue_head_init(&sk->sk_error_queue);
2472
2473         sk->sk_send_head        =       NULL;
2474
2475         init_timer(&sk->sk_timer);
2476
2477         sk->sk_allocation       =       GFP_KERNEL;
2478         sk->sk_rcvbuf           =       sysctl_rmem_default;
2479         sk->sk_sndbuf           =       sysctl_wmem_default;
2480         sk->sk_state            =       TCP_CLOSE;
2481         sk_set_socket(sk, sock);
2482
2483         sock_set_flag(sk, SOCK_ZAPPED);
2484
2485         if (sock) {
2486                 sk->sk_type     =       sock->type;
2487                 sk->sk_wq       =       sock->wq;
2488                 sock->sk        =       sk;
2489                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2490         } else {
2491                 sk->sk_wq       =       NULL;
2492                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2493         }
2494
2495         rwlock_init(&sk->sk_callback_lock);
2496         if (sk->sk_kern_sock)
2497                 lockdep_set_class_and_name(
2498                         &sk->sk_callback_lock,
2499                         af_kern_callback_keys + sk->sk_family,
2500                         af_family_kern_clock_key_strings[sk->sk_family]);
2501         else
2502                 lockdep_set_class_and_name(
2503                         &sk->sk_callback_lock,
2504                         af_callback_keys + sk->sk_family,
2505                         af_family_clock_key_strings[sk->sk_family]);
2506
2507         sk->sk_state_change     =       sock_def_wakeup;
2508         sk->sk_data_ready       =       sock_def_readable;
2509         sk->sk_write_space      =       sock_def_write_space;
2510         sk->sk_error_report     =       sock_def_error_report;
2511         sk->sk_destruct         =       sock_def_destruct;
2512
2513         sk->sk_frag.page        =       NULL;
2514         sk->sk_frag.offset      =       0;
2515         sk->sk_peek_off         =       -1;
2516
2517         sk->sk_peer_pid         =       NULL;
2518         sk->sk_peer_cred        =       NULL;
2519         sk->sk_write_pending    =       0;
2520         sk->sk_rcvlowat         =       1;
2521         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2522         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2523
2524         sk->sk_stamp = ktime_set(-1L, 0);
2525
2526 #ifdef CONFIG_NET_RX_BUSY_POLL
2527         sk->sk_napi_id          =       0;
2528         sk->sk_ll_usec          =       sysctl_net_busy_read;
2529 #endif
2530
2531         sk->sk_max_pacing_rate = ~0U;
2532         sk->sk_pacing_rate = ~0U;
2533         sk->sk_incoming_cpu = -1;
2534         /*
2535          * Before updating sk_refcnt, we must commit prior changes to memory
2536          * (Documentation/RCU/rculist_nulls.txt for details)
2537          */
2538         smp_wmb();
2539         atomic_set(&sk->sk_refcnt, 1);
2540         atomic_set(&sk->sk_drops, 0);
2541 }
2542 EXPORT_SYMBOL(sock_init_data);
2543
2544 void lock_sock_nested(struct sock *sk, int subclass)
2545 {
2546         might_sleep();
2547         spin_lock_bh(&sk->sk_lock.slock);
2548         if (sk->sk_lock.owned)
2549                 __lock_sock(sk);
2550         sk->sk_lock.owned = 1;
2551         spin_unlock(&sk->sk_lock.slock);
2552         /*
2553          * The sk_lock has mutex_lock() semantics here:
2554          */
2555         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2556         local_bh_enable();
2557 }
2558 EXPORT_SYMBOL(lock_sock_nested);
2559
2560 void release_sock(struct sock *sk)
2561 {
2562         spin_lock_bh(&sk->sk_lock.slock);
2563         if (sk->sk_backlog.tail)
2564                 __release_sock(sk);
2565
2566         /* Warning : release_cb() might need to release sk ownership,
2567          * ie call sock_release_ownership(sk) before us.
2568          */
2569         if (sk->sk_prot->release_cb)
2570                 sk->sk_prot->release_cb(sk);
2571
2572         sock_release_ownership(sk);
2573         if (waitqueue_active(&sk->sk_lock.wq))
2574                 wake_up(&sk->sk_lock.wq);
2575         spin_unlock_bh(&sk->sk_lock.slock);
2576 }
2577 EXPORT_SYMBOL(release_sock);
2578
2579 /**
2580  * lock_sock_fast - fast version of lock_sock
2581  * @sk: socket
2582  *
2583  * This version should be used for very small section, where process wont block
2584  * return false if fast path is taken
2585  *   sk_lock.slock locked, owned = 0, BH disabled
2586  * return true if slow path is taken
2587  *   sk_lock.slock unlocked, owned = 1, BH enabled
2588  */
2589 bool lock_sock_fast(struct sock *sk)
2590 {
2591         might_sleep();
2592         spin_lock_bh(&sk->sk_lock.slock);
2593
2594         if (!sk->sk_lock.owned)
2595                 /*
2596                  * Note : We must disable BH
2597                  */
2598                 return false;
2599
2600         __lock_sock(sk);
2601         sk->sk_lock.owned = 1;
2602         spin_unlock(&sk->sk_lock.slock);
2603         /*
2604          * The sk_lock has mutex_lock() semantics here:
2605          */
2606         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2607         local_bh_enable();
2608         return true;
2609 }
2610 EXPORT_SYMBOL(lock_sock_fast);
2611
2612 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2613 {
2614         struct timeval tv;
2615         if (!sock_flag(sk, SOCK_TIMESTAMP))
2616                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2617         tv = ktime_to_timeval(sk->sk_stamp);
2618         if (tv.tv_sec == -1)
2619                 return -ENOENT;
2620         if (tv.tv_sec == 0) {
2621                 sk->sk_stamp = ktime_get_real();
2622                 tv = ktime_to_timeval(sk->sk_stamp);
2623         }
2624         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2625 }
2626 EXPORT_SYMBOL(sock_get_timestamp);
2627
2628 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2629 {
2630         struct timespec ts;
2631         if (!sock_flag(sk, SOCK_TIMESTAMP))
2632                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2633         ts = ktime_to_timespec(sk->sk_stamp);
2634         if (ts.tv_sec == -1)
2635                 return -ENOENT;
2636         if (ts.tv_sec == 0) {
2637                 sk->sk_stamp = ktime_get_real();
2638                 ts = ktime_to_timespec(sk->sk_stamp);
2639         }
2640         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2641 }
2642 EXPORT_SYMBOL(sock_get_timestampns);
2643
2644 void sock_enable_timestamp(struct sock *sk, int flag)
2645 {
2646         if (!sock_flag(sk, flag)) {
2647                 unsigned long previous_flags = sk->sk_flags;
2648
2649                 sock_set_flag(sk, flag);
2650                 /*
2651                  * we just set one of the two flags which require net
2652                  * time stamping, but time stamping might have been on
2653                  * already because of the other one
2654                  */
2655                 if (sock_needs_netstamp(sk) &&
2656                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2657                         net_enable_timestamp();
2658         }
2659 }
2660
2661 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2662                        int level, int type)
2663 {
2664         struct sock_exterr_skb *serr;
2665         struct sk_buff *skb;
2666         int copied, err;
2667
2668         err = -EAGAIN;
2669         skb = sock_dequeue_err_skb(sk);
2670         if (skb == NULL)
2671                 goto out;
2672
2673         copied = skb->len;
2674         if (copied > len) {
2675                 msg->msg_flags |= MSG_TRUNC;
2676                 copied = len;
2677         }
2678         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2679         if (err)
2680                 goto out_free_skb;
2681
2682         sock_recv_timestamp(msg, sk, skb);
2683
2684         serr = SKB_EXT_ERR(skb);
2685         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2686
2687         msg->msg_flags |= MSG_ERRQUEUE;
2688         err = copied;
2689
2690 out_free_skb:
2691         kfree_skb(skb);
2692 out:
2693         return err;
2694 }
2695 EXPORT_SYMBOL(sock_recv_errqueue);
2696
2697 /*
2698  *      Get a socket option on an socket.
2699  *
2700  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2701  *      asynchronous errors should be reported by getsockopt. We assume
2702  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2703  */
2704 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2705                            char __user *optval, int __user *optlen)
2706 {
2707         struct sock *sk = sock->sk;
2708
2709         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2710 }
2711 EXPORT_SYMBOL(sock_common_getsockopt);
2712
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of sock_common_getsockopt(): use the protocol's
 * compat_getsockopt handler when it provides one, otherwise fall back to
 * its regular getsockopt handler.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = sk->sk_prot;

	if (!prot->compat_getsockopt)
		return prot->getsockopt(sk, level, optname, optval, optlen);

	return prot->compat_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
2726
2727 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2728                         int flags)
2729 {
2730         struct sock *sk = sock->sk;
2731         int addr_len = 0;
2732         int err;
2733
2734         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2735                                    flags & ~MSG_DONTWAIT, &addr_len);
2736         if (err >= 0)
2737                 msg->msg_namelen = addr_len;
2738         return err;
2739 }
2740 EXPORT_SYMBOL(sock_common_recvmsg);
2741
2742 /*
2743  *      Set socket options on an inet socket.
2744  */
2745 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2746                            char __user *optval, unsigned int optlen)
2747 {
2748         struct sock *sk = sock->sk;
2749
2750         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2751 }
2752 EXPORT_SYMBOL(sock_common_setsockopt);
2753
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of sock_common_setsockopt(): use the protocol's
 * compat_setsockopt handler when it provides one, otherwise fall back to
 * its regular setsockopt handler.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = sk->sk_prot;

	if (!prot->compat_setsockopt)
		return prot->setsockopt(sk, level, optname, optval, optlen);

	return prot->compat_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
2767
2768 void sk_common_release(struct sock *sk)
2769 {
2770         if (sk->sk_prot->destroy)
2771                 sk->sk_prot->destroy(sk);
2772
2773         /*
2774          * Observation: when sock_common_release is called, processes have
2775          * no access to socket. But net still has.
2776          * Step one, detach it from networking:
2777          *
2778          * A. Remove from hash tables.
2779          */
2780
2781         sk->sk_prot->unhash(sk);
2782
2783         /*
2784          * In this point socket cannot receive new packets, but it is possible
2785          * that some packets are in flight because some CPU runs receiver and
2786          * did hash table lookup before we unhashed socket. They will achieve
2787          * receive queue and will be purged by socket destructor.
2788          *
2789          * Also we still have packets pending on receive queue and probably,
2790          * our own packets waiting in device queues. sock_destroy will drain
2791          * receive queue, but transmitted packets will delay socket destruction
2792          * until the last reference will be released.
2793          */
2794
2795         sock_orphan(sk);
2796
2797         xfrm_sk_free_policy(sk);
2798
2799         sk_refcnt_debug_release(sk);
2800
2801         sock_put(sk);
2802 }
2803 EXPORT_SYMBOL(sk_common_release);
2804
2805 #ifdef CONFIG_PROC_FS
2806 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2807 struct prot_inuse {
2808         int val[PROTO_INUSE_NR];
2809 };
2810
2811 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2812
2813 #ifdef CONFIG_NET_NS
2814 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2815 {
2816         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2817 }
2818 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2819
2820 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2821 {
2822         int cpu, idx = prot->inuse_idx;
2823         int res = 0;
2824
2825         for_each_possible_cpu(cpu)
2826                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2827
2828         return res >= 0 ? res : 0;
2829 }
2830 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2831
2832 static int __net_init sock_inuse_init_net(struct net *net)
2833 {
2834         net->core.inuse = alloc_percpu(struct prot_inuse);
2835         return net->core.inuse ? 0 : -ENOMEM;
2836 }
2837
2838 static void __net_exit sock_inuse_exit_net(struct net *net)
2839 {
2840         free_percpu(net->core.inuse);
2841 }
2842
2843 static struct pernet_operations net_inuse_ops = {
2844         .init = sock_inuse_init_net,
2845         .exit = sock_inuse_exit_net,
2846 };
2847
2848 static __init int net_inuse_init(void)
2849 {
2850         if (register_pernet_subsys(&net_inuse_ops))
2851                 panic("Cannot initialize net inuse counters");
2852
2853         return 0;
2854 }
2855
2856 core_initcall(net_inuse_init);
2857 #else
2858 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2859
2860 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2861 {
2862         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2863 }
2864 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2865
2866 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2867 {
2868         int cpu, idx = prot->inuse_idx;
2869         int res = 0;
2870
2871         for_each_possible_cpu(cpu)
2872                 res += per_cpu(prot_inuse, cpu).val[idx];
2873
2874         return res >= 0 ? res : 0;
2875 }
2876 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2877 #endif
2878
2879 static void assign_proto_idx(struct proto *prot)
2880 {
2881         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2882
2883         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2884                 pr_err("PROTO_INUSE_NR exhausted\n");
2885                 return;
2886         }
2887
2888         set_bit(prot->inuse_idx, proto_inuse_idx);
2889 }
2890
2891 static void release_proto_idx(struct proto *prot)
2892 {
2893         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2894                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2895 }
2896 #else
/* Without CONFIG_PROC_FS there are no in-use counters: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}

/* Without CONFIG_PROC_FS there are no in-use counters: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
2904 #endif
2905
2906 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2907 {
2908         if (!rsk_prot)
2909                 return;
2910         kfree(rsk_prot->slab_name);
2911         rsk_prot->slab_name = NULL;
2912         kmem_cache_destroy(rsk_prot->slab);
2913         rsk_prot->slab = NULL;
2914 }
2915
2916 static int req_prot_init(const struct proto *prot)
2917 {
2918         struct request_sock_ops *rsk_prot = prot->rsk_prot;
2919
2920         if (!rsk_prot)
2921                 return 0;
2922
2923         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2924                                         prot->name);
2925         if (!rsk_prot->slab_name)
2926                 return -ENOMEM;
2927
2928         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2929                                            rsk_prot->obj_size, 0,
2930                                            prot->slab_flags, NULL);
2931
2932         if (!rsk_prot->slab) {
2933                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2934                         prot->name);
2935                 return -ENOMEM;
2936         }
2937         return 0;
2938 }
2939
2940 int proto_register(struct proto *prot, int alloc_slab)
2941 {
2942         if (alloc_slab) {
2943                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2944                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2945                                         NULL);
2946
2947                 if (prot->slab == NULL) {
2948                         pr_crit("%s: Can't create sock SLAB cache!\n",
2949                                 prot->name);
2950                         goto out;
2951                 }
2952
2953                 if (req_prot_init(prot))
2954                         goto out_free_request_sock_slab;
2955
2956                 if (prot->twsk_prot != NULL) {
2957                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2958
2959                         if (prot->twsk_prot->twsk_slab_name == NULL)
2960                                 goto out_free_request_sock_slab;
2961
2962                         prot->twsk_prot->twsk_slab =
2963                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2964                                                   prot->twsk_prot->twsk_obj_size,
2965                                                   0,
2966                                                   prot->slab_flags,
2967                                                   NULL);
2968                         if (prot->twsk_prot->twsk_slab == NULL)
2969                                 goto out_free_timewait_sock_slab_name;
2970                 }
2971         }
2972
2973         mutex_lock(&proto_list_mutex);
2974         list_add(&prot->node, &proto_list);
2975         assign_proto_idx(prot);
2976         mutex_unlock(&proto_list_mutex);
2977         return 0;
2978
2979 out_free_timewait_sock_slab_name:
2980         kfree(prot->twsk_prot->twsk_slab_name);
2981 out_free_request_sock_slab:
2982         req_prot_cleanup(prot->rsk_prot);
2983
2984         kmem_cache_destroy(prot->slab);
2985         prot->slab = NULL;
2986 out:
2987         return -ENOBUFS;
2988 }
2989 EXPORT_SYMBOL(proto_register);
2990
2991 void proto_unregister(struct proto *prot)
2992 {
2993         mutex_lock(&proto_list_mutex);
2994         release_proto_idx(prot);
2995         list_del(&prot->node);
2996         mutex_unlock(&proto_list_mutex);
2997
2998         kmem_cache_destroy(prot->slab);
2999         prot->slab = NULL;
3000
3001         req_prot_cleanup(prot->rsk_prot);
3002
3003         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3004                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3005                 kfree(prot->twsk_prot->twsk_slab_name);
3006                 prot->twsk_prot->twsk_slab = NULL;
3007         }
3008 }
3009 EXPORT_SYMBOL(proto_unregister);
3010
3011 #ifdef CONFIG_PROC_FS
3012 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3013         __acquires(proto_list_mutex)
3014 {
3015         mutex_lock(&proto_list_mutex);
3016         return seq_list_start_head(&proto_list, *pos);
3017 }
3018
3019 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3020 {
3021         return seq_list_next(v, &proto_list, pos);
3022 }
3023
3024 static void proto_seq_stop(struct seq_file *seq, void *v)
3025         __releases(proto_list_mutex)
3026 {
3027         mutex_unlock(&proto_list_mutex);
3028 }
3029
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
3034 static long sock_prot_memory_allocated(struct proto *proto)
3035 {
3036         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3037 }
3038
3039 static char *sock_prot_memory_pressure(struct proto *proto)
3040 {
3041         return proto->memory_pressure != NULL ?
3042         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3043 }
3044
3045 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3046 {
3047
3048         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3049                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3050                    proto->name,
3051                    proto->obj_size,
3052                    sock_prot_inuse_get(seq_file_net(seq), proto),
3053                    sock_prot_memory_allocated(proto),
3054                    sock_prot_memory_pressure(proto),
3055                    proto->max_header,
3056                    proto->slab == NULL ? "no" : "yes",
3057                    module_name(proto->owner),
3058                    proto_method_implemented(proto->close),
3059                    proto_method_implemented(proto->connect),
3060                    proto_method_implemented(proto->disconnect),
3061                    proto_method_implemented(proto->accept),
3062                    proto_method_implemented(proto->ioctl),
3063                    proto_method_implemented(proto->init),
3064                    proto_method_implemented(proto->destroy),
3065                    proto_method_implemented(proto->shutdown),
3066                    proto_method_implemented(proto->setsockopt),
3067                    proto_method_implemented(proto->getsockopt),
3068                    proto_method_implemented(proto->sendmsg),
3069                    proto_method_implemented(proto->recvmsg),
3070                    proto_method_implemented(proto->sendpage),
3071                    proto_method_implemented(proto->bind),
3072                    proto_method_implemented(proto->backlog_rcv),
3073                    proto_method_implemented(proto->hash),
3074                    proto_method_implemented(proto->unhash),
3075                    proto_method_implemented(proto->get_port),
3076                    proto_method_implemented(proto->enter_memory_pressure));
3077 }
3078
3079 static int proto_seq_show(struct seq_file *seq, void *v)
3080 {
3081         if (v == &proto_list)
3082                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3083                            "protocol",
3084                            "size",
3085                            "sockets",
3086                            "memory",
3087                            "press",
3088                            "maxhdr",
3089                            "slab",
3090                            "module",
3091                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3092         else
3093                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3094         return 0;
3095 }
3096
3097 static const struct seq_operations proto_seq_ops = {
3098         .start  = proto_seq_start,
3099         .next   = proto_seq_next,
3100         .stop   = proto_seq_stop,
3101         .show   = proto_seq_show,
3102 };
3103
3104 static int proto_seq_open(struct inode *inode, struct file *file)
3105 {
3106         return seq_open_net(inode, file, &proto_seq_ops,
3107                             sizeof(struct seq_net_private));
3108 }
3109
3110 static const struct file_operations proto_seq_fops = {
3111         .owner          = THIS_MODULE,
3112         .open           = proto_seq_open,
3113         .read           = seq_read,
3114         .llseek         = seq_lseek,
3115         .release        = seq_release_net,
3116 };
3117
3118 static __net_init int proto_init_net(struct net *net)
3119 {
3120         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3121                 return -ENOMEM;
3122
3123         return 0;
3124 }
3125
3126 static __net_exit void proto_exit_net(struct net *net)
3127 {
3128         remove_proc_entry("protocols", net->proc_net);
3129 }
3130
3131
3132 static __net_initdata struct pernet_operations proto_net_ops = {
3133         .init = proto_init_net,
3134         .exit = proto_exit_net,
3135 };
3136
3137 static int __init proto_init(void)
3138 {
3139         return register_pernet_subsys(&proto_net_ops);
3140 }
3141
3142 subsys_initcall(proto_init);
3143
3144 #endif /* PROC_FS */