net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <[email protected]>
  12  *              Florian La Roche, <[email protected]>
  13  *              Alan Cox, <[email protected]>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/sched/mm.h>
 106 #include <linux/timer.h>
 107 #include <linux/string.h>
 108 #include <linux/sockios.h>
 109 #include <linux/net.h>
 110 #include <linux/mm.h>
 111 #include <linux/slab.h>
 112 #include <linux/interrupt.h>
 113 #include <linux/poll.h>
 114 #include <linux/tcp.h>
 115 #include <linux/init.h>
 116 #include <linux/highmem.h>
 117 #include <linux/user_namespace.h>
 118 #include <linux/static_key.h>
 119 #include <linux/memcontrol.h>
 120 #include <linux/prefetch.h>
 121
 122 #include <linux/uaccess.h>
 123
 124 #include <linux/netdevice.h>
 125 #include <net/protocol.h>
 126 #include <linux/skbuff.h>
 127 #include <net/net_namespace.h>
 128 #include <net/request_sock.h>
 129 #include <net/sock.h>
 130 #include <linux/net_tstamp.h>
 131 #include <net/xfrm.h>
 132 #include <linux/ipsec.h>
 133 #include <net/cls_cgroup.h>
 134 #include <net/netprio_cgroup.h>
 135 #include <linux/sock_diag.h>
 136
 137 #include <linux/filter.h>
 138 #include <net/sock_reuseport.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144
 145 static DEFINE_MUTEX(proto_list_mutex);
 146 static LIST_HEAD(proto_list);
 147
 148 /**
 149  * sk_ns_capable - General socket capability test
 150  * @sk: Socket to use a capability on or through
 151  * @user_ns: The user namespace of the capability to use
 152  * @cap: The capability to use
 153  *
 154  * Test to see if the opener of the socket had when the socket was
 155  * created and the current process has the capability @cap in the user
 156  * namespace @user_ns.
 157  */
 158 bool sk_ns_capable(const struct sock *sk,
 159                    struct user_namespace *user_ns, int cap)
 160 {
 161         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162                 ns_capable(user_ns, cap);
 163 }
 164 EXPORT_SYMBOL(sk_ns_capable);
 165
 166 /**
 167  * sk_capable - Socket global capability test
 168  * @sk: Socket to use a capability on or through
 169  * @cap: The global capability to use
 170  *
 171  * Test to see if the opener of the socket had when the socket was
 172  * created and the current process has the capability @cap in all user
 173  * namespaces.
 174  */
 175 bool sk_capable(const struct sock *sk, int cap)
 176 {
 177         return sk_ns_capable(sk, &init_user_ns, cap);
 178 }
 179 EXPORT_SYMBOL(sk_capable);
 180
 181 /**
 182  * sk_net_capable - Network namespace socket capability test
 183  * @sk: Socket to use a capability on or through
 184  * @cap: The capability to use
 185  *
 186  * Test to see if the opener of the socket had when the socket was created
 187  * and the current process has the capability @cap over the network namespace
 188  * the socket is a member of.
 189  */
 190 bool sk_net_capable(const struct sock *sk, int cap)
 191 {
 192         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193 }
 194 EXPORT_SYMBOL(sk_net_capable);
 195
 196 /*
 197  * Each address family might have different locking rules, so we have
 198  * one slock key per address family and separate keys for internal and
 199  * userspace sockets.
 200  */
 201 static struct lock_class_key af_family_keys[AF_MAX];
 202 static struct lock_class_key af_family_kern_keys[AF_MAX];
 203 static struct lock_class_key af_family_slock_keys[AF_MAX];
 204 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 205
 206 /*
 207  * Make lock validator output more readable. (we pre-construct these
 208  * strings build-time, so that runtime initialization of socket
 209  * locks is fast):
 210  */
 211
 212 #define _sock_locks(x)                                            \
 213   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 214   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 215   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 216   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 217   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 218   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 219   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 220   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 221   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 222   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 223   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 224   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 225   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 226   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 227   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_MAX"
 228
 229 static const char *const af_family_key_strings[AF_MAX+1] = {
 230         _sock_locks("sk_lock-")
 231 };
 232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233         _sock_locks("slock-")
 234 };
 235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236         _sock_locks("clock-")
 237 };
 238
 239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240         _sock_locks("k-sk_lock-")
 241 };
 242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243         _sock_locks("k-slock-")
 244 };
 245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246         _sock_locks("k-clock-")
 247 };
 248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 250   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 251   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 252   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 253   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 254   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 255   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 256   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 257   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 258   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 259   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 260   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 261   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 262   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 263   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 264 };
 265 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 266   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 267   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 268   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 269   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 270   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 271   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 272   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 273   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 274   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 275   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 276   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 277   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 278   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 279   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 280   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 281 };
 282 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 283   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 284   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 285   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 286   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 287   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 288   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 289   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 290   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 291   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 292   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 293   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 294   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 295   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 296   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 297   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 298 };
 299
 300 /*
 301  * sk_callback_lock and sk queues locking rules are per-address-family,
 302  * so split the lock classes by using a per-AF key:
 303  */
 304 static struct lock_class_key af_callback_keys[AF_MAX];
 305 static struct lock_class_key af_rlock_keys[AF_MAX];
 306 static struct lock_class_key af_wlock_keys[AF_MAX];
 307 static struct lock_class_key af_elock_keys[AF_MAX];
 308 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 309
 310 /* Take into consideration the size of the struct sk_buff overhead in the
 311  * determination of these values, since that is non-constant across
 312  * platforms.  This makes socket queueing behavior and performance
 313  * not depend upon such differences.
 314  */
 315 #define _SK_MEM_PACKETS         256
 316 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 317 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 318 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 319
 320 /* Run time adjustable parameters. */
 321 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 322 EXPORT_SYMBOL(sysctl_wmem_max);
 323 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 324 EXPORT_SYMBOL(sysctl_rmem_max);
 325 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 326 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 327
 328 /* Maximal space eaten by iovec or ancillary data plus some space */
 329 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 330 EXPORT_SYMBOL(sysctl_optmem_max);
 331
 332 int sysctl_tstamp_allow_data __read_mostly = 1;
 333
 334 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 335 EXPORT_SYMBOL_GPL(memalloc_socks);
 336
 337 /**
 338  * sk_set_memalloc - sets %SOCK_MEMALLOC
 339  * @sk: socket to set it on
 340  *
 341  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 342  * It's the responsibility of the admin to adjust min_free_kbytes
 343  * to meet the requirements
 344  */
 345 void sk_set_memalloc(struct sock *sk)
 346 {
 347         sock_set_flag(sk, SOCK_MEMALLOC);
 348         sk->sk_allocation |= __GFP_MEMALLOC;
 349         static_key_slow_inc(&memalloc_socks);
 350 }
 351 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 352
 353 void sk_clear_memalloc(struct sock *sk)
 354 {
 355         sock_reset_flag(sk, SOCK_MEMALLOC);
 356         sk->sk_allocation &= ~__GFP_MEMALLOC;
 357         static_key_slow_dec(&memalloc_socks);
 358
 359         /*
 360          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 361          * progress of swapping. SOCK_MEMALLOC may be cleared while
 362          * it has rmem allocations due to the last swapfile being deactivated
 363          * but there is a risk that the socket is unusable due to exceeding
 364          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 365          */
 366         sk_mem_reclaim(sk);
 367 }
 368 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 369
 370 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 371 {
 372         int ret;
 373         unsigned int noreclaim_flag;
 374
 375         /* these should have been dropped before queueing */
 376         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 377
 378         noreclaim_flag = memalloc_noreclaim_save();
 379         ret = sk->sk_backlog_rcv(sk, skb);
 380         memalloc_noreclaim_restore(noreclaim_flag);
 381
 382         return ret;
 383 }
 384 EXPORT_SYMBOL(__sk_backlog_rcv);
 385
 386 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 387 {
 388         struct timeval tv;
 389
 390         if (optlen < sizeof(tv))
 391                 return -EINVAL;
 392         if (copy_from_user(&tv, optval, sizeof(tv)))
 393                 return -EFAULT;
 394         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 395                 return -EDOM;
 396
 397         if (tv.tv_sec < 0) {
 398                 static int warned __read_mostly;
 399
 400                 *timeo_p = 0;
 401                 if (warned < 10 && net_ratelimit()) {
 402                         warned++;
 403                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 404                                 __func__, current->comm, task_pid_nr(current));
 405                 }
 406                 return 0;
 407         }
 408         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 409         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 410                 return 0;
 411         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 412                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 413         return 0;
 414 }
 415
 416 static void sock_warn_obsolete_bsdism(const char *name)
 417 {
 418         static int warned;
 419         static char warncomm[TASK_COMM_LEN];
 420         if (strcmp(warncomm, current->comm) && warned < 5) {
 421                 strcpy(warncomm,  current->comm);
 422                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 423                         warncomm, name);
 424                 warned++;
 425         }
 426 }
 427
 428 static bool sock_needs_netstamp(const struct sock *sk)
 429 {
 430         switch (sk->sk_family) {
 431         case AF_UNSPEC:
 432         case AF_UNIX:
 433                 return false;
 434         default:
 435                 return true;
 436         }
 437 }
 438
 439 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 440 {
 441         if (sk->sk_flags & flags) {
 442                 sk->sk_flags &= ~flags;
 443                 if (sock_needs_netstamp(sk) &&
 444                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 445                         net_disable_timestamp();
 446         }
 447 }
 448
 449
 450 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 451 {
 452         unsigned long flags;
 453         struct sk_buff_head *list = &sk->sk_receive_queue;
 454
 455         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 456                 atomic_inc(&sk->sk_drops);
 457                 trace_sock_rcvqueue_full(sk, skb);
 458                 return -ENOMEM;
 459         }
 460
 461         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 462                 atomic_inc(&sk->sk_drops);
 463                 return -ENOBUFS;
 464         }
 465
 466         skb->dev = NULL;
 467         skb_set_owner_r(skb, sk);
 468
 469         /* we escape from rcu protected region, make sure we dont leak
 470          * a norefcounted dst
 471          */
 472         skb_dst_force(skb);
 473
 474         spin_lock_irqsave(&list->lock, flags);
 475         sock_skb_set_dropcount(sk, skb);
 476         __skb_queue_tail(list, skb);
 477         spin_unlock_irqrestore(&list->lock, flags);
 478
 479         if (!sock_flag(sk, SOCK_DEAD))
 480                 sk->sk_data_ready(sk);
 481         return 0;
 482 }
 483 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 484
 485 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 486 {
 487         int err;
 488
 489         err = sk_filter(sk, skb);
 490         if (err)
 491                 return err;
 492
 493         return __sock_queue_rcv_skb(sk, skb);
 494 }
 495 EXPORT_SYMBOL(sock_queue_rcv_skb);
 496
 497 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 498                      const int nested, unsigned int trim_cap, bool refcounted)
 499 {
 500         int rc = NET_RX_SUCCESS;
 501
 502         if (sk_filter_trim_cap(sk, skb, trim_cap))
 503                 goto discard_and_relse;
 504
 505         skb->dev = NULL;
 506
 507         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 508                 atomic_inc(&sk->sk_drops);
 509                 goto discard_and_relse;
 510         }
 511         if (nested)
 512                 bh_lock_sock_nested(sk);
 513         else
 514                 bh_lock_sock(sk);
 515         if (!sock_owned_by_user(sk)) {
 516                 /*
 517                  * trylock + unlock semantics:
 518                  */
 519                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 520
 521                 rc = sk_backlog_rcv(sk, skb);
 522
 523                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 524         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 525                 bh_unlock_sock(sk);
 526                 atomic_inc(&sk->sk_drops);
 527                 goto discard_and_relse;
 528         }
 529
 530         bh_unlock_sock(sk);
 531 out:
 532         if (refcounted)
 533                 sock_put(sk);
 534         return rc;
 535 discard_and_relse:
 536         kfree_skb(skb);
 537         goto out;
 538 }
 539 EXPORT_SYMBOL(__sk_receive_skb);
 540
 541 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 542 {
 543         struct dst_entry *dst = __sk_dst_get(sk);
 544
 545         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 546                 sk_tx_queue_clear(sk);
 547                 sk->sk_dst_pending_confirm = 0;
 548                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 549                 dst_release(dst);
 550                 return NULL;
 551         }
 552
 553         return dst;
 554 }
 555 EXPORT_SYMBOL(__sk_dst_check);
 556
 557 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 558 {
 559         struct dst_entry *dst = sk_dst_get(sk);
 560
 561         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 562                 sk_dst_reset(sk);
 563                 dst_release(dst);
 564                 return NULL;
 565         }
 566
 567         return dst;
 568 }
 569 EXPORT_SYMBOL(sk_dst_check);
 570
 571 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 572                                 int optlen)
 573 {
 574         int ret = -ENOPROTOOPT;
 575 #ifdef CONFIG_NETDEVICES
 576         struct net *net = sock_net(sk);
 577         char devname[IFNAMSIZ];
 578         int index;
 579
 580         /* Sorry... */
 581         ret = -EPERM;
 582         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 583                 goto out;
 584
 585         ret = -EINVAL;
 586         if (optlen < 0)
 587                 goto out;
 588
 589         /* Bind this socket to a particular device like "eth0",
 590          * as specified in the passed interface name. If the
 591          * name is "" or the option length is zero the socket
 592          * is not bound.
 593          */
 594         if (optlen > IFNAMSIZ - 1)
 595                 optlen = IFNAMSIZ - 1;
 596         memset(devname, 0, sizeof(devname));
 597
 598         ret = -EFAULT;
 599         if (copy_from_user(devname, optval, optlen))
 600                 goto out;
 601
 602         index = 0;
 603         if (devname[0] != '\0') {
 604                 struct net_device *dev;
 605
 606                 rcu_read_lock();
 607                 dev = dev_get_by_name_rcu(net, devname);
 608                 if (dev)
 609                         index = dev->ifindex;
 610                 rcu_read_unlock();
 611                 ret = -ENODEV;
 612                 if (!dev)
 613                         goto out;
 614         }
 615
 616         lock_sock(sk);
 617         sk->sk_bound_dev_if = index;
 618         sk_dst_reset(sk);
 619         release_sock(sk);
 620
 621         ret = 0;
 622
 623 out:
 624 #endif
 625
 626         return ret;
 627 }
 628
 629 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 630                                 int __user *optlen, int len)
 631 {
 632         int ret = -ENOPROTOOPT;
 633 #ifdef CONFIG_NETDEVICES
 634         struct net *net = sock_net(sk);
 635         char devname[IFNAMSIZ];
 636
 637         if (sk->sk_bound_dev_if == 0) {
 638                 len = 0;
 639                 goto zero;
 640         }
 641
 642         ret = -EINVAL;
 643         if (len < IFNAMSIZ)
 644                 goto out;
 645
 646         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 647         if (ret)
 648                 goto out;
 649
 650         len = strlen(devname) + 1;
 651
 652         ret = -EFAULT;
 653         if (copy_to_user(optval, devname, len))
 654                 goto out;
 655
 656 zero:
 657         ret = -EFAULT;
 658         if (put_user(len, optlen))
 659                 goto out;
 660
 661         ret = 0;
 662
 663 out:
 664 #endif
 665
 666         return ret;
 667 }
 668
 669 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 670 {
 671         if (valbool)
 672                 sock_set_flag(sk, bit);
 673         else
 674                 sock_reset_flag(sk, bit);
 675 }
 676
 677 bool sk_mc_loop(struct sock *sk)
 678 {
 679         if (dev_recursion_level())
 680                 return false;
 681         if (!sk)
 682                 return true;
 683         switch (sk->sk_family) {
 684         case AF_INET:
 685                 return inet_sk(sk)->mc_loop;
 686 #if IS_ENABLED(CONFIG_IPV6)
 687         case AF_INET6:
 688                 return inet6_sk(sk)->mc_loop;
 689 #endif
 690         }
 691         WARN_ON(1);
 692         return true;
 693 }
 694 EXPORT_SYMBOL(sk_mc_loop);
 695
 696 /*
 697  *      This is meant for all protocols to use and covers goings on
 698  *      at the socket level. Everything here is generic.
 699  */
 700
 701 int sock_setsockopt(struct socket *sock, int level, int optname,
 702                     char __user *optval, unsigned int optlen)
 703 {
 704         struct sock *sk = sock->sk;
 705         int val;
 706         int valbool;
 707         struct linger ling;
 708         int ret = 0;
 709
 710         /*
 711          *      Options without arguments
 712          */
 713
 714         if (optname == SO_BINDTODEVICE)
 715                 return sock_setbindtodevice(sk, optval, optlen);
 716
 717         if (optlen < sizeof(int))
 718                 return -EINVAL;
 719
 720         if (get_user(val, (int __user *)optval))
 721                 return -EFAULT;
 722
 723         valbool = val ? 1 : 0;
 724
 725         lock_sock(sk);
 726
 727         switch (optname) {
 728         case SO_DEBUG:
 729                 if (val && !capable(CAP_NET_ADMIN))
 730                         ret = -EACCES;
 731                 else
 732                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 733                 break;
 734         case SO_REUSEADDR:
 735                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 736                 break;
 737         case SO_REUSEPORT:
 738                 sk->sk_reuseport = valbool;
 739                 break;
 740         case SO_TYPE:
 741         case SO_PROTOCOL:
 742         case SO_DOMAIN:
 743         case SO_ERROR:
 744                 ret = -ENOPROTOOPT;
 745                 break;
 746         case SO_DONTROUTE:
 747                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 748                 break;
 749         case SO_BROADCAST:
 750                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 751                 break;
 752         case SO_SNDBUF:
 753                 /* Don't error on this BSD doesn't and if you think
 754                  * about it this is right. Otherwise apps have to
 755                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 756                  * are treated in BSD as hints
 757                  */
 758                 val = min_t(u32, val, sysctl_wmem_max);
 759 set_sndbuf:
 760                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 761                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 762                 /* Wake up sending tasks if we upped the value. */
 763                 sk->sk_write_space(sk);
 764                 break;
 765
 766         case SO_SNDBUFFORCE:
 767                 if (!capable(CAP_NET_ADMIN)) {
 768                         ret = -EPERM;
 769                         break;
 770                 }
 771                 goto set_sndbuf;
 772
 773         case SO_RCVBUF:
 774                 /* Don't error on this BSD doesn't and if you think
 775                  * about it this is right. Otherwise apps have to
 776                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 777                  * are treated in BSD as hints
 778                  */
 779                 val = min_t(u32, val, sysctl_rmem_max);
 780 set_rcvbuf:
 781                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 782                 /*
 783                  * We double it on the way in to account for
 784                  * "struct sk_buff" etc. overhead.   Applications
 785                  * assume that the SO_RCVBUF setting they make will
 786                  * allow that much actual data to be received on that
 787                  * socket.
 788                  *
 789                  * Applications are unaware that "struct sk_buff" and
 790                  * other overheads allocate from the receive buffer
 791                  * during socket buffer allocation.
 792                  *
 793                  * And after considering the possible alternatives,
 794                  * returning the value we actually used in getsockopt
 795                  * is the most desirable behavior.
 796                  */
 797                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 798                 break;
 799
 800         case SO_RCVBUFFORCE:
 801                 if (!capable(CAP_NET_ADMIN)) {
 802                         ret = -EPERM;
 803                         break;
 804                 }
 805                 goto set_rcvbuf;
 806
 807         case SO_KEEPALIVE:
 808                 if (sk->sk_prot->keepalive)
 809                         sk->sk_prot->keepalive(sk, valbool);
 810                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 811                 break;
 812
 813         case SO_OOBINLINE:
 814                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 815                 break;
 816
 817         case SO_NO_CHECK:
 818                 sk->sk_no_check_tx = valbool;
 819                 break;
 820
 821         case SO_PRIORITY:
 822                 if ((val >= 0 && val <= 6) ||
 823                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 824                         sk->sk_priority = val;
 825                 else
 826                         ret = -EPERM;
 827                 break;
 828
 829         case SO_LINGER:
 830                 if (optlen < sizeof(ling)) {
 831                         ret = -EINVAL;  /* 1003.1g */
 832                         break;
 833                 }
 834                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 835                         ret = -EFAULT;
 836                         break;
 837                 }
 838                 if (!ling.l_onoff)
 839                         sock_reset_flag(sk, SOCK_LINGER);
 840                 else {
 841 #if (BITS_PER_LONG == 32)
 842                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 843                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 844                         else
 845 #endif
 846                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 847                         sock_set_flag(sk, SOCK_LINGER);
 848                 }
 849                 break;
 850
 851         case SO_BSDCOMPAT:
 852                 sock_warn_obsolete_bsdism("setsockopt");
 853                 break;
 854
 855         case SO_PASSCRED:
 856                 if (valbool)
 857                         set_bit(SOCK_PASSCRED, &sock->flags);
 858                 else
 859                         clear_bit(SOCK_PASSCRED, &sock->flags);
 860                 break;
 861
 862         case SO_TIMESTAMP:
 863         case SO_TIMESTAMPNS:
 864                 if (valbool)  {
 865                         if (optname == SO_TIMESTAMP)
 866                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 867                         else
 868                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 869                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 870                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 871                 } else {
 872                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 873                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 874                 }
 875                 break;
 876
 877         case SO_TIMESTAMPING:
 878                 if (val & ~SOF_TIMESTAMPING_MASK) {
 879                         ret = -EINVAL;
 880                         break;
 881                 }
 882
 883                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 884                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 885                         if (sk->sk_protocol == IPPROTO_TCP &&
 886                             sk->sk_type == SOCK_STREAM) {
 887                                 if ((1 << sk->sk_state) &
 888                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 889                                         ret = -EINVAL;
 890                                         break;
 891                                 }
 892                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 893                         } else {
 894                                 sk->sk_tskey = 0;
 895                         }
 896                 }
 897
 898                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 899                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 900                         ret = -EINVAL;
 901                         break;
 902                 }
 903
 904                 sk->sk_tsflags = val;
 905                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 906                         sock_enable_timestamp(sk,
 907                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 908                 else
 909                         sock_disable_timestamp(sk,
 910                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 911                 break;
 912
 913         case SO_RCVLOWAT:
 914                 if (val < 0)
 915                         val = INT_MAX;
 916                 sk->sk_rcvlowat = val ? : 1;
 917                 break;
 918
 919         case SO_RCVTIMEO:
 920                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 921                 break;
 922
 923         case SO_SNDTIMEO:
 924                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 925                 break;
 926
 927         case SO_ATTACH_FILTER:
 928                 ret = -EINVAL;
 929                 if (optlen == sizeof(struct sock_fprog)) {
 930                         struct sock_fprog fprog;
 931
 932                         ret = -EFAULT;
 933                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 934                                 break;
 935
 936                         ret = sk_attach_filter(&fprog, sk);
 937                 }
 938                 break;
 939
 940         case SO_ATTACH_BPF:
 941                 ret = -EINVAL;
 942                 if (optlen == sizeof(u32)) {
 943                         u32 ufd;
 944
 945                         ret = -EFAULT;
 946                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 947                                 break;
 948
 949                         ret = sk_attach_bpf(ufd, sk);
 950                 }
 951                 break;
 952
 953         case SO_ATTACH_REUSEPORT_CBPF:
 954                 ret = -EINVAL;
 955                 if (optlen == sizeof(struct sock_fprog)) {
 956                         struct sock_fprog fprog;
 957
 958                         ret = -EFAULT;
 959                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 960                                 break;
 961
 962                         ret = sk_reuseport_attach_filter(&fprog, sk);
 963                 }
 964                 break;
 965
 966         case SO_ATTACH_REUSEPORT_EBPF:
 967                 ret = -EINVAL;
 968                 if (optlen == sizeof(u32)) {
 969                         u32 ufd;
 970
 971                         ret = -EFAULT;
 972                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 973                                 break;
 974
 975                         ret = sk_reuseport_attach_bpf(ufd, sk);
 976                 }
 977                 break;
 978
 979         case SO_DETACH_FILTER:
 980                 ret = sk_detach_filter(sk);
 981                 break;
 982
 983         case SO_LOCK_FILTER:
 984                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 985                         ret = -EPERM;
 986                 else
 987                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 988                 break;
 989
 990         case SO_PASSSEC:
 991                 if (valbool)
 992                         set_bit(SOCK_PASSSEC, &sock->flags);
 993                 else
 994                         clear_bit(SOCK_PASSSEC, &sock->flags);
 995                 break;
 996         case SO_MARK:
 997                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 998                         ret = -EPERM;
 999                 else
1000                         sk->sk_mark = val;
1001                 break;
1002
1003         case SO_RXQ_OVFL:
1004                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1005                 break;
1006
1007         case SO_WIFI_STATUS:
1008                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1009                 break;
1010
1011         case SO_PEEK_OFF:
1012                 if (sock->ops->set_peek_off)
1013                         ret = sock->ops->set_peek_off(sk, val);
1014                 else
1015                         ret = -EOPNOTSUPP;
1016                 break;
1017
1018         case SO_NOFCS:
1019                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1020                 break;
1021
1022         case SO_SELECT_ERR_QUEUE:
1023                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1024                 break;
1025
1026 #ifdef CONFIG_NET_RX_BUSY_POLL
1027         case SO_BUSY_POLL:
1028                 /* allow unprivileged users to decrease the value */
1029                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1030                         ret = -EPERM;
1031                 else {
1032                         if (val < 0)
1033                                 ret = -EINVAL;
1034                         else
1035                                 sk->sk_ll_usec = val;
1036                 }
1037                 break;
1038 #endif
1039
1040         case SO_MAX_PACING_RATE:
1041                 if (val != ~0U)
1042                         cmpxchg(&sk->sk_pacing_status,
1043                                 SK_PACING_NONE,
1044                                 SK_PACING_NEEDED);
1045                 sk->sk_max_pacing_rate = val;
1046                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1047                                          sk->sk_max_pacing_rate);
1048                 break;
1049
1050         case SO_INCOMING_CPU:
1051                 sk->sk_incoming_cpu = val;
1052                 break;
1053
1054         case SO_CNX_ADVICE:
1055                 if (val == 1)
1056                         dst_negative_advice(sk);
1057                 break;
1058
1059         case SO_ZEROCOPY:
1060                 if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
1061                         ret = -ENOTSUPP;
1062                 else if (sk->sk_protocol != IPPROTO_TCP)
1063                         ret = -ENOTSUPP;
1064                 else if (sk->sk_state != TCP_CLOSE)
1065                         ret = -EBUSY;
1066                 else if (val < 0 || val > 1)
1067                         ret = -EINVAL;
1068                 else
1069                         sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1070                 break;
1071
1072         default:
1073                 ret = -ENOPROTOOPT;
1074                 break;
1075         }
1076         release_sock(sk);
1077         return ret;
1078 }
1079 EXPORT_SYMBOL(sock_setsockopt);
1080
1081 /*
      * cred_to_ucred - fill a struct ucred from a pid and a credential set
      * @pid:   pid to report; converted with pid_vnr()
      * @cred:  credentials to report, may be NULL
      * @ucred: structure to fill in
      *
      * uid and gid are first set to -1 and are only overwritten when @cred is
      * non-NULL.  In that case the effective uid/gid of @cred are translated
      * with the *_munged helpers into the namespace returned by
      * current_user_ns().
      */
1082 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1083                           struct ucred *ucred)
1084 {
1085         ucred->pid = pid_vnr(pid);
1086         ucred->uid = ucred->gid = -1;
1087         if (cred) {
1088                 struct user_namespace *current_ns = current_user_ns();
1089
1090                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1091                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1092         }
1093 }
1094 /*
      * groups_to_user - copy a group list to a user-space gid_t array
      * @dst: user-space destination array
      * @src: group_info providing src->ngroups entries in src->gid[]
      *
      * Every entry is translated with from_kgid_munged() against
      * current_user_ns() and stored with put_user().  The size of @dst is not
      * checked here; the SO_PEERGROUPS case of sock_getsockopt() validates the
      * user buffer length before calling.
      *
      * Returns 0 on success, or -EFAULT as soon as one put_user() fails
      * (entries written before the failure stay written).
      */
1095 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1096 {
1097         struct user_namespace *user_ns = current_user_ns();
1098         int i;
1099
1100         for (i = 0; i < src->ngroups; i++)
1101                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1102                         return -EFAULT;
1103
1104         return 0;
1105 }
1106 /*
      * sock_getsockopt - read one of the socket options handled in this file
      * @sock:    socket being queried
      * @level:   not examined by this function
      * @optname: SO_* option selector
      * @optval:  user buffer receiving the value
      * @optlen:  user pointer; holds the buffer size on entry and is rewritten
      *           with the number of bytes stored on success
      *
      * Most cases store their result in the zeroed union 'v' and fall out of
      * the switch into the common tail, which copies min(len, lv) bytes of 'v'
      * to @optval.  'lv' is the natural size of the value: sizeof(int) unless
      * the case overrides it.  Cases whose data does not live in 'v' copy to
      * user space themselves and jump to lenout, or return directly.
      *
      * Returns 0 on success; -EFAULT on a failed user access; -EINVAL for a
      * negative length or a length unsuitable for the option; -ENOPROTOOPT
      * for an option not listed here; or an option-specific error (-ENODATA,
      * -ERANGE, -ENOTCONN, -EOPNOTSUPP, or whatever a delegated helper
      * returns).
      */
1107 int sock_getsockopt(struct socket *sock, int level, int optname,
1108                     char __user *optval, int __user *optlen)
1109 {
1110         struct sock *sk = sock->sk;
1111
1112         union {
1113                 int val;
1114                 u64 val64;
1115                 struct linger ling;
1116                 struct timeval tm;
1117         } v;
1118
1119         int lv = sizeof(int);
1120         int len;
1121
1122         if (get_user(len, optlen))
1123                 return -EFAULT;
1124         if (len < 0)
1125                 return -EINVAL;
1126
             /* Clear the union first: the tail copies up to lv bytes of it out. */
1127         memset(&v, 0, sizeof(v));
1128
1129         switch (optname) {
1130         case SO_DEBUG:
1131                 v.val = sock_flag(sk, SOCK_DBG);
1132                 break;
1133
1134         case SO_DONTROUTE:
1135                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1136                 break;
1137
1138         case SO_BROADCAST:
1139                 v.val = sock_flag(sk, SOCK_BROADCAST);
1140                 break;
1141
1142         case SO_SNDBUF:
1143                 v.val = sk->sk_sndbuf;
1144                 break;
1145
1146         case SO_RCVBUF:
1147                 v.val = sk->sk_rcvbuf;
1148                 break;
1149
1150         case SO_REUSEADDR:
1151                 v.val = sk->sk_reuse;
1152                 break;
1153
1154         case SO_REUSEPORT:
1155                 v.val = sk->sk_reuseport;
1156                 break;
1157
1158         case SO_KEEPALIVE:
1159                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1160                 break;
1161
1162         case SO_TYPE:
1163                 v.val = sk->sk_type;
1164                 break;
1165
1166         case SO_PROTOCOL:
1167                 v.val = sk->sk_protocol;
1168                 break;
1169
1170         case SO_DOMAIN:
1171                 v.val = sk->sk_family;
1172                 break;
1173
1174         case SO_ERROR:
                     /*
                      * Report the negated result of sock_error(); when that
                      * is zero, swap sk_err_soft with 0 and report its old
                      * value instead.
                      */
1175                 v.val = -sock_error(sk);
1176                 if (v.val == 0)
1177                         v.val = xchg(&sk->sk_err_soft, 0);
1178                 break;
1179
1180         case SO_OOBINLINE:
1181                 v.val = sock_flag(sk, SOCK_URGINLINE);
1182                 break;
1183
1184         case SO_NO_CHECK:
1185                 v.val = sk->sk_no_check_tx;
1186                 break;
1187
1188         case SO_PRIORITY:
1189                 v.val = sk->sk_priority;
1190                 break;
1191
1192         case SO_LINGER:
1193                 lv              = sizeof(v.ling);
1194                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1195                 v.ling.l_linger = sk->sk_lingertime / HZ;
1196                 break;
1197
1198         case SO_BSDCOMPAT:
                     /* Only warns; v.val keeps the 0 left by the memset. */
1199                 sock_warn_obsolete_bsdism("getsockopt");
1200                 break;
1201
1202         case SO_TIMESTAMP:
                     /* 1 only when RCVTSTAMP is set and RCVTSTAMPNS is clear. */
1203                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1204                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1205                 break;
1206
1207         case SO_TIMESTAMPNS:
1208                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1209                 break;
1210
1211         case SO_TIMESTAMPING:
1212                 v.val = sk->sk_tsflags;
1213                 break;
1214
1215         case SO_RCVTIMEO:
                     /*
                      * MAX_SCHEDULE_TIMEOUT is reported as an all-zero
                      * timeval; any other value is split into seconds and
                      * microseconds by dividing by HZ.
                      */
1216                 lv = sizeof(struct timeval);
1217                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1218                         v.tm.tv_sec = 0;
1219                         v.tm.tv_usec = 0;
1220                 } else {
1221                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1222                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1223                 }
1224                 break;
1225
1226         case SO_SNDTIMEO:
                     /* Same conversion as SO_RCVTIMEO, applied to sk_sndtimeo. */
1227                 lv = sizeof(struct timeval);
1228                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1229                         v.tm.tv_sec = 0;
1230                         v.tm.tv_usec = 0;
1231                 } else {
1232                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1233                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1234                 }
1235                 break;
1236
1237         case SO_RCVLOWAT:
1238                 v.val = sk->sk_rcvlowat;
1239                 break;
1240
1241         case SO_SNDLOWAT:
                     /* Always reported as the constant 1. */
1242                 v.val = 1;
1243                 break;
1244
1245         case SO_PASSCRED:
1246                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1247                 break;
1248
1249         case SO_PEERCRED:
1250         {
                     /* Copies at most sizeof(struct ucred) bytes, then lenout. */
1251                 struct ucred peercred;
1252                 if (len > sizeof(peercred))
1253                         len = sizeof(peercred);
1254                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1255                 if (copy_to_user(optval, &peercred, len))
1256                         return -EFAULT;
1257                 goto lenout;
1258         }
1259
1260         case SO_PEERGROUPS:
1261         {
1262                 int ret, n;
1263
1264                 if (!sk->sk_peer_cred)
1265                         return -ENODATA;
1266
                     /*
                      * The buffer must hold the whole group list.  If it is
                      * too small, store the required size in *optlen and
                      * fail with -ERANGE without copying any gid.
                      */
1267                 n = sk->sk_peer_cred->group_info->ngroups;
1268                 if (len < n * sizeof(gid_t)) {
1269                         len = n * sizeof(gid_t);
1270                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1271                 }
1272                 len = n * sizeof(gid_t);
1273
1274                 ret = groups_to_user((gid_t __user *)optval,
1275                                      sk->sk_peer_cred->group_info);
1276                 if (ret)
1277                         return ret;
1278                 goto lenout;
1279         }
1280
1281         case SO_PEERNAME:
1282         {
1283                 char address[128];
1284
                     /*
                      * ->getname() fills 'address' and stores the address
                      * length in lv; any failure is reported as -ENOTCONN.
                      * A request for more than lv bytes is rejected.
                      * NOTE(review): the literal 2 passed as the last
                      * argument presumably selects the peer address --
                      * confirm against the ->getname() implementations.
                      */
1285                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1286                         return -ENOTCONN;
1287                 if (lv < len)
1288                         return -EINVAL;
1289                 if (copy_to_user(optval, address, len))
1290                         return -EFAULT;
1291                 goto lenout;
1292         }
1293
1294         /* Dubious BSD thing... Probably nobody even uses it, but
1295          * the UNIX standard wants it for whatever reason... -DaveM
1296          */
1297         case SO_ACCEPTCONN:
1298                 v.val = sk->sk_state == TCP_LISTEN;
1299                 break;
1300
1301         case SO_PASSSEC:
1302                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1303                 break;
1304
1305         case SO_PEERSEC:
                     /* Fully delegated, including the update of *optlen. */
1306                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1307
1308         case SO_MARK:
1309                 v.val = sk->sk_mark;
1310                 break;
1311
1312         case SO_RXQ_OVFL:
1313                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1314                 break;
1315
1316         case SO_WIFI_STATUS:
1317                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1318                 break;
1319
1320         case SO_PEEK_OFF:
                     /* Only readable when the protocol provides ->set_peek_off. */
1321                 if (!sock->ops->set_peek_off)
1322                         return -EOPNOTSUPP;
1323
1324                 v.val = sk->sk_peek_off;
1325                 break;
1326         case SO_NOFCS:
1327                 v.val = sock_flag(sk, SOCK_NOFCS);
1328                 break;
1329
1330         case SO_BINDTODEVICE:
1331                 return sock_getbindtodevice(sk, optval, optlen, len);
1332
1333         case SO_GET_FILTER:
                     /*
                      * sk_get_filter() writes to @optval itself; a negative
                      * result is returned as the error, anything else becomes
                      * the length reported through lenout.
                      */
1334                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1335                 if (len < 0)
1336                         return len;
1337
1338                 goto lenout;
1339
1340         case SO_LOCK_FILTER:
1341                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1342                 break;
1343
1344         case SO_BPF_EXTENSIONS:
1345                 v.val = bpf_tell_extensions();
1346                 break;
1347
1348         case SO_SELECT_ERR_QUEUE:
1349                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1350                 break;
1351
1352 #ifdef CONFIG_NET_RX_BUSY_POLL
1353         case SO_BUSY_POLL:
1354                 v.val = sk->sk_ll_usec;
1355                 break;
1356 #endif
1357
1358         case SO_MAX_PACING_RATE:
1359                 v.val = sk->sk_max_pacing_rate;
1360                 break;
1361
1362         case SO_INCOMING_CPU:
1363                 v.val = sk->sk_incoming_cpu;
1364                 break;
1365
1366         case SO_MEMINFO:
1367         {
1368                 u32 meminfo[SK_MEMINFO_VARS];
1369
                     /*
                      * NOTE(review): len is fetched from user space a second
                      * time here, so the "len < 0" check at function entry
                      * does not cover this value.  The unsigned min_t() below
                      * still bounds the copy by sizeof(meminfo).
                      */
1370                 if (get_user(len, optlen))
1371                         return -EFAULT;
1372
1373                 sk_get_meminfo(sk, meminfo);
1374
1375                 len = min_t(unsigned int, len, sizeof(meminfo));
1376                 if (copy_to_user(optval, &meminfo, len))
1377                         return -EFAULT;
1378
1379                 goto lenout;
1380         }
1381
1382 #ifdef CONFIG_NET_RX_BUSY_POLL
1383         case SO_INCOMING_NAPI_ID:
1384                 v.val = READ_ONCE(sk->sk_napi_id);
1385
1386                 /* aggregate non-NAPI IDs down to 0 */
1387                 if (v.val < MIN_NAPI_ID)
1388                         v.val = 0;
1389
1390                 break;
1391 #endif
1392
1393         case SO_COOKIE:
                     /* Unlike the int options, a short buffer is an error here. */
1394                 lv = sizeof(u64);
1395                 if (len < lv)
1396                         return -EINVAL;
1397                 v.val64 = sock_gen_cookie(sk);
1398                 break;
1399
1400         case SO_ZEROCOPY:
1401                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1402                 break;
1403
1404         default:
1405                 /* We implement the SO_SNDLOWAT etc to not be settable
1406                  * (1003.1g 7).
1407                  */
1408                 return -ENOPROTOOPT;
1409         }
1410
             /* Common tail: never copy more than the value's own size. */
1411         if (len > lv)
1412                 len = lv;
1413         if (copy_to_user(optval, &v, len))
1414                 return -EFAULT;
1415 lenout:
             /* Tell the caller how many bytes were actually stored. */
1416         if (put_user(len, optlen))
1417                 return -EFAULT;
1418         return 0;
1419 }
1420
1421 /*
1422  * Initialize an sk_lock.
1423  *
1424  * (We also register the sk_lock with the lock validator.)
      *
      * Sockets with sk->sk_kern_sock set use the af_family_kern_* key and
      * name tables, all other sockets use the plain af_family_* tables.  In
      * both cases the entries are selected by sk->sk_family, so that field
      * must already be valid when this is called.
1425  */
1426 static inline void sock_lock_init(struct sock *sk)
1427 {
1428         if (sk->sk_kern_sock)
1429                 sock_lock_init_class_and_name(
1430                         sk,
1431                         af_family_kern_slock_key_strings[sk->sk_family],
1432                         af_family_kern_slock_keys + sk->sk_family,
1433                         af_family_kern_key_strings[sk->sk_family],
1434                         af_family_kern_keys + sk->sk_family);
1435         else
1436                 sock_lock_init_class_and_name(
1437                         sk,
1438                         af_family_slock_key_strings[sk->sk_family],
1439                         af_family_slock_keys + sk->sk_family,
1440                         af_family_key_strings[sk->sk_family],
1441                         af_family_keys + sk->sk_family);
1442 }
1443
1444 /*
1445  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1446  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1447  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
      *
      * The copy is therefore done in two pieces: everything before
      * sk_dontcopy_begin, then everything from sk_dontcopy_end up to
      * osk->sk_prot->obj_size bytes from the start of the object.
1448  */
1449 static void sock_copy(struct sock *nsk, const struct sock *osk)
1450 {
1451 #ifdef CONFIG_SECURITY_NETWORK
             /* Remember nsk's own security pointer; the memcpy would clobber it. */
1452         void *sptr = nsk->sk_security;
1453 #endif
1454         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1455
1456         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1457                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1458
1459 #ifdef CONFIG_SECURITY_NETWORK
             /* Put nsk's pointer back, then call the LSM clone hook on the pair. */
1460         nsk->sk_security = sptr;
1461         security_sk_clone(osk, nsk);
1462 #endif
1463 }
1464 /*
      * sk_prot_alloc - allocate the memory for a sock of protocol @prot
      * @prot:     protocol; supplies the slab cache, object size and owner
      * @priority: gfp flags for the allocation
      * @family:   protocol family, passed on to security_sk_alloc()
      *
      * The object comes from prot->slab when the protocol has one, otherwise
      * from kmalloc(prot->obj_size).  After a successful allocation the LSM
      * state is set up, a reference on prot->owner is taken and the tx queue
      * mapping is cleared.
      *
      * Returns the new sock, or NULL when the allocation, security_sk_alloc()
      * or try_module_get() fails; everything done so far is undone first.
      */
1465 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1466                 int family)
1467 {
1468         struct sock *sk;
1469         struct kmem_cache *slab;
1470
1471         slab = prot->slab;
1472         if (slab != NULL) {
                     /*
                      * __GFP_ZERO is masked off for the slab allocation; when
                      * the caller asked for zeroing it is performed by
                      * sk_prot_clear_nulls() instead.
                      */
1473                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1474                 if (!sk)
1475                         return sk;
1476                 if (priority & __GFP_ZERO)
1477                         sk_prot_clear_nulls(sk, prot->obj_size);
1478         } else
1479                 sk = kmalloc(prot->obj_size, priority);
1480
1481         if (sk != NULL) {
1482                 kmemcheck_annotate_bitfield(sk, flags);
1483
1484                 if (security_sk_alloc(sk, family, priority))
1485                         goto out_free;
1486
1487                 if (!try_module_get(prot->owner))
1488                         goto out_free_sec;
1489                 sk_tx_queue_clear(sk);
1490         }
1491
1492         return sk;
1493
1494 out_free_sec:
             /* Undo security_sk_alloc(), then fall through to free the object. */
1495         security_sk_free(sk);
1496 out_free:
1497         if (slab != NULL)
1498                 kmem_cache_free(slab, sk);
1499         else
1500                 kfree(sk);
1501         return NULL;
1502 }
1503 /*
      * sk_prot_free - release a sock obtained from sk_prot_alloc()
      * @prot: protocol the sock was allocated for
      * @sk:   sock to free
      *
      * Releases the cgroup data, the memcg association and the LSM state,
      * returns the object to the slab cache or kfree(), and finally drops the
      * reference on the owning module.  prot->owner and prot->slab are read
      * into locals before anything is released.
      */
1504 static void sk_prot_free(struct proto *prot, struct sock *sk)
1505 {
1506         struct kmem_cache *slab;
1507         struct module *owner;
1508
1509         owner = prot->owner;
1510         slab = prot->slab;
1511
1512         cgroup_sk_free(&sk->sk_cgrp_data);
1513         mem_cgroup_sk_free(sk);
1514         security_sk_free(sk);
1515         if (slab != NULL)
1516                 kmem_cache_free(slab, sk);
1517         else
1518                 kfree(sk);
             /* Module reference goes last, after the object has been freed. */
1519         module_put(owner);
1520 }
1521
1522 /**
1523  *      sk_alloc - All socket objects are allocated here
1524  *      @net: the applicable net namespace
1525  *      @family: protocol family
1526  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1527  *      @prot: struct proto associated with this new sock instance
1528  *      @kern: is this to be a kernel socket?
      *
      *      The object is always requested zeroed (__GFP_ZERO is added to
      *      @priority).  A reference on @net is taken only for non-kernel
      *      sockets.
      *
      *      Return: the new sock, or NULL if sk_prot_alloc() failed.
1529  */
1530 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1531                       struct proto *prot, int kern)
1532 {
1533         struct sock *sk;
1534
1535         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1536         if (sk) {
                     /* sk_family must be set before sock_lock_init() reads it. */
1537                 sk->sk_family = family;
1538                 /*
1539                  * See comment in struct sock definition to understand
1540                  * why we need sk_prot_creator -acme
1541                  */
1542                 sk->sk_prot = sk->sk_prot_creator = prot;
1543                 sk->sk_kern_sock = kern;
1544                 sock_lock_init(sk);
                     /*
                      * sk_net_refcnt records whether this sock holds a
                      * reference on its netns: kernel sockets do not.
                      */
1545                 sk->sk_net_refcnt = kern ? 0 : 1;
1546                 if (likely(sk->sk_net_refcnt))
1547                         get_net(net);
1548                 sock_net_set(sk, net);
                     /* Initial count of one; sk_free() is what drops it. */
1549                 refcount_set(&sk->sk_wmem_alloc, 1);
1550
1551                 mem_cgroup_sk_alloc(sk);
1552                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1553                 sock_update_classid(&sk->sk_cgrp_data);
1554                 sock_update_netprioidx(&sk->sk_cgrp_data);
1555         }
1556
1557         return sk;
1558 }
1559 EXPORT_SYMBOL(sk_alloc);
1560
1561 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1562  * grace period. This is the case for UDP sockets and TCP listeners.
      *
      * @head is the sk_rcu member embedded in the sock being destroyed; the
      * sock is recovered from it with container_of().  Sockets without
      * SOCK_RCU_FREE reach this function directly from sk_destruct().
      *
      * Runs the protocol's sk_destruct callback first, then releases what the
      * sock still holds (filter charge, reuseport group, timestamping, the
      * cached page fragment, peer credentials and pid, the netns reference)
      * and finally frees the object through sk_prot_free() using
      * sk->sk_prot_creator.
1563  */
1564 static void __sk_destruct(struct rcu_head *head)
1565 {
1566         struct sock *sk = container_of(head, struct sock, sk_rcu);
1567         struct sk_filter *filter;
1568
1569         if (sk->sk_destruct)
1570                 sk->sk_destruct(sk);
1571
             /* The dereference is annotated as valid once sk_wmem_alloc is 0. */
1572         filter = rcu_dereference_check(sk->sk_filter,
1573                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1574         if (filter) {
1575                 sk_filter_uncharge(sk, filter);
1576                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1577         }
1578         if (rcu_access_pointer(sk->sk_reuseport_cb))
1579                 reuseport_detach_sock(sk);
1580
1581         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1582
             /* Leftover option memory is only reported, not reclaimed here. */
1583         if (atomic_read(&sk->sk_omem_alloc))
1584                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1585                          __func__, atomic_read(&sk->sk_omem_alloc));
1586
1587         if (sk->sk_frag.page) {
1588                 put_page(sk->sk_frag.page);
1589                 sk->sk_frag.page = NULL;
1590         }
1591
1592         if (sk->sk_peer_cred)
1593                 put_cred(sk->sk_peer_cred);
1594         put_pid(sk->sk_peer_pid);
             /* Mirrors sk_alloc(): only such socks took a netns reference. */
1595         if (likely(sk->sk_net_refcnt))
1596                 put_net(sock_net(sk));
1597         sk_prot_free(sk->sk_prot_creator, sk);
1598 }
1599 /*
      * sk_destruct - destroy a sock, deferring through RCU when required
      * @sk: sock to destroy
      *
      * With SOCK_RCU_FREE set, __sk_destruct() is queued with call_rcu();
      * otherwise it is called synchronously.
      */
1600 void sk_destruct(struct sock *sk)
1601 {
1602         if (sock_flag(sk, SOCK_RCU_FREE))
1603                 call_rcu(&sk->sk_rcu, __sk_destruct);
1604         else
1605                 __sk_destruct(&sk->sk_rcu);
1606 }
1607 /*
      * __sk_free - final step once sk_wmem_alloc has dropped to zero
      * @sk: sock to release
      *
      * When sock_diag has destroy listeners and the sock holds a netns
      * reference (sk_net_refcnt), the sock is handed to
      * sock_diag_broadcast_destroy() instead of being destroyed here.
      * NOTE(review): that path presumably ends up calling sk_destruct()
      * itself -- confirm in the sock_diag code.  In every other case
      * sk_destruct() is called directly.
      */
1608 static void __sk_free(struct sock *sk)
1609 {
1610         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1611                 sock_diag_broadcast_destroy(sk);
1612         else
1613                 sk_destruct(sk);
1614 }
1615 /*
      * sk_free - drop one sk_wmem_alloc reference and free the sock at zero
      * @sk: sock to release
      *
      * sk_alloc() sets sk_wmem_alloc to 1.  If the counter does not reach
      * zero here, nothing else is done by this call.
      */
1616 void sk_free(struct sock *sk)
1617 {
1618         /*
1619          * We subtract one from sk_wmem_alloc and can know if
1620          * some packets are still in some tx queue.
1621          * If not null, sock_wfree() will call __sk_free(sk) later
1622          */
1623         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1624                 __sk_free(sk);
1625 }
1626 EXPORT_SYMBOL(sk_free);
1627 /*
      * sk_init_common - initialise the queues and callback lock of a sock
      * @sk: sock to initialise; sk->sk_family must already be valid
      *
      * Initialises the receive, write and error skb queues and
      * sk_callback_lock, then gives each of the four locks a lockdep class
      * and name taken from the per-family tables indexed by sk->sk_family.
      */
1628 static void sk_init_common(struct sock *sk)
1629 {
1630         skb_queue_head_init(&sk->sk_receive_queue);
1631         skb_queue_head_init(&sk->sk_write_queue);
1632         skb_queue_head_init(&sk->sk_error_queue);
1633
1634         rwlock_init(&sk->sk_callback_lock);
1635         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1636                         af_rlock_keys + sk->sk_family,
1637                         af_family_rlock_key_strings[sk->sk_family]);
1638         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1639                         af_wlock_keys + sk->sk_family,
1640                         af_family_wlock_key_strings[sk->sk_family]);
1641         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1642                         af_elock_keys + sk->sk_family,
1643                         af_family_elock_key_strings[sk->sk_family]);
1644         lockdep_set_class_and_name(&sk->sk_callback_lock,
1645                         af_callback_keys + sk->sk_family,
1646                         af_family_clock_key_strings[sk->sk_family]);
1647 }
1648
1649 /**
1650  *      sk_clone_lock - clone a socket, and lock its clone
1651  *      @sk: the socket to clone
1652  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1653  *
1654  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1655  */
1656 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1657 {
1658         struct sock *newsk;
1659         bool is_charged = true;
1660
1661         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1662         if (newsk != NULL) {
1663                 struct sk_filter *filter;
1664
1665                 sock_copy(newsk, sk);
1666
1667                 /* SANITY */
1668                 if (likely(newsk->sk_net_refcnt))
1669                         get_net(sock_net(newsk));
1670                 sk_node_init(&newsk->sk_node);
1671                 sock_lock_init(newsk);
1672                 bh_lock_sock(newsk);
1673                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1674                 newsk->sk_backlog.len = 0;
1675
1676                 atomic_set(&newsk->sk_rmem_alloc, 0);
1677                 /*
1678                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1679                  */
1680                 refcount_set(&newsk->sk_wmem_alloc, 1);
1681                 atomic_set(&newsk->sk_omem_alloc, 0);
1682                 sk_init_common(newsk);
1683
1684                 newsk->sk_dst_cache     = NULL;
1685                 newsk->sk_dst_pending_confirm = 0;
1686                 newsk->sk_wmem_queued   = 0;
1687                 newsk->sk_forward_alloc = 0;
1688                 atomic_set(&newsk->sk_drops, 0);
1689                 newsk->sk_send_head     = NULL;
1690                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1691                 atomic_set(&newsk->sk_zckey, 0);
1692
1693                 sock_reset_flag(newsk, SOCK_DONE);
1694
1695                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1696                 if (filter != NULL)
1697                         /* though it's an empty new sock, the charging may fail
1698                          * if sysctl_optmem_max was changed between creation of
1699                          * original socket and cloning
1700                          */
1701                         is_charged = sk_filter_charge(newsk, filter);
1702
1703                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1704                         /* We need to make sure that we don't uncharge the new
1705                          * socket if we couldn't charge it in the first place
1706                          * as otherwise we uncharge the parent's filter.
1707                          */
1708                         if (!is_charged)
1709                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1710                         sk_free_unlock_clone(newsk);
1711                         newsk = NULL;
1712                         goto out;
1713                 }
1714                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1715
1716                 newsk->sk_err      = 0;
1717                 newsk->sk_err_soft = 0;
1718                 newsk->sk_priority = 0;
1719                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1720                 atomic64_set(&newsk->sk_cookie, 0);
1721
1722                 mem_cgroup_sk_alloc(newsk);
1723                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1724
1725                 /*
1726                  * Before updating sk_refcnt, we must commit prior changes to memory
1727                  * (Documentation/RCU/rculist_nulls.txt for details)
1728                  */
1729                 smp_wmb();
1730                 refcount_set(&newsk->sk_refcnt, 2);
1731
1732                 /*
1733                  * Increment the counter in the same struct proto as the master
1734                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1735                  * is the same as sk->sk_prot->socks, as this field was copied
1736                  * with memcpy).
1737                  *
1738                  * This _changes_ the previous behaviour, where
1739                  * tcp_create_openreq_child always was incrementing the
1740                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1741                  * to be taken into account in all callers. -acme
1742                  */
1743                 sk_refcnt_debug_inc(newsk);
1744                 sk_set_socket(newsk, NULL);
1745                 newsk->sk_wq = NULL;
1746
1747                 if (newsk->sk_prot->sockets_allocated)
1748                         sk_sockets_allocated_inc(newsk);
1749
1750                 if (sock_needs_netstamp(sk) &&
1751                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1752                         net_enable_timestamp();
1753         }
1754 out:
1755         return newsk;
1756 }
1757 EXPORT_SYMBOL_GPL(sk_clone_lock);
1758
1759 void sk_free_unlock_clone(struct sock *sk)
1760 {
1761         /* It is still raw copy of parent, so invalidate
1762          * destructor and make plain sk_free() */
1763         sk->sk_destruct = NULL;
1764         bh_unlock_sock(sk);
1765         sk_free(sk);
1766 }
1767 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1768
1769 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1770 {
1771         u32 max_segs = 1;
1772
1773         sk_dst_set(sk, dst);
1774         sk->sk_route_caps = dst->dev->features;
1775         if (sk->sk_route_caps & NETIF_F_GSO)
1776                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1777         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1778         if (sk_can_gso(sk)) {
1779                 if (dst->header_len) {
1780                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1781                 } else {
1782                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1783                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1784                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1785                 }
1786         }
1787         sk->sk_gso_max_segs = max_segs;
1788 }
1789 EXPORT_SYMBOL_GPL(sk_setup_caps);
1790
1791 /*
1792  *      Simple resource managers for sockets.
1793  */
1794
1795
1796 /*
1797  * Write buffer destructor automatically called from kfree_skb.
1798  */
1799 void sock_wfree(struct sk_buff *skb)
1800 {
1801         struct sock *sk = skb->sk;
1802         unsigned int len = skb->truesize;
1803
1804         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1805                 /*
1806                  * Keep a reference on sk_wmem_alloc, this will be released
1807                  * after sk_write_space() call
1808                  */
1809                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1810                 sk->sk_write_space(sk);
1811                 len = 1;
1812         }
1813         /*
1814          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1815          * could not do because of in-flight packets
1816          */
1817         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1818                 __sk_free(sk);
1819 }
1820 EXPORT_SYMBOL(sock_wfree);
1821
1822 /* This variant of sock_wfree() is used by TCP,
1823  * since it sets SOCK_USE_WRITE_QUEUE.
1824  */
1825 void __sock_wfree(struct sk_buff *skb)
1826 {
1827         struct sock *sk = skb->sk;
1828
1829         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1830                 __sk_free(sk);
1831 }
1832
1833 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1834 {
1835         skb_orphan(skb);
1836         skb->sk = sk;
1837 #ifdef CONFIG_INET
1838         if (unlikely(!sk_fullsock(sk))) {
1839                 skb->destructor = sock_edemux;
1840                 sock_hold(sk);
1841                 return;
1842         }
1843 #endif
1844         skb->destructor = sock_wfree;
1845         skb_set_hash_from_sk(skb, sk);
1846         /*
1847          * We used to take a refcount on sk, but following operation
1848          * is enough to guarantee sk_free() wont free this sock until
1849          * all in-flight packets are completed
1850          */
1851         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1852 }
1853 EXPORT_SYMBOL(skb_set_owner_w);
1854
1855 /* This helper is used by netem, as it can hold packets in its
1856  * delay queue. We want to allow the owner socket to send more
1857  * packets, as if they were already TX completed by a typical driver.
1858  * But we also want to keep skb->sk set because some packet schedulers
1859  * rely on it (sch_fq for example).
1860  */
1861 void skb_orphan_partial(struct sk_buff *skb)
1862 {
1863         if (skb_is_tcp_pure_ack(skb))
1864                 return;
1865
1866         if (skb->destructor == sock_wfree
1867 #ifdef CONFIG_INET
1868             || skb->destructor == tcp_wfree
1869 #endif
1870                 ) {
1871                 struct sock *sk = skb->sk;
1872
1873                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1874                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1875                         skb->destructor = sock_efree;
1876                 }
1877         } else {
1878                 skb_orphan(skb);
1879         }
1880 }
1881 EXPORT_SYMBOL(skb_orphan_partial);
1882
1883 /*
1884  * Read buffer destructor automatically called from kfree_skb.
1885  */
1886 void sock_rfree(struct sk_buff *skb)
1887 {
1888         struct sock *sk = skb->sk;
1889         unsigned int len = skb->truesize;
1890
1891         atomic_sub(len, &sk->sk_rmem_alloc);
1892         sk_mem_uncharge(sk, len);
1893 }
1894 EXPORT_SYMBOL(sock_rfree);
1895
1896 /*
1897  * Buffer destructor for skbs that are not used directly in read or write
1898  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1899  */
1900 void sock_efree(struct sk_buff *skb)
1901 {
1902         sock_put(skb->sk);
1903 }
1904 EXPORT_SYMBOL(sock_efree);
1905
1906 kuid_t sock_i_uid(struct sock *sk)
1907 {
1908         kuid_t uid;
1909
1910         read_lock_bh(&sk->sk_callback_lock);
1911         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1912         read_unlock_bh(&sk->sk_callback_lock);
1913         return uid;
1914 }
1915 EXPORT_SYMBOL(sock_i_uid);
1916
1917 unsigned long sock_i_ino(struct sock *sk)
1918 {
1919         unsigned long ino;
1920
1921         read_lock_bh(&sk->sk_callback_lock);
1922         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1923         read_unlock_bh(&sk->sk_callback_lock);
1924         return ino;
1925 }
1926 EXPORT_SYMBOL(sock_i_ino);
1927
1928 /*
1929  * Allocate a skb from the socket's send buffer.
1930  */
1931 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1932                              gfp_t priority)
1933 {
1934         if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1935                 struct sk_buff *skb = alloc_skb(size, priority);
1936                 if (skb) {
1937                         skb_set_owner_w(skb, sk);
1938                         return skb;
1939                 }
1940         }
1941         return NULL;
1942 }
1943 EXPORT_SYMBOL(sock_wmalloc);
1944
1945 static void sock_ofree(struct sk_buff *skb)
1946 {
1947         struct sock *sk = skb->sk;
1948
1949         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1950 }
1951
1952 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1953                              gfp_t priority)
1954 {
1955         struct sk_buff *skb;
1956
1957         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1958         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1959             sysctl_optmem_max)
1960                 return NULL;
1961
1962         skb = alloc_skb(size, priority);
1963         if (!skb)
1964                 return NULL;
1965
1966         atomic_add(skb->truesize, &sk->sk_omem_alloc);
1967         skb->sk = sk;
1968         skb->destructor = sock_ofree;
1969         return skb;
1970 }
1971
1972 /*
1973  * Allocate a memory block from the socket's option memory buffer.
1974  */
1975 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1976 {
1977         if ((unsigned int)size <= sysctl_optmem_max &&
1978             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1979                 void *mem;
1980                 /* First do the add, to avoid the race if kmalloc
1981                  * might sleep.
1982                  */
1983                 atomic_add(size, &sk->sk_omem_alloc);
1984                 mem = kmalloc(size, priority);
1985                 if (mem)
1986                         return mem;
1987                 atomic_sub(size, &sk->sk_omem_alloc);
1988         }
1989         return NULL;
1990 }
1991 EXPORT_SYMBOL(sock_kmalloc);
1992
1993 /* Free an option memory block. Note, we actually want the inline
1994  * here as this allows gcc to detect the nullify and fold away the
1995  * condition entirely.
1996  */
1997 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1998                                   const bool nullify)
1999 {
2000         if (WARN_ON_ONCE(!mem))
2001                 return;
2002         if (nullify)
2003                 kzfree(mem);
2004         else
2005                 kfree(mem);
2006         atomic_sub(size, &sk->sk_omem_alloc);
2007 }
2008
2009 void sock_kfree_s(struct sock *sk, void *mem, int size)
2010 {
2011         __sock_kfree_s(sk, mem, size, false);
2012 }
2013 EXPORT_SYMBOL(sock_kfree_s);
2014
2015 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2016 {
2017         __sock_kfree_s(sk, mem, size, true);
2018 }
2019 EXPORT_SYMBOL(sock_kzfree_s);
2020
2021 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2022    I think, these locks should be removed for datagram sockets.
2023  */
2024 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2025 {
2026         DEFINE_WAIT(wait);
2027
2028         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2029         for (;;) {
2030                 if (!timeo)
2031                         break;
2032                 if (signal_pending(current))
2033                         break;
2034                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2035                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2036                 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2037                         break;
2038                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2039                         break;
2040                 if (sk->sk_err)
2041                         break;
2042                 timeo = schedule_timeout(timeo);
2043         }
2044         finish_wait(sk_sleep(sk), &wait);
2045         return timeo;
2046 }
2047
2048
2049 /*
2050  *      Generic send/receive buffer handlers
2051  */
2052
2053 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2054                                      unsigned long data_len, int noblock,
2055                                      int *errcode, int max_page_order)
2056 {
2057         struct sk_buff *skb;
2058         long timeo;
2059         int err;
2060
2061         timeo = sock_sndtimeo(sk, noblock);
2062         for (;;) {
2063                 err = sock_error(sk);
2064                 if (err != 0)
2065                         goto failure;
2066
2067                 err = -EPIPE;
2068                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2069                         goto failure;
2070
2071                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2072                         break;
2073
2074                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2075                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2076                 err = -EAGAIN;
2077                 if (!timeo)
2078                         goto failure;
2079                 if (signal_pending(current))
2080                         goto interrupted;
2081                 timeo = sock_wait_for_wmem(sk, timeo);
2082         }
2083         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2084                                    errcode, sk->sk_allocation);
2085         if (skb)
2086                 skb_set_owner_w(skb, sk);
2087         return skb;
2088
2089 interrupted:
2090         err = sock_intr_errno(timeo);
2091 failure:
2092         *errcode = err;
2093         return NULL;
2094 }
2095 EXPORT_SYMBOL(sock_alloc_send_pskb);
2096
/*
 * Linear-only front end to sock_alloc_send_pskb(): @size bytes of header,
 * no paged data, page order 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2103
2104 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2105                      struct sockcm_cookie *sockc)
2106 {
2107         u32 tsflags;
2108
2109         switch (cmsg->cmsg_type) {
2110         case SO_MARK:
2111                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2112                         return -EPERM;
2113                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2114                         return -EINVAL;
2115                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2116                 break;
2117         case SO_TIMESTAMPING:
2118                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2119                         return -EINVAL;
2120
2121                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2122                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2123                         return -EINVAL;
2124
2125                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2126                 sockc->tsflags |= tsflags;
2127                 break;
2128         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2129         case SCM_RIGHTS:
2130         case SCM_CREDENTIALS:
2131                 break;
2132         default:
2133                 return -EINVAL;
2134         }
2135         return 0;
2136 }
2137 EXPORT_SYMBOL(__sock_cmsg_send);
2138
2139 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2140                    struct sockcm_cookie *sockc)
2141 {
2142         struct cmsghdr *cmsg;
2143         int ret;
2144
2145         for_each_cmsghdr(cmsg, msg) {
2146                 if (!CMSG_OK(msg, cmsg))
2147                         return -EINVAL;
2148                 if (cmsg->cmsg_level != SOL_SOCKET)
2149                         continue;
2150                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2151                 if (ret)
2152                         return ret;
2153         }
2154         return 0;
2155 }
2156 EXPORT_SYMBOL(sock_cmsg_send);
2157
2158 static void sk_enter_memory_pressure(struct sock *sk)
2159 {
2160         if (!sk->sk_prot->enter_memory_pressure)
2161                 return;
2162
2163         sk->sk_prot->enter_memory_pressure(sk);
2164 }
2165
2166 static void sk_leave_memory_pressure(struct sock *sk)
2167 {
2168         if (sk->sk_prot->leave_memory_pressure) {
2169                 sk->sk_prot->leave_memory_pressure(sk);
2170         } else {
2171                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2172
2173                 if (memory_pressure && *memory_pressure)
2174                         *memory_pressure = 0;
2175         }
2176 }
2177
2178 /* On 32bit arches, an skb frag is limited to 2^15 */
2179 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2180
2181 /**
2182  * skb_page_frag_refill - check that a page_frag contains enough room
2183  * @sz: minimum size of the fragment we want to get
2184  * @pfrag: pointer to page_frag
2185  * @gfp: priority for memory allocation
2186  *
2187  * Note: While this allocator tries to use high order pages, there is
2188  * no guarantee that allocations succeed. Therefore, @sz MUST be
2189  * less or equal than PAGE_SIZE.
2190  */
2191 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2192 {
2193         if (pfrag->page) {
2194                 if (page_ref_count(pfrag->page) == 1) {
2195                         pfrag->offset = 0;
2196                         return true;
2197                 }
2198                 if (pfrag->offset + sz <= pfrag->size)
2199                         return true;
2200                 put_page(pfrag->page);
2201         }
2202
2203         pfrag->offset = 0;
2204         if (SKB_FRAG_PAGE_ORDER) {
2205                 /* Avoid direct reclaim but allow kswapd to wake */
2206                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2207                                           __GFP_COMP | __GFP_NOWARN |
2208                                           __GFP_NORETRY,
2209                                           SKB_FRAG_PAGE_ORDER);
2210                 if (likely(pfrag->page)) {
2211                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2212                         return true;
2213                 }
2214         }
2215         pfrag->page = alloc_page(gfp);
2216         if (likely(pfrag->page)) {
2217                 pfrag->size = PAGE_SIZE;
2218                 return true;
2219         }
2220         return false;
2221 }
2222 EXPORT_SYMBOL(skb_page_frag_refill);
2223
2224 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2225 {
2226         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2227                 return true;
2228
2229         sk_enter_memory_pressure(sk);
2230         sk_stream_moderate_sndbuf(sk);
2231         return false;
2232 }
2233 EXPORT_SYMBOL(sk_page_frag_refill);
2234
2235 static void __lock_sock(struct sock *sk)
2236         __releases(&sk->sk_lock.slock)
2237         __acquires(&sk->sk_lock.slock)
2238 {
2239         DEFINE_WAIT(wait);
2240
2241         for (;;) {
2242                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2243                                         TASK_UNINTERRUPTIBLE);
2244                 spin_unlock_bh(&sk->sk_lock.slock);
2245                 schedule();
2246                 spin_lock_bh(&sk->sk_lock.slock);
2247                 if (!sock_owned_by_user(sk))
2248                         break;
2249         }
2250         finish_wait(&sk->sk_lock.wq, &wait);
2251 }
2252
2253 static void __release_sock(struct sock *sk)
2254         __releases(&sk->sk_lock.slock)
2255         __acquires(&sk->sk_lock.slock)
2256 {
2257         struct sk_buff *skb, *next;
2258
2259         while ((skb = sk->sk_backlog.head) != NULL) {
2260                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2261
2262                 spin_unlock_bh(&sk->sk_lock.slock);
2263
2264                 do {
2265                         next = skb->next;
2266                         prefetch(next);
2267                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2268                         skb->next = NULL;
2269                         sk_backlog_rcv(sk, skb);
2270
2271                         cond_resched();
2272
2273                         skb = next;
2274                 } while (skb != NULL);
2275
2276                 spin_lock_bh(&sk->sk_lock.slock);
2277         }
2278
2279         /*
2280          * Doing the zeroing here guarantee we can not loop forever
2281          * while a wild producer attempts to flood us.
2282          */
2283         sk->sk_backlog.len = 0;
2284 }
2285
2286 void __sk_flush_backlog(struct sock *sk)
2287 {
2288         spin_lock_bh(&sk->sk_lock.slock);
2289         __release_sock(sk);
2290         spin_unlock_bh(&sk->sk_lock.slock);
2291 }
2292
2293 /**
2294  * sk_wait_data - wait for data to arrive at sk_receive_queue
2295  * @sk:    sock to wait on
2296  * @timeo: for how long
2297  * @skb:   last skb seen on sk_receive_queue
2298  *
2299  * Now socket state including sk->sk_err is changed only under lock,
2300  * hence we may omit checks after joining wait queue.
2301  * We check receive queue before schedule() only as optimization;
2302  * it is very likely that release_sock() added new data.
2303  */
2304 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2305 {
2306         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2307         int rc;
2308
2309         add_wait_queue(sk_sleep(sk), &wait);
2310         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2311         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2312         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2313         remove_wait_queue(sk_sleep(sk), &wait);
2314         return rc;
2315 }
2316 EXPORT_SYMBOL(sk_wait_data);
2317
2318 /**
2319  *      __sk_mem_raise_allocated - increase memory_allocated
2320  *      @sk: socket
2321  *      @size: memory size to allocate
2322  *      @amt: pages to allocate
2323  *      @kind: allocation type
2324  *
2325  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2326  */
2327 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2328 {
2329         struct proto *prot = sk->sk_prot;
2330         long allocated = sk_memory_allocated_add(sk, amt);
2331
2332         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2333             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2334                 goto suppress_allocation;
2335
2336         /* Under limit. */
2337         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2338                 sk_leave_memory_pressure(sk);
2339                 return 1;
2340         }
2341
2342         /* Under pressure. */
2343         if (allocated > sk_prot_mem_limits(sk, 1))
2344                 sk_enter_memory_pressure(sk);
2345
2346         /* Over hard limit. */
2347         if (allocated > sk_prot_mem_limits(sk, 2))
2348                 goto suppress_allocation;
2349
2350         /* guarantee minimum buffer size under pressure */
2351         if (kind == SK_MEM_RECV) {
2352                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2353                         return 1;
2354
2355         } else { /* SK_MEM_SEND */
2356                 if (sk->sk_type == SOCK_STREAM) {
2357                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2358                                 return 1;
2359                 } else if (refcount_read(&sk->sk_wmem_alloc) <
2360                            prot->sysctl_wmem[0])
2361                                 return 1;
2362         }
2363
2364         if (sk_has_memory_pressure(sk)) {
2365                 int alloc;
2366
2367                 if (!sk_under_memory_pressure(sk))
2368                         return 1;
2369                 alloc = sk_sockets_allocated_read_positive(sk);
2370                 if (sk_prot_mem_limits(sk, 2) > alloc *
2371                     sk_mem_pages(sk->sk_wmem_queued +
2372                                  atomic_read(&sk->sk_rmem_alloc) +
2373                                  sk->sk_forward_alloc))
2374                         return 1;
2375         }
2376
2377 suppress_allocation:
2378
2379         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2380                 sk_stream_moderate_sndbuf(sk);
2381
2382                 /* Fail only if socket is _under_ its sndbuf.
2383                  * In this case we cannot block, so that we have to fail.
2384                  */
2385                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2386                         return 1;
2387         }
2388
2389         trace_sock_exceed_buf_limit(sk, prot, allocated);
2390
2391         sk_memory_allocated_sub(sk, amt);
2392
2393         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2394                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2395
2396         return 0;
2397 }
2398 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2399
2400 /**
2401  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2402  *      @sk: socket
2403  *      @size: memory size to allocate
2404  *      @kind: allocation type
2405  *
2406  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2407  *      rmem allocation. This function assumes that protocols which have
2408  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2409  */
2410 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2411 {
2412         int ret, amt = sk_mem_pages(size);
2413
2414         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2415         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2416         if (!ret)
2417                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2418         return ret;
2419 }
2420 EXPORT_SYMBOL(__sk_mem_schedule);
2421
2422 /**
2423  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2424  *      @sk: socket
2425  *      @amount: number of quanta
2426  *
2427  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2428  */
2429 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2430 {
2431         sk_memory_allocated_sub(sk, amount);
2432
2433         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2434                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2435
2436         if (sk_under_memory_pressure(sk) &&
2437             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2438                 sk_leave_memory_pressure(sk);
2439 }
2440 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2441
2442 /**
2443  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2444  *      @sk: socket
2445  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2446  */
2447 void __sk_mem_reclaim(struct sock *sk, int amount)
2448 {
2449         amount >>= SK_MEM_QUANTUM_SHIFT;
2450         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2451         __sk_mem_reduce_allocated(sk, amount);
2452 }
2453 EXPORT_SYMBOL(__sk_mem_reclaim);
2454
2455 int sk_set_peek_off(struct sock *sk, int val)
2456 {
2457         if (val < 0)
2458                 return -EINVAL;
2459
2460         sk->sk_peek_off = val;
2461         return 0;
2462 }
2463 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2464
2465 /*
2466  * Set of default routines for initialising struct proto_ops when
2467  * the protocol does not support a particular function. In certain
2468  * cases where it makes no sense for a protocol to have a "do nothing"
2469  * function, some default processing is provided.
2470  */
2471
2472 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2473 {
2474         return -EOPNOTSUPP;
2475 }
2476 EXPORT_SYMBOL(sock_no_bind);
2477
2478 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2479                     int len, int flags)
2480 {
2481         return -EOPNOTSUPP;
2482 }
2483 EXPORT_SYMBOL(sock_no_connect);
2484
2485 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2486 {
2487         return -EOPNOTSUPP;
2488 }
2489 EXPORT_SYMBOL(sock_no_socketpair);
2490
2491 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2492                    bool kern)
2493 {
2494         return -EOPNOTSUPP;
2495 }
2496 EXPORT_SYMBOL(sock_no_accept);
2497
2498 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2499                     int *len, int peer)
2500 {
2501         return -EOPNOTSUPP;
2502 }
2503 EXPORT_SYMBOL(sock_no_getname);
2504
2505 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2506 {
2507         return 0;
2508 }
2509 EXPORT_SYMBOL(sock_no_poll);
2510
2511 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2512 {
2513         return -EOPNOTSUPP;
2514 }
2515 EXPORT_SYMBOL(sock_no_ioctl);
2516
2517 int sock_no_listen(struct socket *sock, int backlog)
2518 {
2519         return -EOPNOTSUPP;
2520 }
2521 EXPORT_SYMBOL(sock_no_listen);
2522
2523 int sock_no_shutdown(struct socket *sock, int how)
2524 {
2525         return -EOPNOTSUPP;
2526 }
2527 EXPORT_SYMBOL(sock_no_shutdown);
2528
2529 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2530                     char __user *optval, unsigned int optlen)
2531 {
2532         return -EOPNOTSUPP;
2533 }
2534 EXPORT_SYMBOL(sock_no_setsockopt);
2535
2536 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2537                     char __user *optval, int __user *optlen)
2538 {
2539         return -EOPNOTSUPP;
2540 }
2541 EXPORT_SYMBOL(sock_no_getsockopt);
2542
/*
 * sock_no_sendmsg - default sendmsg handler for protocols without
 * sendmsg support. Ignores its arguments and returns -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
2548
/*
 * sock_no_sendmsg_locked - counterpart of sock_no_sendmsg() that takes a
 * struct sock instead of a struct socket. Ignores its arguments and
 * returns -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m,
			   size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
2554
/*
 * sock_no_recvmsg - default recvmsg handler for protocols without
 * recvmsg support. Ignores its arguments and returns -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
2561
/*
 * sock_no_mmap - default mmap handler for protocols without mmap support.
 * Unlike the other sock_no_*() stubs this one returns -ENODEV, so that it
 * mirrors the error code used when an mmap method is missing.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
2568
2569 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2570 {
2571         ssize_t res;
2572         struct msghdr msg = {.msg_flags = flags};
2573         struct kvec iov;
2574         char *kaddr = kmap(page);
2575         iov.iov_base = kaddr + offset;
2576         iov.iov_len = size;
2577         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2578         kunmap(page);
2579         return res;
2580 }
2581 EXPORT_SYMBOL(sock_no_sendpage);
2582
2583 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2584                                 int offset, size_t size, int flags)
2585 {
2586         ssize_t res;
2587         struct msghdr msg = {.msg_flags = flags};
2588         struct kvec iov;
2589         char *kaddr = kmap(page);
2590
2591         iov.iov_base = kaddr + offset;
2592         iov.iov_len = size;
2593         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2594         kunmap(page);
2595         return res;
2596 }
2597 EXPORT_SYMBOL(sock_no_sendpage_locked);
2598
2599 /*
2600  *      Default Socket Callbacks
2601  */
2602
2603 static void sock_def_wakeup(struct sock *sk)
2604 {
2605         struct socket_wq *wq;
2606
2607         rcu_read_lock();
2608         wq = rcu_dereference(sk->sk_wq);
2609         if (skwq_has_sleeper(wq))
2610                 wake_up_interruptible_all(&wq->wait);
2611         rcu_read_unlock();
2612 }
2613
2614 static void sock_def_error_report(struct sock *sk)
2615 {
2616         struct socket_wq *wq;
2617
2618         rcu_read_lock();
2619         wq = rcu_dereference(sk->sk_wq);
2620         if (skwq_has_sleeper(wq))
2621                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2622         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2623         rcu_read_unlock();
2624 }
2625
2626 static void sock_def_readable(struct sock *sk)
2627 {
2628         struct socket_wq *wq;
2629
2630         rcu_read_lock();
2631         wq = rcu_dereference(sk->sk_wq);
2632         if (skwq_has_sleeper(wq))
2633                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2634                                                 POLLRDNORM | POLLRDBAND);
2635         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2636         rcu_read_unlock();
2637 }
2638
2639 static void sock_def_write_space(struct sock *sk)
2640 {
2641         struct socket_wq *wq;
2642
2643         rcu_read_lock();
2644
2645         /* Do not wake up a writer until he can make "significant"
2646          * progress.  --DaveM
2647          */
2648         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2649                 wq = rcu_dereference(sk->sk_wq);
2650                 if (skwq_has_sleeper(wq))
2651                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2652                                                 POLLWRNORM | POLLWRBAND);
2653
2654                 /* Should agree with poll, otherwise some programs break */
2655                 if (sock_writeable(sk))
2656                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2657         }
2658
2659         rcu_read_unlock();
2660 }
2661
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2665
2666 void sk_send_sigurg(struct sock *sk)
2667 {
2668         if (sk->sk_socket && sk->sk_socket->file)
2669                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2670                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2671 }
2672 EXPORT_SYMBOL(sk_send_sigurg);
2673
/*
 * sk_reset_timer - (re)arm a timer that belongs to a sock.
 * @sk: sock the timer belongs to
 * @timer: timer to modify
 * @expires: new expiry time
 *
 * A sock reference is taken only when mod_timer() returns 0; a non-zero
 * return leaves the reference count untouched.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int was_pending = mod_timer(timer, expires);

	if (!was_pending)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2681
/*
 * sk_stop_timer - cancel a timer that belongs to a sock.
 * @sk: sock the timer belongs to
 * @timer: timer to delete
 *
 * When del_timer() returns non-zero, one sock reference is dropped
 * through __sock_put(); otherwise the reference count is left alone.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer(timer);

	if (was_pending)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2688
2689 void sock_init_data(struct socket *sock, struct sock *sk)
2690 {
2691         sk_init_common(sk);
2692         sk->sk_send_head        =       NULL;
2693
2694         init_timer(&sk->sk_timer);
2695
2696         sk->sk_allocation       =       GFP_KERNEL;
2697         sk->sk_rcvbuf           =       sysctl_rmem_default;
2698         sk->sk_sndbuf           =       sysctl_wmem_default;
2699         sk->sk_state            =       TCP_CLOSE;
2700         sk_set_socket(sk, sock);
2701
2702         sock_set_flag(sk, SOCK_ZAPPED);
2703
2704         if (sock) {
2705                 sk->sk_type     =       sock->type;
2706                 sk->sk_wq       =       sock->wq;
2707                 sock->sk        =       sk;
2708                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2709         } else {
2710                 sk->sk_wq       =       NULL;
2711                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2712         }
2713
2714         rwlock_init(&sk->sk_callback_lock);
2715         if (sk->sk_kern_sock)
2716                 lockdep_set_class_and_name(
2717                         &sk->sk_callback_lock,
2718                         af_kern_callback_keys + sk->sk_family,
2719                         af_family_kern_clock_key_strings[sk->sk_family]);
2720         else
2721                 lockdep_set_class_and_name(
2722                         &sk->sk_callback_lock,
2723                         af_callback_keys + sk->sk_family,
2724                         af_family_clock_key_strings[sk->sk_family]);
2725
2726         sk->sk_state_change     =       sock_def_wakeup;
2727         sk->sk_data_ready       =       sock_def_readable;
2728         sk->sk_write_space      =       sock_def_write_space;
2729         sk->sk_error_report     =       sock_def_error_report;
2730         sk->sk_destruct         =       sock_def_destruct;
2731
2732         sk->sk_frag.page        =       NULL;
2733         sk->sk_frag.offset      =       0;
2734         sk->sk_peek_off         =       -1;
2735
2736         sk->sk_peer_pid         =       NULL;
2737         sk->sk_peer_cred        =       NULL;
2738         sk->sk_write_pending    =       0;
2739         sk->sk_rcvlowat         =       1;
2740         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2741         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2742
2743         sk->sk_stamp = SK_DEFAULT_STAMP;
2744         atomic_set(&sk->sk_zckey, 0);
2745
2746 #ifdef CONFIG_NET_RX_BUSY_POLL
2747         sk->sk_napi_id          =       0;
2748         sk->sk_ll_usec          =       sysctl_net_busy_read;
2749 #endif
2750
2751         sk->sk_max_pacing_rate = ~0U;
2752         sk->sk_pacing_rate = ~0U;
2753         sk->sk_incoming_cpu = -1;
2754         /*
2755          * Before updating sk_refcnt, we must commit prior changes to memory
2756          * (Documentation/RCU/rculist_nulls.txt for details)
2757          */
2758         smp_wmb();
2759         refcount_set(&sk->sk_refcnt, 1);
2760         atomic_set(&sk->sk_drops, 0);
2761 }
2762 EXPORT_SYMBOL(sock_init_data);
2763
2764 void lock_sock_nested(struct sock *sk, int subclass)
2765 {
2766         might_sleep();
2767         spin_lock_bh(&sk->sk_lock.slock);
2768         if (sk->sk_lock.owned)
2769                 __lock_sock(sk);
2770         sk->sk_lock.owned = 1;
2771         spin_unlock(&sk->sk_lock.slock);
2772         /*
2773          * The sk_lock has mutex_lock() semantics here:
2774          */
2775         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2776         local_bh_enable();
2777 }
2778 EXPORT_SYMBOL(lock_sock_nested);
2779
2780 void release_sock(struct sock *sk)
2781 {
2782         spin_lock_bh(&sk->sk_lock.slock);
2783         if (sk->sk_backlog.tail)
2784                 __release_sock(sk);
2785
2786         /* Warning : release_cb() might need to release sk ownership,
2787          * ie call sock_release_ownership(sk) before us.
2788          */
2789         if (sk->sk_prot->release_cb)
2790                 sk->sk_prot->release_cb(sk);
2791
2792         sock_release_ownership(sk);
2793         if (waitqueue_active(&sk->sk_lock.wq))
2794                 wake_up(&sk->sk_lock.wq);
2795         spin_unlock_bh(&sk->sk_lock.slock);
2796 }
2797 EXPORT_SYMBOL(release_sock);
2798
2799 /**
2800  * lock_sock_fast - fast version of lock_sock
2801  * @sk: socket
2802  *
2803  * This version should be used for very small section, where process wont block
2804  * return false if fast path is taken:
2805  *
2806  *   sk_lock.slock locked, owned = 0, BH disabled
2807  *
2808  * return true if slow path is taken:
2809  *
2810  *   sk_lock.slock unlocked, owned = 1, BH enabled
2811  */
2812 bool lock_sock_fast(struct sock *sk)
2813 {
2814         might_sleep();
2815         spin_lock_bh(&sk->sk_lock.slock);
2816
2817         if (!sk->sk_lock.owned)
2818                 /*
2819                  * Note : We must disable BH
2820                  */
2821                 return false;
2822
2823         __lock_sock(sk);
2824         sk->sk_lock.owned = 1;
2825         spin_unlock(&sk->sk_lock.slock);
2826         /*
2827          * The sk_lock has mutex_lock() semantics here:
2828          */
2829         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2830         local_bh_enable();
2831         return true;
2832 }
2833 EXPORT_SYMBOL(lock_sock_fast);
2834
2835 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2836 {
2837         struct timeval tv;
2838         if (!sock_flag(sk, SOCK_TIMESTAMP))
2839                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2840         tv = ktime_to_timeval(sk->sk_stamp);
2841         if (tv.tv_sec == -1)
2842                 return -ENOENT;
2843         if (tv.tv_sec == 0) {
2844                 sk->sk_stamp = ktime_get_real();
2845                 tv = ktime_to_timeval(sk->sk_stamp);
2846         }
2847         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2848 }
2849 EXPORT_SYMBOL(sock_get_timestamp);
2850
2851 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2852 {
2853         struct timespec ts;
2854         if (!sock_flag(sk, SOCK_TIMESTAMP))
2855                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2856         ts = ktime_to_timespec(sk->sk_stamp);
2857         if (ts.tv_sec == -1)
2858                 return -ENOENT;
2859         if (ts.tv_sec == 0) {
2860                 sk->sk_stamp = ktime_get_real();
2861                 ts = ktime_to_timespec(sk->sk_stamp);
2862         }
2863         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2864 }
2865 EXPORT_SYMBOL(sock_get_timestampns);
2866
2867 void sock_enable_timestamp(struct sock *sk, int flag)
2868 {
2869         if (!sock_flag(sk, flag)) {
2870                 unsigned long previous_flags = sk->sk_flags;
2871
2872                 sock_set_flag(sk, flag);
2873                 /*
2874                  * we just set one of the two flags which require net
2875                  * time stamping, but time stamping might have been on
2876                  * already because of the other one
2877                  */
2878                 if (sock_needs_netstamp(sk) &&
2879                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2880                         net_enable_timestamp();
2881         }
2882 }
2883
2884 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2885                        int level, int type)
2886 {
2887         struct sock_exterr_skb *serr;
2888         struct sk_buff *skb;
2889         int copied, err;
2890
2891         err = -EAGAIN;
2892         skb = sock_dequeue_err_skb(sk);
2893         if (skb == NULL)
2894                 goto out;
2895
2896         copied = skb->len;
2897         if (copied > len) {
2898                 msg->msg_flags |= MSG_TRUNC;
2899                 copied = len;
2900         }
2901         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2902         if (err)
2903                 goto out_free_skb;
2904
2905         sock_recv_timestamp(msg, sk, skb);
2906
2907         serr = SKB_EXT_ERR(skb);
2908         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2909
2910         msg->msg_flags |= MSG_ERRQUEUE;
2911         err = copied;
2912
2913 out_free_skb:
2914         kfree_skb(skb);
2915 out:
2916         return err;
2917 }
2918 EXPORT_SYMBOL(sock_recv_errqueue);
2919
2920 /*
2921  *      Get a socket option on an socket.
2922  *
2923  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2924  *      asynchronous errors should be reported by getsockopt. We assume
2925  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2926  */
2927 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2928                            char __user *optval, int __user *optlen)
2929 {
2930         struct sock *sk = sock->sk;
2931
2932         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2933 }
2934 EXPORT_SYMBOL(sock_common_getsockopt);
2935
2936 #ifdef CONFIG_COMPAT
2937 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2938                                   char __user *optval, int __user *optlen)
2939 {
2940         struct sock *sk = sock->sk;
2941
2942         if (sk->sk_prot->compat_getsockopt != NULL)
2943                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2944                                                       optval, optlen);
2945         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2946 }
2947 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2948 #endif
2949
2950 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2951                         int flags)
2952 {
2953         struct sock *sk = sock->sk;
2954         int addr_len = 0;
2955         int err;
2956
2957         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2958                                    flags & ~MSG_DONTWAIT, &addr_len);
2959         if (err >= 0)
2960                 msg->msg_namelen = addr_len;
2961         return err;
2962 }
2963 EXPORT_SYMBOL(sock_common_recvmsg);
2964
2965 /*
2966  *      Set socket options on an inet socket.
2967  */
2968 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2969                            char __user *optval, unsigned int optlen)
2970 {
2971         struct sock *sk = sock->sk;
2972
2973         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2974 }
2975 EXPORT_SYMBOL(sock_common_setsockopt);
2976
2977 #ifdef CONFIG_COMPAT
2978 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2979                                   char __user *optval, unsigned int optlen)
2980 {
2981         struct sock *sk = sock->sk;
2982
2983         if (sk->sk_prot->compat_setsockopt != NULL)
2984                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2985                                                       optval, optlen);
2986         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2987 }
2988 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2989 #endif
2990
2991 void sk_common_release(struct sock *sk)
2992 {
2993         if (sk->sk_prot->destroy)
2994                 sk->sk_prot->destroy(sk);
2995
2996         /*
2997          * Observation: when sock_common_release is called, processes have
2998          * no access to socket. But net still has.
2999          * Step one, detach it from networking:
3000          *
3001          * A. Remove from hash tables.
3002          */
3003
3004         sk->sk_prot->unhash(sk);
3005
3006         /*
3007          * In this point socket cannot receive new packets, but it is possible
3008          * that some packets are in flight because some CPU runs receiver and
3009          * did hash table lookup before we unhashed socket. They will achieve
3010          * receive queue and will be purged by socket destructor.
3011          *
3012          * Also we still have packets pending on receive queue and probably,
3013          * our own packets waiting in device queues. sock_destroy will drain
3014          * receive queue, but transmitted packets will delay socket destruction
3015          * until the last reference will be released.
3016          */
3017
3018         sock_orphan(sk);
3019
3020         xfrm_sk_free_policy(sk);
3021
3022         sk_refcnt_debug_release(sk);
3023
3024         sock_put(sk);
3025 }
3026 EXPORT_SYMBOL(sk_common_release);
3027
3028 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3029 {
3030         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3031
3032         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3033         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3034         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3035         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3036         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3037         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3038         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3039         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3040         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3041 }
3042
3043 #ifdef CONFIG_PROC_FS
3044 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
3045 struct prot_inuse {
3046         int val[PROTO_INUSE_NR];
3047 };
3048
3049 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3050
3051 #ifdef CONFIG_NET_NS
3052 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3053 {
3054         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
3055 }
3056 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3057
3058 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3059 {
3060         int cpu, idx = prot->inuse_idx;
3061         int res = 0;
3062
3063         for_each_possible_cpu(cpu)
3064                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
3065
3066         return res >= 0 ? res : 0;
3067 }
3068 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3069
3070 static int __net_init sock_inuse_init_net(struct net *net)
3071 {
3072         net->core.inuse = alloc_percpu(struct prot_inuse);
3073         return net->core.inuse ? 0 : -ENOMEM;
3074 }
3075
3076 static void __net_exit sock_inuse_exit_net(struct net *net)
3077 {
3078         free_percpu(net->core.inuse);
3079 }
3080
3081 static struct pernet_operations net_inuse_ops = {
3082         .init = sock_inuse_init_net,
3083         .exit = sock_inuse_exit_net,
3084 };
3085
3086 static __init int net_inuse_init(void)
3087 {
3088         if (register_pernet_subsys(&net_inuse_ops))
3089                 panic("Cannot initialize net inuse counters");
3090
3091         return 0;
3092 }
3093
3094 core_initcall(net_inuse_init);
3095 #else
3096 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
3097
3098 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3099 {
3100         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
3101 }
3102 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3103
3104 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3105 {
3106         int cpu, idx = prot->inuse_idx;
3107         int res = 0;
3108
3109         for_each_possible_cpu(cpu)
3110                 res += per_cpu(prot_inuse, cpu).val[idx];
3111
3112         return res >= 0 ? res : 0;
3113 }
3114 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3115 #endif
3116
3117 static void assign_proto_idx(struct proto *prot)
3118 {
3119         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3120
3121         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3122                 pr_err("PROTO_INUSE_NR exhausted\n");
3123                 return;
3124         }
3125
3126         set_bit(prot->inuse_idx, proto_inuse_idx);
3127 }
3128
3129 static void release_proto_idx(struct proto *prot)
3130 {
3131         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3132                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3133 }
3134 #else
/* No in-use accounting without CONFIG_PROC_FS: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
3138
/* No in-use accounting without CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3142 #endif
3143
3144 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3145 {
3146         if (!rsk_prot)
3147                 return;
3148         kfree(rsk_prot->slab_name);
3149         rsk_prot->slab_name = NULL;
3150         kmem_cache_destroy(rsk_prot->slab);
3151         rsk_prot->slab = NULL;
3152 }
3153
3154 static int req_prot_init(const struct proto *prot)
3155 {
3156         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3157
3158         if (!rsk_prot)
3159                 return 0;
3160
3161         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3162                                         prot->name);
3163         if (!rsk_prot->slab_name)
3164                 return -ENOMEM;
3165
3166         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3167                                            rsk_prot->obj_size, 0,
3168                                            prot->slab_flags, NULL);
3169
3170         if (!rsk_prot->slab) {
3171                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3172                         prot->name);
3173                 return -ENOMEM;
3174         }
3175         return 0;
3176 }
3177
3178 int proto_register(struct proto *prot, int alloc_slab)
3179 {
3180         if (alloc_slab) {
3181                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3182                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
3183                                         NULL);
3184
3185                 if (prot->slab == NULL) {
3186                         pr_crit("%s: Can't create sock SLAB cache!\n",
3187                                 prot->name);
3188                         goto out;
3189                 }
3190
3191                 if (req_prot_init(prot))
3192                         goto out_free_request_sock_slab;
3193
3194                 if (prot->twsk_prot != NULL) {
3195                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3196
3197                         if (prot->twsk_prot->twsk_slab_name == NULL)
3198                                 goto out_free_request_sock_slab;
3199
3200                         prot->twsk_prot->twsk_slab =
3201                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3202                                                   prot->twsk_prot->twsk_obj_size,
3203                                                   0,
3204                                                   prot->slab_flags,
3205                                                   NULL);
3206                         if (prot->twsk_prot->twsk_slab == NULL)
3207                                 goto out_free_timewait_sock_slab_name;
3208                 }
3209         }
3210
3211         mutex_lock(&proto_list_mutex);
3212         list_add(&prot->node, &proto_list);
3213         assign_proto_idx(prot);
3214         mutex_unlock(&proto_list_mutex);
3215         return 0;
3216
3217 out_free_timewait_sock_slab_name:
3218         kfree(prot->twsk_prot->twsk_slab_name);
3219 out_free_request_sock_slab:
3220         req_prot_cleanup(prot->rsk_prot);
3221
3222         kmem_cache_destroy(prot->slab);
3223         prot->slab = NULL;
3224 out:
3225         return -ENOBUFS;
3226 }
3227 EXPORT_SYMBOL(proto_register);
3228
3229 void proto_unregister(struct proto *prot)
3230 {
3231         mutex_lock(&proto_list_mutex);
3232         release_proto_idx(prot);
3233         list_del(&prot->node);
3234         mutex_unlock(&proto_list_mutex);
3235
3236         kmem_cache_destroy(prot->slab);
3237         prot->slab = NULL;
3238
3239         req_prot_cleanup(prot->rsk_prot);
3240
3241         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3242                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3243                 kfree(prot->twsk_prot->twsk_slab_name);
3244                 prot->twsk_prot->twsk_slab = NULL;
3245         }
3246 }
3247 EXPORT_SYMBOL(proto_unregister);
3248
3249 #ifdef CONFIG_PROC_FS
3250 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3251         __acquires(proto_list_mutex)
3252 {
3253         mutex_lock(&proto_list_mutex);
3254         return seq_list_start_head(&proto_list, *pos);
3255 }
3256
3257 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3258 {
3259         return seq_list_next(v, &proto_list, pos);
3260 }
3261
3262 static void proto_seq_stop(struct seq_file *seq, void *v)
3263         __releases(proto_list_mutex)
3264 {
3265         mutex_unlock(&proto_list_mutex);
3266 }
3267
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method == NULL)
		return 'n';

	return 'y';
}
3272 static long sock_prot_memory_allocated(struct proto *proto)
3273 {
3274         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3275 }
3276
3277 static char *sock_prot_memory_pressure(struct proto *proto)
3278 {
3279         return proto->memory_pressure != NULL ?
3280         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3281 }
3282
3283 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3284 {
3285
3286         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3287                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3288                    proto->name,
3289                    proto->obj_size,
3290                    sock_prot_inuse_get(seq_file_net(seq), proto),
3291                    sock_prot_memory_allocated(proto),
3292                    sock_prot_memory_pressure(proto),
3293                    proto->max_header,
3294                    proto->slab == NULL ? "no" : "yes",
3295                    module_name(proto->owner),
3296                    proto_method_implemented(proto->close),
3297                    proto_method_implemented(proto->connect),
3298                    proto_method_implemented(proto->disconnect),
3299                    proto_method_implemented(proto->accept),
3300                    proto_method_implemented(proto->ioctl),
3301                    proto_method_implemented(proto->init),
3302                    proto_method_implemented(proto->destroy),
3303                    proto_method_implemented(proto->shutdown),
3304                    proto_method_implemented(proto->setsockopt),
3305                    proto_method_implemented(proto->getsockopt),
3306                    proto_method_implemented(proto->sendmsg),
3307                    proto_method_implemented(proto->recvmsg),
3308                    proto_method_implemented(proto->sendpage),
3309                    proto_method_implemented(proto->bind),
3310                    proto_method_implemented(proto->backlog_rcv),
3311                    proto_method_implemented(proto->hash),
3312                    proto_method_implemented(proto->unhash),
3313                    proto_method_implemented(proto->get_port),
3314                    proto_method_implemented(proto->enter_memory_pressure));
3315 }
3316
3317 static int proto_seq_show(struct seq_file *seq, void *v)
3318 {
3319         if (v == &proto_list)
3320                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3321                            "protocol",
3322                            "size",
3323                            "sockets",
3324                            "memory",
3325                            "press",
3326                            "maxhdr",
3327                            "slab",
3328                            "module",
3329                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3330         else
3331                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3332         return 0;
3333 }
3334
3335 static const struct seq_operations proto_seq_ops = {
3336         .start  = proto_seq_start,
3337         .next   = proto_seq_next,
3338         .stop   = proto_seq_stop,
3339         .show   = proto_seq_show,
3340 };
3341
3342 static int proto_seq_open(struct inode *inode, struct file *file)
3343 {
3344         return seq_open_net(inode, file, &proto_seq_ops,
3345                             sizeof(struct seq_net_private));
3346 }
3347
3348 static const struct file_operations proto_seq_fops = {
3349         .owner          = THIS_MODULE,
3350         .open           = proto_seq_open,
3351         .read           = seq_read,
3352         .llseek         = seq_lseek,
3353         .release        = seq_release_net,
3354 };
3355
3356 static __net_init int proto_init_net(struct net *net)
3357 {
3358         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3359                 return -ENOMEM;
3360
3361         return 0;
3362 }
3363
3364 static __net_exit void proto_exit_net(struct net *net)
3365 {
3366         remove_proc_entry("protocols", net->proc_net);
3367 }
3368
3369
3370 static __net_initdata struct pernet_operations proto_net_ops = {
3371         .init = proto_init_net,
3372         .exit = proto_exit_net,
3373 };
3374
3375 static int __init proto_init(void)
3376 {
3377         return register_pernet_subsys(&proto_net_ops);
3378 }
3379
3380 subsys_initcall(proto_init);
3381
3382 #endif /* PROC_FS */
3383
3384 #ifdef CONFIG_NET_RX_BUSY_POLL
3385 bool sk_busy_loop_end(void *p, unsigned long start_time)
3386 {
3387         struct sock *sk = p;
3388
3389         return !skb_queue_empty(&sk->sk_receive_queue) ||
3390                sk_busy_loop_timeout(sk, start_time);
3391 }
3392 EXPORT_SYMBOL(sk_busy_loop_end);
3393 #endif /* CONFIG_NET_RX_BUSY_POLL */