net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <[email protected]>
  12  *              Florian La Roche, <[email protected]>
  13  *              Alan Cox, <[email protected]>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/sched/mm.h>
 106 #include <linux/timer.h>
 107 #include <linux/string.h>
 108 #include <linux/sockios.h>
 109 #include <linux/net.h>
 110 #include <linux/mm.h>
 111 #include <linux/slab.h>
 112 #include <linux/interrupt.h>
 113 #include <linux/poll.h>
 114 #include <linux/tcp.h>
 115 #include <linux/init.h>
 116 #include <linux/highmem.h>
 117 #include <linux/user_namespace.h>
 118 #include <linux/static_key.h>
 119 #include <linux/memcontrol.h>
 120 #include <linux/prefetch.h>
 121
 122 #include <linux/uaccess.h>
 123
 124 #include <linux/netdevice.h>
 125 #include <net/protocol.h>
 126 #include <linux/skbuff.h>
 127 #include <net/net_namespace.h>
 128 #include <net/request_sock.h>
 129 #include <net/sock.h>
 130 #include <linux/net_tstamp.h>
 131 #include <net/xfrm.h>
 132 #include <linux/ipsec.h>
 133 #include <net/cls_cgroup.h>
 134 #include <net/netprio_cgroup.h>
 135 #include <linux/sock_diag.h>
 136
 137 #include <linux/filter.h>
 138 #include <net/sock_reuseport.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144
 145 static DEFINE_MUTEX(proto_list_mutex);
 146 static LIST_HEAD(proto_list);
 147
 148 static void sock_inuse_add(struct net *net, int val);
 149
 150 /**
 151  * sk_ns_capable - General socket capability test
 152  * @sk: Socket to use a capability on or through
 153  * @user_ns: The user namespace of the capability to use
 154  * @cap: The capability to use
 155  *
 156  * Test to see if the opener of the socket had when the socket was
 157  * created and the current process has the capability @cap in the user
 158  * namespace @user_ns.
 159  */
 160 bool sk_ns_capable(const struct sock *sk,
 161                    struct user_namespace *user_ns, int cap)
 162 {
 163         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                 ns_capable(user_ns, cap);
 165 }
 166 EXPORT_SYMBOL(sk_ns_capable);
 167
 168 /**
 169  * sk_capable - Socket global capability test
 170  * @sk: Socket to use a capability on or through
 171  * @cap: The global capability to use
 172  *
 173  * Test to see if the opener of the socket had when the socket was
 174  * created and the current process has the capability @cap in all user
 175  * namespaces.
 176  */
 177 bool sk_capable(const struct sock *sk, int cap)
 178 {
 179         return sk_ns_capable(sk, &init_user_ns, cap);
 180 }
 181 EXPORT_SYMBOL(sk_capable);
 182
 183 /**
 184  * sk_net_capable - Network namespace socket capability test
 185  * @sk: Socket to use a capability on or through
 186  * @cap: The capability to use
 187  *
 188  * Test to see if the opener of the socket had when the socket was created
 189  * and the current process has the capability @cap over the network namespace
 190  * the socket is a member of.
 191  */
 192 bool sk_net_capable(const struct sock *sk, int cap)
 193 {
 194         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195 }
 196 EXPORT_SYMBOL(sk_net_capable);
 197
 198 /*
 199  * Each address family might have different locking rules, so we have
 200  * one slock key per address family and separate keys for internal and
 201  * userspace sockets.
 202  */
 203 static struct lock_class_key af_family_keys[AF_MAX];
 204 static struct lock_class_key af_family_kern_keys[AF_MAX];
 205 static struct lock_class_key af_family_slock_keys[AF_MAX];
 206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 207
 208 /*
 209  * Make lock validator output more readable. (we pre-construct these
 210  * strings build-time, so that runtime initialization of socket
 211  * locks is fast):
 212  */
 213
 214 #define _sock_locks(x)                                            \
 215   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 216   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 217   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 218   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 219   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 220   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 221   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 222   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 223   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 224   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 225   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 226   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 227   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 228   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 229   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_MAX"
 230
 231 static const char *const af_family_key_strings[AF_MAX+1] = {
 232         _sock_locks("sk_lock-")
 233 };
 234 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 235         _sock_locks("slock-")
 236 };
 237 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 238         _sock_locks("clock-")
 239 };
 240
 241 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 242         _sock_locks("k-sk_lock-")
 243 };
 244 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 245         _sock_locks("k-slock-")
 246 };
 247 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 248         _sock_locks("k-clock-")
 249 };
 250 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 251   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 252   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 253   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 254   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 255   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 256   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 257   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 258   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 259   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 260   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 261   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 262   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 263   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 264   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 265   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 266 };
 267 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 268   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 269   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 270   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 271   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 272   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 273   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 274   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 275   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 276   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 277   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 278   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 279   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 280   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 281   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 282   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 283 };
 284 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 285   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 286   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 287   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 288   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 289   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 290   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 291   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 292   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 293   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 294   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 295   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 296   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 297   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 298   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 299   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 300 };
 301
 302 /*
 303  * sk_callback_lock and sk queues locking rules are per-address-family,
 304  * so split the lock classes by using a per-AF key:
 305  */
 306 static struct lock_class_key af_callback_keys[AF_MAX];
 307 static struct lock_class_key af_rlock_keys[AF_MAX];
 308 static struct lock_class_key af_wlock_keys[AF_MAX];
 309 static struct lock_class_key af_elock_keys[AF_MAX];
 310 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 311
 312 /* Run time adjustable parameters. */
 313 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 314 EXPORT_SYMBOL(sysctl_wmem_max);
 315 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 316 EXPORT_SYMBOL(sysctl_rmem_max);
 317 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 318 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 319
 320 /* Maximal space eaten by iovec or ancillary data plus some space */
 321 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 322 EXPORT_SYMBOL(sysctl_optmem_max);
 323
 324 int sysctl_tstamp_allow_data __read_mostly = 1;
 325
 326 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 327 EXPORT_SYMBOL_GPL(memalloc_socks);
 328
 329 /**
 330  * sk_set_memalloc - sets %SOCK_MEMALLOC
 331  * @sk: socket to set it on
 332  *
 333  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 334  * It's the responsibility of the admin to adjust min_free_kbytes
 335  * to meet the requirements
 336  */
 337 void sk_set_memalloc(struct sock *sk)
 338 {
 339         sock_set_flag(sk, SOCK_MEMALLOC);
 340         sk->sk_allocation |= __GFP_MEMALLOC;
 341         static_key_slow_inc(&memalloc_socks);
 342 }
 343 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 344
 345 void sk_clear_memalloc(struct sock *sk)
 346 {
 347         sock_reset_flag(sk, SOCK_MEMALLOC);
 348         sk->sk_allocation &= ~__GFP_MEMALLOC;
 349         static_key_slow_dec(&memalloc_socks);
 350
 351         /*
 352          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 353          * progress of swapping. SOCK_MEMALLOC may be cleared while
 354          * it has rmem allocations due to the last swapfile being deactivated
 355          * but there is a risk that the socket is unusable due to exceeding
 356          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 357          */
 358         sk_mem_reclaim(sk);
 359 }
 360 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 361
 362 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 363 {
 364         int ret;
 365         unsigned int noreclaim_flag;
 366
 367         /* these should have been dropped before queueing */
 368         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 369
 370         noreclaim_flag = memalloc_noreclaim_save();
 371         ret = sk->sk_backlog_rcv(sk, skb);
 372         memalloc_noreclaim_restore(noreclaim_flag);
 373
 374         return ret;
 375 }
 376 EXPORT_SYMBOL(__sk_backlog_rcv);
 377
 378 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 379 {
 380         struct timeval tv;
 381
 382         if (optlen < sizeof(tv))
 383                 return -EINVAL;
 384         if (copy_from_user(&tv, optval, sizeof(tv)))
 385                 return -EFAULT;
 386         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 387                 return -EDOM;
 388
 389         if (tv.tv_sec < 0) {
 390                 static int warned __read_mostly;
 391
 392                 *timeo_p = 0;
 393                 if (warned < 10 && net_ratelimit()) {
 394                         warned++;
 395                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 396                                 __func__, current->comm, task_pid_nr(current));
 397                 }
 398                 return 0;
 399         }
 400         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 401         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 402                 return 0;
 403         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 404                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 405         return 0;
 406 }
 407
 408 static void sock_warn_obsolete_bsdism(const char *name)
 409 {
 410         static int warned;
 411         static char warncomm[TASK_COMM_LEN];
 412         if (strcmp(warncomm, current->comm) && warned < 5) {
 413                 strcpy(warncomm,  current->comm);
 414                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 415                         warncomm, name);
 416                 warned++;
 417         }
 418 }
 419
 420 static bool sock_needs_netstamp(const struct sock *sk)
 421 {
 422         switch (sk->sk_family) {
 423         case AF_UNSPEC:
 424         case AF_UNIX:
 425                 return false;
 426         default:
 427                 return true;
 428         }
 429 }
 430
 431 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 432 {
 433         if (sk->sk_flags & flags) {
 434                 sk->sk_flags &= ~flags;
 435                 if (sock_needs_netstamp(sk) &&
 436                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 437                         net_disable_timestamp();
 438         }
 439 }
 440
 441
 442 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 443 {
 444         unsigned long flags;
 445         struct sk_buff_head *list = &sk->sk_receive_queue;
 446
 447         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 448                 atomic_inc(&sk->sk_drops);
 449                 trace_sock_rcvqueue_full(sk, skb);
 450                 return -ENOMEM;
 451         }
 452
 453         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 454                 atomic_inc(&sk->sk_drops);
 455                 return -ENOBUFS;
 456         }
 457
 458         skb->dev = NULL;
 459         skb_set_owner_r(skb, sk);
 460
 461         /* we escape from rcu protected region, make sure we dont leak
 462          * a norefcounted dst
 463          */
 464         skb_dst_force(skb);
 465
 466         spin_lock_irqsave(&list->lock, flags);
 467         sock_skb_set_dropcount(sk, skb);
 468         __skb_queue_tail(list, skb);
 469         spin_unlock_irqrestore(&list->lock, flags);
 470
 471         if (!sock_flag(sk, SOCK_DEAD))
 472                 sk->sk_data_ready(sk);
 473         return 0;
 474 }
 475 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 476
 477 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 478 {
 479         int err;
 480
 481         err = sk_filter(sk, skb);
 482         if (err)
 483                 return err;
 484
 485         return __sock_queue_rcv_skb(sk, skb);
 486 }
 487 EXPORT_SYMBOL(sock_queue_rcv_skb);
 488
 489 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 490                      const int nested, unsigned int trim_cap, bool refcounted)
 491 {
 492         int rc = NET_RX_SUCCESS;
 493
 494         if (sk_filter_trim_cap(sk, skb, trim_cap))
 495                 goto discard_and_relse;
 496
 497         skb->dev = NULL;
 498
 499         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 500                 atomic_inc(&sk->sk_drops);
 501                 goto discard_and_relse;
 502         }
 503         if (nested)
 504                 bh_lock_sock_nested(sk);
 505         else
 506                 bh_lock_sock(sk);
 507         if (!sock_owned_by_user(sk)) {
 508                 /*
 509                  * trylock + unlock semantics:
 510                  */
 511                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 512
 513                 rc = sk_backlog_rcv(sk, skb);
 514
 515                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 516         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 517                 bh_unlock_sock(sk);
 518                 atomic_inc(&sk->sk_drops);
 519                 goto discard_and_relse;
 520         }
 521
 522         bh_unlock_sock(sk);
 523 out:
 524         if (refcounted)
 525                 sock_put(sk);
 526         return rc;
 527 discard_and_relse:
 528         kfree_skb(skb);
 529         goto out;
 530 }
 531 EXPORT_SYMBOL(__sk_receive_skb);
 532
 533 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 534 {
 535         struct dst_entry *dst = __sk_dst_get(sk);
 536
 537         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 538                 sk_tx_queue_clear(sk);
 539                 sk->sk_dst_pending_confirm = 0;
 540                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 541                 dst_release(dst);
 542                 return NULL;
 543         }
 544
 545         return dst;
 546 }
 547 EXPORT_SYMBOL(__sk_dst_check);
 548
 549 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 550 {
 551         struct dst_entry *dst = sk_dst_get(sk);
 552
 553         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 554                 sk_dst_reset(sk);
 555                 dst_release(dst);
 556                 return NULL;
 557         }
 558
 559         return dst;
 560 }
 561 EXPORT_SYMBOL(sk_dst_check);
 562
 563 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 564                                 int optlen)
 565 {
 566         int ret = -ENOPROTOOPT;
 567 #ifdef CONFIG_NETDEVICES
 568         struct net *net = sock_net(sk);
 569         char devname[IFNAMSIZ];
 570         int index;
 571
 572         /* Sorry... */
 573         ret = -EPERM;
 574         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 575                 goto out;
 576
 577         ret = -EINVAL;
 578         if (optlen < 0)
 579                 goto out;
 580
 581         /* Bind this socket to a particular device like "eth0",
 582          * as specified in the passed interface name. If the
 583          * name is "" or the option length is zero the socket
 584          * is not bound.
 585          */
 586         if (optlen > IFNAMSIZ - 1)
 587                 optlen = IFNAMSIZ - 1;
 588         memset(devname, 0, sizeof(devname));
 589
 590         ret = -EFAULT;
 591         if (copy_from_user(devname, optval, optlen))
 592                 goto out;
 593
 594         index = 0;
 595         if (devname[0] != '\0') {
 596                 struct net_device *dev;
 597
 598                 rcu_read_lock();
 599                 dev = dev_get_by_name_rcu(net, devname);
 600                 if (dev)
 601                         index = dev->ifindex;
 602                 rcu_read_unlock();
 603                 ret = -ENODEV;
 604                 if (!dev)
 605                         goto out;
 606         }
 607
 608         lock_sock(sk);
 609         sk->sk_bound_dev_if = index;
 610         sk_dst_reset(sk);
 611         release_sock(sk);
 612
 613         ret = 0;
 614
 615 out:
 616 #endif
 617
 618         return ret;
 619 }
 620
 621 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 622                                 int __user *optlen, int len)
 623 {
 624         int ret = -ENOPROTOOPT;
 625 #ifdef CONFIG_NETDEVICES
 626         struct net *net = sock_net(sk);
 627         char devname[IFNAMSIZ];
 628
 629         if (sk->sk_bound_dev_if == 0) {
 630                 len = 0;
 631                 goto zero;
 632         }
 633
 634         ret = -EINVAL;
 635         if (len < IFNAMSIZ)
 636                 goto out;
 637
 638         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 639         if (ret)
 640                 goto out;
 641
 642         len = strlen(devname) + 1;
 643
 644         ret = -EFAULT;
 645         if (copy_to_user(optval, devname, len))
 646                 goto out;
 647
 648 zero:
 649         ret = -EFAULT;
 650         if (put_user(len, optlen))
 651                 goto out;
 652
 653         ret = 0;
 654
 655 out:
 656 #endif
 657
 658         return ret;
 659 }
 660
 661 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 662 {
 663         if (valbool)
 664                 sock_set_flag(sk, bit);
 665         else
 666                 sock_reset_flag(sk, bit);
 667 }
 668
 669 bool sk_mc_loop(struct sock *sk)
 670 {
 671         if (dev_recursion_level())
 672                 return false;
 673         if (!sk)
 674                 return true;
 675         switch (sk->sk_family) {
 676         case AF_INET:
 677                 return inet_sk(sk)->mc_loop;
 678 #if IS_ENABLED(CONFIG_IPV6)
 679         case AF_INET6:
 680                 return inet6_sk(sk)->mc_loop;
 681 #endif
 682         }
 683         WARN_ON(1);
 684         return true;
 685 }
 686 EXPORT_SYMBOL(sk_mc_loop);
 687
 688 /*
 689  *      This is meant for all protocols to use and covers goings on
 690  *      at the socket level. Everything here is generic.
 691  */
 692
 693 int sock_setsockopt(struct socket *sock, int level, int optname,
 694                     char __user *optval, unsigned int optlen)
 695 {
 696         struct sock *sk = sock->sk;
 697         int val;
 698         int valbool;
 699         struct linger ling;
 700         int ret = 0;
 701
 702         /*
 703          *      Options without arguments
 704          */
 705
 706         if (optname == SO_BINDTODEVICE)
 707                 return sock_setbindtodevice(sk, optval, optlen);
 708
 709         if (optlen < sizeof(int))
 710                 return -EINVAL;
 711
 712         if (get_user(val, (int __user *)optval))
 713                 return -EFAULT;
 714
 715         valbool = val ? 1 : 0;
 716
 717         lock_sock(sk);
 718
 719         switch (optname) {
 720         case SO_DEBUG:
 721                 if (val && !capable(CAP_NET_ADMIN))
 722                         ret = -EACCES;
 723                 else
 724                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 725                 break;
 726         case SO_REUSEADDR:
 727                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 728                 break;
 729         case SO_REUSEPORT:
 730                 sk->sk_reuseport = valbool;
 731                 break;
 732         case SO_TYPE:
 733         case SO_PROTOCOL:
 734         case SO_DOMAIN:
 735         case SO_ERROR:
 736                 ret = -ENOPROTOOPT;
 737                 break;
 738         case SO_DONTROUTE:
 739                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 740                 break;
 741         case SO_BROADCAST:
 742                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 743                 break;
 744         case SO_SNDBUF:
 745                 /* Don't error on this BSD doesn't and if you think
 746                  * about it this is right. Otherwise apps have to
 747                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 748                  * are treated in BSD as hints
 749                  */
 750                 val = min_t(u32, val, sysctl_wmem_max);
 751 set_sndbuf:
 752                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 753                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 754                 /* Wake up sending tasks if we upped the value. */
 755                 sk->sk_write_space(sk);
 756                 break;
 757
 758         case SO_SNDBUFFORCE:
 759                 if (!capable(CAP_NET_ADMIN)) {
 760                         ret = -EPERM;
 761                         break;
 762                 }
 763                 goto set_sndbuf;
 764
 765         case SO_RCVBUF:
 766                 /* Don't error on this BSD doesn't and if you think
 767                  * about it this is right. Otherwise apps have to
 768                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 769                  * are treated in BSD as hints
 770                  */
 771                 val = min_t(u32, val, sysctl_rmem_max);
 772 set_rcvbuf:
 773                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 774                 /*
 775                  * We double it on the way in to account for
 776                  * "struct sk_buff" etc. overhead.   Applications
 777                  * assume that the SO_RCVBUF setting they make will
 778                  * allow that much actual data to be received on that
 779                  * socket.
 780                  *
 781                  * Applications are unaware that "struct sk_buff" and
 782                  * other overheads allocate from the receive buffer
 783                  * during socket buffer allocation.
 784                  *
 785                  * And after considering the possible alternatives,
 786                  * returning the value we actually used in getsockopt
 787                  * is the most desirable behavior.
 788                  */
 789                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 790                 break;
 791
 792         case SO_RCVBUFFORCE:
 793                 if (!capable(CAP_NET_ADMIN)) {
 794                         ret = -EPERM;
 795                         break;
 796                 }
 797                 goto set_rcvbuf;
 798
 799         case SO_KEEPALIVE:
 800                 if (sk->sk_prot->keepalive)
 801                         sk->sk_prot->keepalive(sk, valbool);
 802                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 803                 break;
 804
 805         case SO_OOBINLINE:
 806                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 807                 break;
 808
 809         case SO_NO_CHECK:
 810                 sk->sk_no_check_tx = valbool;
 811                 break;
 812
 813         case SO_PRIORITY:
 814                 if ((val >= 0 && val <= 6) ||
 815                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 816                         sk->sk_priority = val;
 817                 else
 818                         ret = -EPERM;
 819                 break;
 820
 821         case SO_LINGER:
 822                 if (optlen < sizeof(ling)) {
 823                         ret = -EINVAL;  /* 1003.1g */
 824                         break;
 825                 }
 826                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 827                         ret = -EFAULT;
 828                         break;
 829                 }
 830                 if (!ling.l_onoff)
 831                         sock_reset_flag(sk, SOCK_LINGER);
 832                 else {
 833 #if (BITS_PER_LONG == 32)
 834                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 835                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 836                         else
 837 #endif
 838                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 839                         sock_set_flag(sk, SOCK_LINGER);
 840                 }
 841                 break;
 842
 843         case SO_BSDCOMPAT:
 844                 sock_warn_obsolete_bsdism("setsockopt");
 845                 break;
 846
 847         case SO_PASSCRED:
 848                 if (valbool)
 849                         set_bit(SOCK_PASSCRED, &sock->flags);
 850                 else
 851                         clear_bit(SOCK_PASSCRED, &sock->flags);
 852                 break;
 853
 854         case SO_TIMESTAMP:
 855         case SO_TIMESTAMPNS:
 856                 if (valbool)  {
 857                         if (optname == SO_TIMESTAMP)
 858                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 859                         else
 860                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 861                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 862                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 863                 } else {
 864                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 865                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 866                 }
 867                 break;
 868
 869         case SO_TIMESTAMPING:
 870                 if (val & ~SOF_TIMESTAMPING_MASK) {
 871                         ret = -EINVAL;
 872                         break;
 873                 }
 874
 875                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 876                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 877                         if (sk->sk_protocol == IPPROTO_TCP &&
 878                             sk->sk_type == SOCK_STREAM) {
 879                                 if ((1 << sk->sk_state) &
 880                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 881                                         ret = -EINVAL;
 882                                         break;
 883                                 }
 884                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 885                         } else {
 886                                 sk->sk_tskey = 0;
 887                         }
 888                 }
 889
 890                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 891                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 892                         ret = -EINVAL;
 893                         break;
 894                 }
 895
 896                 sk->sk_tsflags = val;
 897                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 898                         sock_enable_timestamp(sk,
 899                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 900                 else
 901                         sock_disable_timestamp(sk,
 902                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 903                 break;
 904
 905         case SO_RCVLOWAT:
 906                 if (val < 0)
 907                         val = INT_MAX;
 908                 if (sock->ops->set_rcvlowat)
 909                         ret = sock->ops->set_rcvlowat(sk, val);
 910                 else
 911                         sk->sk_rcvlowat = val ? : 1;
 912                 break;
 913
 914         case SO_RCVTIMEO:
 915                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 916                 break;
 917
 918         case SO_SNDTIMEO:
 919                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 920                 break;
 921
 922         case SO_ATTACH_FILTER:
 923                 ret = -EINVAL;
 924                 if (optlen == sizeof(struct sock_fprog)) {
 925                         struct sock_fprog fprog;
 926
 927                         ret = -EFAULT;
 928                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 929                                 break;
 930
 931                         ret = sk_attach_filter(&fprog, sk);
 932                 }
 933                 break;
 934
 935         case SO_ATTACH_BPF:
 936                 ret = -EINVAL;
 937                 if (optlen == sizeof(u32)) {
 938                         u32 ufd;
 939
 940                         ret = -EFAULT;
 941                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 942                                 break;
 943
 944                         ret = sk_attach_bpf(ufd, sk);
 945                 }
 946                 break;
 947
 948         case SO_ATTACH_REUSEPORT_CBPF:
 949                 ret = -EINVAL;
 950                 if (optlen == sizeof(struct sock_fprog)) {
 951                         struct sock_fprog fprog;
 952
 953                         ret = -EFAULT;
 954                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 955                                 break;
 956
 957                         ret = sk_reuseport_attach_filter(&fprog, sk);
 958                 }
 959                 break;
 960
 961         case SO_ATTACH_REUSEPORT_EBPF:
 962                 ret = -EINVAL;
 963                 if (optlen == sizeof(u32)) {
 964                         u32 ufd;
 965
 966                         ret = -EFAULT;
 967                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 968                                 break;
 969
 970                         ret = sk_reuseport_attach_bpf(ufd, sk);
 971                 }
 972                 break;
 973
 974         case SO_DETACH_FILTER:
 975                 ret = sk_detach_filter(sk);
 976                 break;
 977
 978         case SO_LOCK_FILTER:
 979                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 980                         ret = -EPERM;
 981                 else
 982                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 983                 break;
 984
 985         case SO_PASSSEC:
 986                 if (valbool)
 987                         set_bit(SOCK_PASSSEC, &sock->flags);
 988                 else
 989                         clear_bit(SOCK_PASSSEC, &sock->flags);
 990                 break;
 991         case SO_MARK:
 992                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 993                         ret = -EPERM;
 994                 else
 995                         sk->sk_mark = val;
 996                 break;
 997
 998         case SO_RXQ_OVFL:
 999                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1000                 break;
1001
1002         case SO_WIFI_STATUS:
1003                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1004                 break;
1005
1006         case SO_PEEK_OFF:
1007                 if (sock->ops->set_peek_off)
1008                         ret = sock->ops->set_peek_off(sk, val);
1009                 else
1010                         ret = -EOPNOTSUPP;
1011                 break;
1012
1013         case SO_NOFCS:
1014                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1015                 break;
1016
1017         case SO_SELECT_ERR_QUEUE:
1018                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1019                 break;
1020
1021 #ifdef CONFIG_NET_RX_BUSY_POLL
1022         case SO_BUSY_POLL:
1023                 /* allow unprivileged users to decrease the value */
1024                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1025                         ret = -EPERM;
1026                 else {
1027                         if (val < 0)
1028                                 ret = -EINVAL;
1029                         else
1030                                 sk->sk_ll_usec = val;
1031                 }
1032                 break;
1033 #endif
1034
1035         case SO_MAX_PACING_RATE:
1036                 if (val != ~0U)
1037                         cmpxchg(&sk->sk_pacing_status,
1038                                 SK_PACING_NONE,
1039                                 SK_PACING_NEEDED);
1040                 sk->sk_max_pacing_rate = val;
1041                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1042                                          sk->sk_max_pacing_rate);
1043                 break;
1044
1045         case SO_INCOMING_CPU:
1046                 sk->sk_incoming_cpu = val;
1047                 break;
1048
1049         case SO_CNX_ADVICE:
1050                 if (val == 1)
1051                         dst_negative_advice(sk);
1052                 break;
1053
1054         case SO_ZEROCOPY:
1055                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1056                         if (sk->sk_protocol != IPPROTO_TCP)
1057                                 ret = -ENOTSUPP;
1058                 } else if (sk->sk_family != PF_RDS) {
1059                         ret = -ENOTSUPP;
1060                 }
1061                 if (!ret) {
1062                         if (val < 0 || val > 1)
1063                                 ret = -EINVAL;
1064                         else
1065                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1066                 }
1067                 break;
1068
1069         default:
1070                 ret = -ENOPROTOOPT;
1071                 break;
1072         }
1073         release_sock(sk);
1074         return ret;
1075 }
1076 EXPORT_SYMBOL(sock_setsockopt);
1077
1078
1079 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1080                           struct ucred *ucred)
1081 {
1082         ucred->pid = pid_vnr(pid);
1083         ucred->uid = ucred->gid = -1;
1084         if (cred) {
1085                 struct user_namespace *current_ns = current_user_ns();
1086
1087                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1088                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1089         }
1090 }
1091
1092 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1093 {
1094         struct user_namespace *user_ns = current_user_ns();
1095         int i;
1096
1097         for (i = 0; i < src->ngroups; i++)
1098                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1099                         return -EFAULT;
1100
1101         return 0;
1102 }
1103
1104 int sock_getsockopt(struct socket *sock, int level, int optname,
1105                     char __user *optval, int __user *optlen)
1106 {
1107         struct sock *sk = sock->sk;
1108
1109         union {
1110                 int val;
1111                 u64 val64;
1112                 struct linger ling;
1113                 struct timeval tm;
1114         } v;
1115
1116         int lv = sizeof(int);
1117         int len;
1118
1119         if (get_user(len, optlen))
1120                 return -EFAULT;
1121         if (len < 0)
1122                 return -EINVAL;
1123
1124         memset(&v, 0, sizeof(v));
1125
1126         switch (optname) {
1127         case SO_DEBUG:
1128                 v.val = sock_flag(sk, SOCK_DBG);
1129                 break;
1130
1131         case SO_DONTROUTE:
1132                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1133                 break;
1134
1135         case SO_BROADCAST:
1136                 v.val = sock_flag(sk, SOCK_BROADCAST);
1137                 break;
1138
1139         case SO_SNDBUF:
1140                 v.val = sk->sk_sndbuf;
1141                 break;
1142
1143         case SO_RCVBUF:
1144                 v.val = sk->sk_rcvbuf;
1145                 break;
1146
1147         case SO_REUSEADDR:
1148                 v.val = sk->sk_reuse;
1149                 break;
1150
1151         case SO_REUSEPORT:
1152                 v.val = sk->sk_reuseport;
1153                 break;
1154
1155         case SO_KEEPALIVE:
1156                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1157                 break;
1158
1159         case SO_TYPE:
1160                 v.val = sk->sk_type;
1161                 break;
1162
1163         case SO_PROTOCOL:
1164                 v.val = sk->sk_protocol;
1165                 break;
1166
1167         case SO_DOMAIN:
1168                 v.val = sk->sk_family;
1169                 break;
1170
1171         case SO_ERROR:
1172                 v.val = -sock_error(sk);
1173                 if (v.val == 0)
1174                         v.val = xchg(&sk->sk_err_soft, 0);
1175                 break;
1176
1177         case SO_OOBINLINE:
1178                 v.val = sock_flag(sk, SOCK_URGINLINE);
1179                 break;
1180
1181         case SO_NO_CHECK:
1182                 v.val = sk->sk_no_check_tx;
1183                 break;
1184
1185         case SO_PRIORITY:
1186                 v.val = sk->sk_priority;
1187                 break;
1188
1189         case SO_LINGER:
1190                 lv              = sizeof(v.ling);
1191                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1192                 v.ling.l_linger = sk->sk_lingertime / HZ;
1193                 break;
1194
1195         case SO_BSDCOMPAT:
1196                 sock_warn_obsolete_bsdism("getsockopt");
1197                 break;
1198
1199         case SO_TIMESTAMP:
1200                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1201                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1202                 break;
1203
1204         case SO_TIMESTAMPNS:
1205                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1206                 break;
1207
1208         case SO_TIMESTAMPING:
1209                 v.val = sk->sk_tsflags;
1210                 break;
1211
1212         case SO_RCVTIMEO:
1213                 lv = sizeof(struct timeval);
1214                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1215                         v.tm.tv_sec = 0;
1216                         v.tm.tv_usec = 0;
1217                 } else {
1218                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1219                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1220                 }
1221                 break;
1222
1223         case SO_SNDTIMEO:
1224                 lv = sizeof(struct timeval);
1225                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1226                         v.tm.tv_sec = 0;
1227                         v.tm.tv_usec = 0;
1228                 } else {
1229                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1230                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1231                 }
1232                 break;
1233
1234         case SO_RCVLOWAT:
1235                 v.val = sk->sk_rcvlowat;
1236                 break;
1237
1238         case SO_SNDLOWAT:
1239                 v.val = 1;
1240                 break;
1241
1242         case SO_PASSCRED:
1243                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1244                 break;
1245
1246         case SO_PEERCRED:
1247         {
1248                 struct ucred peercred;
1249                 if (len > sizeof(peercred))
1250                         len = sizeof(peercred);
1251                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1252                 if (copy_to_user(optval, &peercred, len))
1253                         return -EFAULT;
1254                 goto lenout;
1255         }
1256
1257         case SO_PEERGROUPS:
1258         {
1259                 int ret, n;
1260
1261                 if (!sk->sk_peer_cred)
1262                         return -ENODATA;
1263
1264                 n = sk->sk_peer_cred->group_info->ngroups;
1265                 if (len < n * sizeof(gid_t)) {
1266                         len = n * sizeof(gid_t);
1267                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1268                 }
1269                 len = n * sizeof(gid_t);
1270
1271                 ret = groups_to_user((gid_t __user *)optval,
1272                                      sk->sk_peer_cred->group_info);
1273                 if (ret)
1274                         return ret;
1275                 goto lenout;
1276         }
1277
1278         case SO_PEERNAME:
1279         {
1280                 char address[128];
1281
1282                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1283                 if (lv < 0)
1284                         return -ENOTCONN;
1285                 if (lv < len)
1286                         return -EINVAL;
1287                 if (copy_to_user(optval, address, len))
1288                         return -EFAULT;
1289                 goto lenout;
1290         }
1291
1292         /* Dubious BSD thing... Probably nobody even uses it, but
1293          * the UNIX standard wants it for whatever reason... -DaveM
1294          */
1295         case SO_ACCEPTCONN:
1296                 v.val = sk->sk_state == TCP_LISTEN;
1297                 break;
1298
1299         case SO_PASSSEC:
1300                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1301                 break;
1302
1303         case SO_PEERSEC:
1304                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1305
1306         case SO_MARK:
1307                 v.val = sk->sk_mark;
1308                 break;
1309
1310         case SO_RXQ_OVFL:
1311                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1312                 break;
1313
1314         case SO_WIFI_STATUS:
1315                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1316                 break;
1317
1318         case SO_PEEK_OFF:
1319                 if (!sock->ops->set_peek_off)
1320                         return -EOPNOTSUPP;
1321
1322                 v.val = sk->sk_peek_off;
1323                 break;
1324         case SO_NOFCS:
1325                 v.val = sock_flag(sk, SOCK_NOFCS);
1326                 break;
1327
1328         case SO_BINDTODEVICE:
1329                 return sock_getbindtodevice(sk, optval, optlen, len);
1330
1331         case SO_GET_FILTER:
1332                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1333                 if (len < 0)
1334                         return len;
1335
1336                 goto lenout;
1337
1338         case SO_LOCK_FILTER:
1339                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1340                 break;
1341
1342         case SO_BPF_EXTENSIONS:
1343                 v.val = bpf_tell_extensions();
1344                 break;
1345
1346         case SO_SELECT_ERR_QUEUE:
1347                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1348                 break;
1349
1350 #ifdef CONFIG_NET_RX_BUSY_POLL
1351         case SO_BUSY_POLL:
1352                 v.val = sk->sk_ll_usec;
1353                 break;
1354 #endif
1355
1356         case SO_MAX_PACING_RATE:
1357                 v.val = sk->sk_max_pacing_rate;
1358                 break;
1359
1360         case SO_INCOMING_CPU:
1361                 v.val = sk->sk_incoming_cpu;
1362                 break;
1363
1364         case SO_MEMINFO:
1365         {
1366                 u32 meminfo[SK_MEMINFO_VARS];
1367
1368                 if (get_user(len, optlen))
1369                         return -EFAULT;
1370
1371                 sk_get_meminfo(sk, meminfo);
1372
1373                 len = min_t(unsigned int, len, sizeof(meminfo));
1374                 if (copy_to_user(optval, &meminfo, len))
1375                         return -EFAULT;
1376
1377                 goto lenout;
1378         }
1379
1380 #ifdef CONFIG_NET_RX_BUSY_POLL
1381         case SO_INCOMING_NAPI_ID:
1382                 v.val = READ_ONCE(sk->sk_napi_id);
1383
1384                 /* aggregate non-NAPI IDs down to 0 */
1385                 if (v.val < MIN_NAPI_ID)
1386                         v.val = 0;
1387
1388                 break;
1389 #endif
1390
1391         case SO_COOKIE:
1392                 lv = sizeof(u64);
1393                 if (len < lv)
1394                         return -EINVAL;
1395                 v.val64 = sock_gen_cookie(sk);
1396                 break;
1397
1398         case SO_ZEROCOPY:
1399                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1400                 break;
1401
1402         default:
1403                 /* We implement the SO_SNDLOWAT etc to not be settable
1404                  * (1003.1g 7).
1405                  */
1406                 return -ENOPROTOOPT;
1407         }
1408
1409         if (len > lv)
1410                 len = lv;
1411         if (copy_to_user(optval, &v, len))
1412                 return -EFAULT;
1413 lenout:
1414         if (put_user(len, optlen))
1415                 return -EFAULT;
1416         return 0;
1417 }
1418
1419 /*
1420  * Initialize an sk_lock.
1421  *
1422  * (We also register the sk_lock with the lock validator.)
1423  */
1424 static inline void sock_lock_init(struct sock *sk)
1425 {
1426         if (sk->sk_kern_sock)
1427                 sock_lock_init_class_and_name(
1428                         sk,
1429                         af_family_kern_slock_key_strings[sk->sk_family],
1430                         af_family_kern_slock_keys + sk->sk_family,
1431                         af_family_kern_key_strings[sk->sk_family],
1432                         af_family_kern_keys + sk->sk_family);
1433         else
1434                 sock_lock_init_class_and_name(
1435                         sk,
1436                         af_family_slock_key_strings[sk->sk_family],
1437                         af_family_slock_keys + sk->sk_family,
1438                         af_family_key_strings[sk->sk_family],
1439                         af_family_keys + sk->sk_family);
1440 }
1441
1442 /*
1443  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1444  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1445  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1446  */
1447 static void sock_copy(struct sock *nsk, const struct sock *osk)
1448 {
1449 #ifdef CONFIG_SECURITY_NETWORK
1450         void *sptr = nsk->sk_security;
1451 #endif
1452         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1453
1454         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1455                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1456
1457 #ifdef CONFIG_SECURITY_NETWORK
1458         nsk->sk_security = sptr;
1459         security_sk_clone(osk, nsk);
1460 #endif
1461 }
1462
1463 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1464                 int family)
1465 {
1466         struct sock *sk;
1467         struct kmem_cache *slab;
1468
1469         slab = prot->slab;
1470         if (slab != NULL) {
1471                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1472                 if (!sk)
1473                         return sk;
1474                 if (priority & __GFP_ZERO)
1475                         sk_prot_clear_nulls(sk, prot->obj_size);
1476         } else
1477                 sk = kmalloc(prot->obj_size, priority);
1478
1479         if (sk != NULL) {
1480                 if (security_sk_alloc(sk, family, priority))
1481                         goto out_free;
1482
1483                 if (!try_module_get(prot->owner))
1484                         goto out_free_sec;
1485                 sk_tx_queue_clear(sk);
1486         }
1487
1488         return sk;
1489
1490 out_free_sec:
1491         security_sk_free(sk);
1492 out_free:
1493         if (slab != NULL)
1494                 kmem_cache_free(slab, sk);
1495         else
1496                 kfree(sk);
1497         return NULL;
1498 }
1499
1500 static void sk_prot_free(struct proto *prot, struct sock *sk)
1501 {
1502         struct kmem_cache *slab;
1503         struct module *owner;
1504
1505         owner = prot->owner;
1506         slab = prot->slab;
1507
1508         cgroup_sk_free(&sk->sk_cgrp_data);
1509         mem_cgroup_sk_free(sk);
1510         security_sk_free(sk);
1511         if (slab != NULL)
1512                 kmem_cache_free(slab, sk);
1513         else
1514                 kfree(sk);
1515         module_put(owner);
1516 }
1517
1518 /**
1519  *      sk_alloc - All socket objects are allocated here
1520  *      @net: the applicable net namespace
1521  *      @family: protocol family
1522  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1523  *      @prot: struct proto associated with this new sock instance
1524  *      @kern: is this to be a kernel socket?
1525  */
1526 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1527                       struct proto *prot, int kern)
1528 {
1529         struct sock *sk;
1530
1531         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1532         if (sk) {
1533                 sk->sk_family = family;
1534                 /*
1535                  * See comment in struct sock definition to understand
1536                  * why we need sk_prot_creator -acme
1537                  */
1538                 sk->sk_prot = sk->sk_prot_creator = prot;
1539                 sk->sk_kern_sock = kern;
1540                 sock_lock_init(sk);
1541                 sk->sk_net_refcnt = kern ? 0 : 1;
1542                 if (likely(sk->sk_net_refcnt)) {
1543                         get_net(net);
1544                         sock_inuse_add(net, 1);
1545                 }
1546
1547                 sock_net_set(sk, net);
1548                 refcount_set(&sk->sk_wmem_alloc, 1);
1549
1550                 mem_cgroup_sk_alloc(sk);
1551                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1552                 sock_update_classid(&sk->sk_cgrp_data);
1553                 sock_update_netprioidx(&sk->sk_cgrp_data);
1554         }
1555
1556         return sk;
1557 }
1558 EXPORT_SYMBOL(sk_alloc);
1559
1560 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1561  * grace period. This is the case for UDP sockets and TCP listeners.
1562  */
1563 static void __sk_destruct(struct rcu_head *head)
1564 {
1565         struct sock *sk = container_of(head, struct sock, sk_rcu);
1566         struct sk_filter *filter;
1567
1568         if (sk->sk_destruct)
1569                 sk->sk_destruct(sk);
1570
1571         filter = rcu_dereference_check(sk->sk_filter,
1572                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1573         if (filter) {
1574                 sk_filter_uncharge(sk, filter);
1575                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1576         }
1577         if (rcu_access_pointer(sk->sk_reuseport_cb))
1578                 reuseport_detach_sock(sk);
1579
1580         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1581
1582         if (atomic_read(&sk->sk_omem_alloc))
1583                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1584                          __func__, atomic_read(&sk->sk_omem_alloc));
1585
1586         if (sk->sk_frag.page) {
1587                 put_page(sk->sk_frag.page);
1588                 sk->sk_frag.page = NULL;
1589         }
1590
1591         if (sk->sk_peer_cred)
1592                 put_cred(sk->sk_peer_cred);
1593         put_pid(sk->sk_peer_pid);
1594         if (likely(sk->sk_net_refcnt))
1595                 put_net(sock_net(sk));
1596         sk_prot_free(sk->sk_prot_creator, sk);
1597 }
1598
1599 void sk_destruct(struct sock *sk)
1600 {
1601         if (sock_flag(sk, SOCK_RCU_FREE))
1602                 call_rcu(&sk->sk_rcu, __sk_destruct);
1603         else
1604                 __sk_destruct(&sk->sk_rcu);
1605 }
1606
1607 static void __sk_free(struct sock *sk)
1608 {
1609         if (likely(sk->sk_net_refcnt))
1610                 sock_inuse_add(sock_net(sk), -1);
1611
1612         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1613                 sock_diag_broadcast_destroy(sk);
1614         else
1615                 sk_destruct(sk);
1616 }
1617
1618 void sk_free(struct sock *sk)
1619 {
1620         /*
1621          * We subtract one from sk_wmem_alloc and can know if
1622          * some packets are still in some tx queue.
1623          * If not null, sock_wfree() will call __sk_free(sk) later
1624          */
1625         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1626                 __sk_free(sk);
1627 }
1628 EXPORT_SYMBOL(sk_free);
1629
1630 static void sk_init_common(struct sock *sk)
1631 {
1632         skb_queue_head_init(&sk->sk_receive_queue);
1633         skb_queue_head_init(&sk->sk_write_queue);
1634         skb_queue_head_init(&sk->sk_error_queue);
1635
1636         rwlock_init(&sk->sk_callback_lock);
1637         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1638                         af_rlock_keys + sk->sk_family,
1639                         af_family_rlock_key_strings[sk->sk_family]);
1640         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1641                         af_wlock_keys + sk->sk_family,
1642                         af_family_wlock_key_strings[sk->sk_family]);
1643         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1644                         af_elock_keys + sk->sk_family,
1645                         af_family_elock_key_strings[sk->sk_family]);
1646         lockdep_set_class_and_name(&sk->sk_callback_lock,
1647                         af_callback_keys + sk->sk_family,
1648                         af_family_clock_key_strings[sk->sk_family]);
1649 }
1650
1651 /**
1652  *      sk_clone_lock - clone a socket, and lock its clone
1653  *      @sk: the socket to clone
1654  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1655  *
1656  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1657  */
1658 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1659 {
1660         struct sock *newsk;
1661         bool is_charged = true;
1662
1663         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1664         if (newsk != NULL) {
1665                 struct sk_filter *filter;
1666
1667                 sock_copy(newsk, sk);
1668
1669                 newsk->sk_prot_creator = sk->sk_prot;
1670
1671                 /* SANITY */
1672                 if (likely(newsk->sk_net_refcnt))
1673                         get_net(sock_net(newsk));
1674                 sk_node_init(&newsk->sk_node);
1675                 sock_lock_init(newsk);
1676                 bh_lock_sock(newsk);
1677                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1678                 newsk->sk_backlog.len = 0;
1679
1680                 atomic_set(&newsk->sk_rmem_alloc, 0);
1681                 /*
1682                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1683                  */
1684                 refcount_set(&newsk->sk_wmem_alloc, 1);
1685                 atomic_set(&newsk->sk_omem_alloc, 0);
1686                 sk_init_common(newsk);
1687
1688                 newsk->sk_dst_cache     = NULL;
1689                 newsk->sk_dst_pending_confirm = 0;
1690                 newsk->sk_wmem_queued   = 0;
1691                 newsk->sk_forward_alloc = 0;
1692                 atomic_set(&newsk->sk_drops, 0);
1693                 newsk->sk_send_head     = NULL;
1694                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1695                 atomic_set(&newsk->sk_zckey, 0);
1696
1697                 sock_reset_flag(newsk, SOCK_DONE);
1698                 mem_cgroup_sk_alloc(newsk);
1699                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1700
1701                 rcu_read_lock();
1702                 filter = rcu_dereference(sk->sk_filter);
1703                 if (filter != NULL)
1704                         /* though it's an empty new sock, the charging may fail
1705                          * if sysctl_optmem_max was changed between creation of
1706                          * original socket and cloning
1707                          */
1708                         is_charged = sk_filter_charge(newsk, filter);
1709                 RCU_INIT_POINTER(newsk->sk_filter, filter);
1710                 rcu_read_unlock();
1711
1712                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1713                         /* We need to make sure that we don't uncharge the new
1714                          * socket if we couldn't charge it in the first place
1715                          * as otherwise we uncharge the parent's filter.
1716                          */
1717                         if (!is_charged)
1718                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1719                         sk_free_unlock_clone(newsk);
1720                         newsk = NULL;
1721                         goto out;
1722                 }
1723                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1724
1725                 newsk->sk_err      = 0;
1726                 newsk->sk_err_soft = 0;
1727                 newsk->sk_priority = 0;
1728                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1729                 atomic64_set(&newsk->sk_cookie, 0);
1730                 if (likely(newsk->sk_net_refcnt))
1731                         sock_inuse_add(sock_net(newsk), 1);
1732
1733                 /*
1734                  * Before updating sk_refcnt, we must commit prior changes to memory
1735                  * (Documentation/RCU/rculist_nulls.txt for details)
1736                  */
1737                 smp_wmb();
1738                 refcount_set(&newsk->sk_refcnt, 2);
1739
1740                 /*
1741                  * Increment the counter in the same struct proto as the master
1742                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1743                  * is the same as sk->sk_prot->socks, as this field was copied
1744                  * with memcpy).
1745                  *
1746                  * This _changes_ the previous behaviour, where
1747                  * tcp_create_openreq_child always was incrementing the
1748                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1749                  * to be taken into account in all callers. -acme
1750                  */
1751                 sk_refcnt_debug_inc(newsk);
1752                 sk_set_socket(newsk, NULL);
1753                 newsk->sk_wq = NULL;
1754
1755                 if (newsk->sk_prot->sockets_allocated)
1756                         sk_sockets_allocated_inc(newsk);
1757
1758                 if (sock_needs_netstamp(sk) &&
1759                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1760                         net_enable_timestamp();
1761         }
1762 out:
1763         return newsk;
1764 }
1765 EXPORT_SYMBOL_GPL(sk_clone_lock);
1766
1767 void sk_free_unlock_clone(struct sock *sk)
1768 {
1769         /* It is still raw copy of parent, so invalidate
1770          * destructor and make plain sk_free() */
1771         sk->sk_destruct = NULL;
1772         bh_unlock_sock(sk);
1773         sk_free(sk);
1774 }
1775 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1776
1777 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1778 {
1779         u32 max_segs = 1;
1780
1781         sk_dst_set(sk, dst);
1782         sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1783         if (sk->sk_route_caps & NETIF_F_GSO)
1784                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1785         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1786         if (sk_can_gso(sk)) {
1787                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1788                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1789                 } else {
1790                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1791                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1792                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1793                 }
1794         }
1795         sk->sk_gso_max_segs = max_segs;
1796 }
1797 EXPORT_SYMBOL_GPL(sk_setup_caps);
1798
1799 /*
1800  *      Simple resource managers for sockets.
1801  */
1802
1803
1804 /*
1805  * Write buffer destructor automatically called from kfree_skb.
1806  */
1807 void sock_wfree(struct sk_buff *skb)
1808 {
1809         struct sock *sk = skb->sk;
1810         unsigned int len = skb->truesize;
1811
1812         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1813                 /*
1814                  * Keep a reference on sk_wmem_alloc, this will be released
1815                  * after sk_write_space() call
1816                  */
1817                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1818                 sk->sk_write_space(sk);
1819                 len = 1;
1820         }
1821         /*
1822          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1823          * could not do because of in-flight packets
1824          */
1825         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1826                 __sk_free(sk);
1827 }
1828 EXPORT_SYMBOL(sock_wfree);
1829
1830 /* This variant of sock_wfree() is used by TCP,
1831  * since it sets SOCK_USE_WRITE_QUEUE.
1832  */
1833 void __sock_wfree(struct sk_buff *skb)
1834 {
1835         struct sock *sk = skb->sk;
1836
1837         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1838                 __sk_free(sk);
1839 }
1840
1841 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1842 {
1843         skb_orphan(skb);
1844         skb->sk = sk;
1845 #ifdef CONFIG_INET
1846         if (unlikely(!sk_fullsock(sk))) {
1847                 skb->destructor = sock_edemux;
1848                 sock_hold(sk);
1849                 return;
1850         }
1851 #endif
1852         skb->destructor = sock_wfree;
1853         skb_set_hash_from_sk(skb, sk);
1854         /*
1855          * We used to take a refcount on sk, but following operation
1856          * is enough to guarantee sk_free() wont free this sock until
1857          * all in-flight packets are completed
1858          */
1859         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1860 }
1861 EXPORT_SYMBOL(skb_set_owner_w);
1862
1863 /* This helper is used by netem, as it can hold packets in its
1864  * delay queue. We want to allow the owner socket to send more
1865  * packets, as if they were already TX completed by a typical driver.
1866  * But we also want to keep skb->sk set because some packet schedulers
1867  * rely on it (sch_fq for example).
1868  */
1869 void skb_orphan_partial(struct sk_buff *skb)
1870 {
1871         if (skb_is_tcp_pure_ack(skb))
1872                 return;
1873
1874         if (skb->destructor == sock_wfree
1875 #ifdef CONFIG_INET
1876             || skb->destructor == tcp_wfree
1877 #endif
1878                 ) {
1879                 struct sock *sk = skb->sk;
1880
1881                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1882                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1883                         skb->destructor = sock_efree;
1884                 }
1885         } else {
1886                 skb_orphan(skb);
1887         }
1888 }
1889 EXPORT_SYMBOL(skb_orphan_partial);
1890
1891 /*
1892  * Read buffer destructor automatically called from kfree_skb.
1893  */
1894 void sock_rfree(struct sk_buff *skb)
1895 {
1896         struct sock *sk = skb->sk;
1897         unsigned int len = skb->truesize;
1898
1899         atomic_sub(len, &sk->sk_rmem_alloc);
1900         sk_mem_uncharge(sk, len);
1901 }
1902 EXPORT_SYMBOL(sock_rfree);
1903
1904 /*
1905  * Buffer destructor for skbs that are not used directly in read or write
1906  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1907  */
1908 void sock_efree(struct sk_buff *skb)
1909 {
1910         sock_put(skb->sk);
1911 }
1912 EXPORT_SYMBOL(sock_efree);
1913
1914 kuid_t sock_i_uid(struct sock *sk)
1915 {
1916         kuid_t uid;
1917
1918         read_lock_bh(&sk->sk_callback_lock);
1919         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1920         read_unlock_bh(&sk->sk_callback_lock);
1921         return uid;
1922 }
1923 EXPORT_SYMBOL(sock_i_uid);
1924
1925 unsigned long sock_i_ino(struct sock *sk)
1926 {
1927         unsigned long ino;
1928
1929         read_lock_bh(&sk->sk_callback_lock);
1930         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1931         read_unlock_bh(&sk->sk_callback_lock);
1932         return ino;
1933 }
1934 EXPORT_SYMBOL(sock_i_ino);
1935
1936 /*
1937  * Allocate a skb from the socket's send buffer.
1938  */
1939 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1940                              gfp_t priority)
1941 {
1942         if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1943                 struct sk_buff *skb = alloc_skb(size, priority);
1944                 if (skb) {
1945                         skb_set_owner_w(skb, sk);
1946                         return skb;
1947                 }
1948         }
1949         return NULL;
1950 }
1951 EXPORT_SYMBOL(sock_wmalloc);
1952
1953 static void sock_ofree(struct sk_buff *skb)
1954 {
1955         struct sock *sk = skb->sk;
1956
1957         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1958 }
1959
1960 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1961                              gfp_t priority)
1962 {
1963         struct sk_buff *skb;
1964
1965         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1966         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1967             sysctl_optmem_max)
1968                 return NULL;
1969
1970         skb = alloc_skb(size, priority);
1971         if (!skb)
1972                 return NULL;
1973
1974         atomic_add(skb->truesize, &sk->sk_omem_alloc);
1975         skb->sk = sk;
1976         skb->destructor = sock_ofree;
1977         return skb;
1978 }
1979
1980 /*
1981  * Allocate a memory block from the socket's option memory buffer.
1982  */
1983 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1984 {
1985         if ((unsigned int)size <= sysctl_optmem_max &&
1986             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1987                 void *mem;
1988                 /* First do the add, to avoid the race if kmalloc
1989                  * might sleep.
1990                  */
1991                 atomic_add(size, &sk->sk_omem_alloc);
1992                 mem = kmalloc(size, priority);
1993                 if (mem)
1994                         return mem;
1995                 atomic_sub(size, &sk->sk_omem_alloc);
1996         }
1997         return NULL;
1998 }
1999 EXPORT_SYMBOL(sock_kmalloc);
2000
2001 /* Free an option memory block. Note, we actually want the inline
2002  * here as this allows gcc to detect the nullify and fold away the
2003  * condition entirely.
2004  */
2005 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2006                                   const bool nullify)
2007 {
2008         if (WARN_ON_ONCE(!mem))
2009                 return;
2010         if (nullify)
2011                 kzfree(mem);
2012         else
2013                 kfree(mem);
2014         atomic_sub(size, &sk->sk_omem_alloc);
2015 }
2016
2017 void sock_kfree_s(struct sock *sk, void *mem, int size)
2018 {
2019         __sock_kfree_s(sk, mem, size, false);
2020 }
2021 EXPORT_SYMBOL(sock_kfree_s);
2022
2023 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2024 {
2025         __sock_kfree_s(sk, mem, size, true);
2026 }
2027 EXPORT_SYMBOL(sock_kzfree_s);
2028
2029 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2030    I think, these locks should be removed for datagram sockets.
2031  */
2032 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2033 {
2034         DEFINE_WAIT(wait);
2035
2036         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2037         for (;;) {
2038                 if (!timeo)
2039                         break;
2040                 if (signal_pending(current))
2041                         break;
2042                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2043                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2044                 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2045                         break;
2046                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2047                         break;
2048                 if (sk->sk_err)
2049                         break;
2050                 timeo = schedule_timeout(timeo);
2051         }
2052         finish_wait(sk_sleep(sk), &wait);
2053         return timeo;
2054 }
2055
2056
2057 /*
2058  *      Generic send/receive buffer handlers
2059  */
2060
2061 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2062                                      unsigned long data_len, int noblock,
2063                                      int *errcode, int max_page_order)
2064 {
2065         struct sk_buff *skb;
2066         long timeo;
2067         int err;
2068
2069         timeo = sock_sndtimeo(sk, noblock);
2070         for (;;) {
2071                 err = sock_error(sk);
2072                 if (err != 0)
2073                         goto failure;
2074
2075                 err = -EPIPE;
2076                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2077                         goto failure;
2078
2079                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2080                         break;
2081
2082                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2083                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2084                 err = -EAGAIN;
2085                 if (!timeo)
2086                         goto failure;
2087                 if (signal_pending(current))
2088                         goto interrupted;
2089                 timeo = sock_wait_for_wmem(sk, timeo);
2090         }
2091         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2092                                    errcode, sk->sk_allocation);
2093         if (skb)
2094                 skb_set_owner_w(skb, sk);
2095         return skb;
2096
2097 interrupted:
2098         err = sock_intr_errno(timeo);
2099 failure:
2100         *errcode = err;
2101         return NULL;
2102 }
2103 EXPORT_SYMBOL(sock_alloc_send_pskb);
2104
/**
 *      sock_alloc_send_skb - allocate a purely linear send skb
 *      @sk: sending socket
 *      @size: linear size of the skb
 *      @noblock: passed through to sock_alloc_send_pskb()
 *      @errcode: where the error is stored on failure
 *
 *      Shorthand for sock_alloc_send_pskb() with no paged data and a
 *      maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        const unsigned long data_len = 0;
        const int max_page_order = 0;

        return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
                                    max_page_order);
}
2110 EXPORT_SYMBOL(sock_alloc_send_skb);
2111
2112 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2113                      struct sockcm_cookie *sockc)
2114 {
2115         u32 tsflags;
2116
2117         switch (cmsg->cmsg_type) {
2118         case SO_MARK:
2119                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2120                         return -EPERM;
2121                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2122                         return -EINVAL;
2123                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2124                 break;
2125         case SO_TIMESTAMPING:
2126                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2127                         return -EINVAL;
2128
2129                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2130                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2131                         return -EINVAL;
2132
2133                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2134                 sockc->tsflags |= tsflags;
2135                 break;
2136         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2137         case SCM_RIGHTS:
2138         case SCM_CREDENTIALS:
2139                 break;
2140         default:
2141                 return -EINVAL;
2142         }
2143         return 0;
2144 }
2145 EXPORT_SYMBOL(__sock_cmsg_send);
2146
2147 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2148                    struct sockcm_cookie *sockc)
2149 {
2150         struct cmsghdr *cmsg;
2151         int ret;
2152
2153         for_each_cmsghdr(cmsg, msg) {
2154                 if (!CMSG_OK(msg, cmsg))
2155                         return -EINVAL;
2156                 if (cmsg->cmsg_level != SOL_SOCKET)
2157                         continue;
2158                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2159                 if (ret)
2160                         return ret;
2161         }
2162         return 0;
2163 }
2164 EXPORT_SYMBOL(sock_cmsg_send);
2165
2166 static void sk_enter_memory_pressure(struct sock *sk)
2167 {
2168         if (!sk->sk_prot->enter_memory_pressure)
2169                 return;
2170
2171         sk->sk_prot->enter_memory_pressure(sk);
2172 }
2173
2174 static void sk_leave_memory_pressure(struct sock *sk)
2175 {
2176         if (sk->sk_prot->leave_memory_pressure) {
2177                 sk->sk_prot->leave_memory_pressure(sk);
2178         } else {
2179                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2180
2181                 if (memory_pressure && *memory_pressure)
2182                         *memory_pressure = 0;
2183         }
2184 }
2185
2186 /* On 32bit arches, an skb frag is limited to 2^15 */
2187 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2188
2189 /**
2190  * skb_page_frag_refill - check that a page_frag contains enough room
2191  * @sz: minimum size of the fragment we want to get
2192  * @pfrag: pointer to page_frag
2193  * @gfp: priority for memory allocation
2194  *
2195  * Note: While this allocator tries to use high order pages, there is
2196  * no guarantee that allocations succeed. Therefore, @sz MUST be
2197  * less or equal than PAGE_SIZE.
2198  */
2199 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2200 {
2201         if (pfrag->page) {
2202                 if (page_ref_count(pfrag->page) == 1) {
2203                         pfrag->offset = 0;
2204                         return true;
2205                 }
2206                 if (pfrag->offset + sz <= pfrag->size)
2207                         return true;
2208                 put_page(pfrag->page);
2209         }
2210
2211         pfrag->offset = 0;
2212         if (SKB_FRAG_PAGE_ORDER) {
2213                 /* Avoid direct reclaim but allow kswapd to wake */
2214                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2215                                           __GFP_COMP | __GFP_NOWARN |
2216                                           __GFP_NORETRY,
2217                                           SKB_FRAG_PAGE_ORDER);
2218                 if (likely(pfrag->page)) {
2219                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2220                         return true;
2221                 }
2222         }
2223         pfrag->page = alloc_page(gfp);
2224         if (likely(pfrag->page)) {
2225                 pfrag->size = PAGE_SIZE;
2226                 return true;
2227         }
2228         return false;
2229 }
2230 EXPORT_SYMBOL(skb_page_frag_refill);
2231
2232 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2233 {
2234         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2235                 return true;
2236
2237         sk_enter_memory_pressure(sk);
2238         sk_stream_moderate_sndbuf(sk);
2239         return false;
2240 }
2241 EXPORT_SYMBOL(sk_page_frag_refill);
2242
2243 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2244                 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2245                 int first_coalesce)
2246 {
2247         int sg_curr = *sg_curr_index, use = 0, rc = 0;
2248         unsigned int size = *sg_curr_size;
2249         struct page_frag *pfrag;
2250         struct scatterlist *sge;
2251
2252         len -= size;
2253         pfrag = sk_page_frag(sk);
2254
2255         while (len > 0) {
2256                 unsigned int orig_offset;
2257
2258                 if (!sk_page_frag_refill(sk, pfrag)) {
2259                         rc = -ENOMEM;
2260                         goto out;
2261                 }
2262
2263                 use = min_t(int, len, pfrag->size - pfrag->offset);
2264
2265                 if (!sk_wmem_schedule(sk, use)) {
2266                         rc = -ENOMEM;
2267                         goto out;
2268                 }
2269
2270                 sk_mem_charge(sk, use);
2271                 size += use;
2272                 orig_offset = pfrag->offset;
2273                 pfrag->offset += use;
2274
2275                 sge = sg + sg_curr - 1;
2276                 if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
2277                     sg->offset + sg->length == orig_offset) {
2278                         sg->length += use;
2279                 } else {
2280                         sge = sg + sg_curr;
2281                         sg_unmark_end(sge);
2282                         sg_set_page(sge, pfrag->page, use, orig_offset);
2283                         get_page(pfrag->page);
2284                         sg_curr++;
2285
2286                         if (sg_curr == MAX_SKB_FRAGS)
2287                                 sg_curr = 0;
2288
2289                         if (sg_curr == sg_start) {
2290                                 rc = -ENOSPC;
2291                                 break;
2292                         }
2293                 }
2294
2295                 len -= use;
2296         }
2297 out:
2298         *sg_curr_size = size;
2299         *sg_curr_index = sg_curr;
2300         return rc;
2301 }
2302 EXPORT_SYMBOL(sk_alloc_sg);
2303
2304 static void __lock_sock(struct sock *sk)
2305         __releases(&sk->sk_lock.slock)
2306         __acquires(&sk->sk_lock.slock)
2307 {
2308         DEFINE_WAIT(wait);
2309
2310         for (;;) {
2311                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2312                                         TASK_UNINTERRUPTIBLE);
2313                 spin_unlock_bh(&sk->sk_lock.slock);
2314                 schedule();
2315                 spin_lock_bh(&sk->sk_lock.slock);
2316                 if (!sock_owned_by_user(sk))
2317                         break;
2318         }
2319         finish_wait(&sk->sk_lock.wq, &wait);
2320 }
2321
2322 static void __release_sock(struct sock *sk)
2323         __releases(&sk->sk_lock.slock)
2324         __acquires(&sk->sk_lock.slock)
2325 {
2326         struct sk_buff *skb, *next;
2327
2328         while ((skb = sk->sk_backlog.head) != NULL) {
2329                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2330
2331                 spin_unlock_bh(&sk->sk_lock.slock);
2332
2333                 do {
2334                         next = skb->next;
2335                         prefetch(next);
2336                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2337                         skb->next = NULL;
2338                         sk_backlog_rcv(sk, skb);
2339
2340                         cond_resched();
2341
2342                         skb = next;
2343                 } while (skb != NULL);
2344
2345                 spin_lock_bh(&sk->sk_lock.slock);
2346         }
2347
2348         /*
2349          * Doing the zeroing here guarantee we can not loop forever
2350          * while a wild producer attempts to flood us.
2351          */
2352         sk->sk_backlog.len = 0;
2353 }
2354
2355 void __sk_flush_backlog(struct sock *sk)
2356 {
2357         spin_lock_bh(&sk->sk_lock.slock);
2358         __release_sock(sk);
2359         spin_unlock_bh(&sk->sk_lock.slock);
2360 }
2361
2362 /**
2363  * sk_wait_data - wait for data to arrive at sk_receive_queue
2364  * @sk:    sock to wait on
2365  * @timeo: for how long
2366  * @skb:   last skb seen on sk_receive_queue
2367  *
2368  * Now socket state including sk->sk_err is changed only under lock,
2369  * hence we may omit checks after joining wait queue.
2370  * We check receive queue before schedule() only as optimization;
2371  * it is very likely that release_sock() added new data.
2372  */
2373 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2374 {
2375         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2376         int rc;
2377
2378         add_wait_queue(sk_sleep(sk), &wait);
2379         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2380         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2381         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2382         remove_wait_queue(sk_sleep(sk), &wait);
2383         return rc;
2384 }
2385 EXPORT_SYMBOL(sk_wait_data);
2386
2387 /**
2388  *      __sk_mem_raise_allocated - increase memory_allocated
2389  *      @sk: socket
2390  *      @size: memory size to allocate
2391  *      @amt: pages to allocate
2392  *      @kind: allocation type
2393  *
2394  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2395  */
2396 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2397 {
2398         struct proto *prot = sk->sk_prot;
2399         long allocated = sk_memory_allocated_add(sk, amt);
2400
2401         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2402             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2403                 goto suppress_allocation;
2404
2405         /* Under limit. */
2406         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2407                 sk_leave_memory_pressure(sk);
2408                 return 1;
2409         }
2410
2411         /* Under pressure. */
2412         if (allocated > sk_prot_mem_limits(sk, 1))
2413                 sk_enter_memory_pressure(sk);
2414
2415         /* Over hard limit. */
2416         if (allocated > sk_prot_mem_limits(sk, 2))
2417                 goto suppress_allocation;
2418
2419         /* guarantee minimum buffer size under pressure */
2420         if (kind == SK_MEM_RECV) {
2421                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2422                         return 1;
2423
2424         } else { /* SK_MEM_SEND */
2425                 int wmem0 = sk_get_wmem0(sk, prot);
2426
2427                 if (sk->sk_type == SOCK_STREAM) {
2428                         if (sk->sk_wmem_queued < wmem0)
2429                                 return 1;
2430                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2431                                 return 1;
2432                 }
2433         }
2434
2435         if (sk_has_memory_pressure(sk)) {
2436                 int alloc;
2437
2438                 if (!sk_under_memory_pressure(sk))
2439                         return 1;
2440                 alloc = sk_sockets_allocated_read_positive(sk);
2441                 if (sk_prot_mem_limits(sk, 2) > alloc *
2442                     sk_mem_pages(sk->sk_wmem_queued +
2443                                  atomic_read(&sk->sk_rmem_alloc) +
2444                                  sk->sk_forward_alloc))
2445                         return 1;
2446         }
2447
2448 suppress_allocation:
2449
2450         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2451                 sk_stream_moderate_sndbuf(sk);
2452
2453                 /* Fail only if socket is _under_ its sndbuf.
2454                  * In this case we cannot block, so that we have to fail.
2455                  */
2456                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2457                         return 1;
2458         }
2459
2460         trace_sock_exceed_buf_limit(sk, prot, allocated);
2461
2462         sk_memory_allocated_sub(sk, amt);
2463
2464         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2465                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2466
2467         return 0;
2468 }
2469 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2470
2471 /**
2472  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2473  *      @sk: socket
2474  *      @size: memory size to allocate
2475  *      @kind: allocation type
2476  *
2477  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2478  *      rmem allocation. This function assumes that protocols which have
2479  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2480  */
2481 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2482 {
2483         int ret, amt = sk_mem_pages(size);
2484
2485         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2486         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2487         if (!ret)
2488                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2489         return ret;
2490 }
2491 EXPORT_SYMBOL(__sk_mem_schedule);
2492
2493 /**
2494  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2495  *      @sk: socket
2496  *      @amount: number of quanta
2497  *
2498  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2499  */
2500 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2501 {
2502         sk_memory_allocated_sub(sk, amount);
2503
2504         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2505                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2506
2507         if (sk_under_memory_pressure(sk) &&
2508             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2509                 sk_leave_memory_pressure(sk);
2510 }
2511 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2512
2513 /**
2514  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2515  *      @sk: socket
2516  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2517  */
2518 void __sk_mem_reclaim(struct sock *sk, int amount)
2519 {
2520         amount >>= SK_MEM_QUANTUM_SHIFT;
2521         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2522         __sk_mem_reduce_allocated(sk, amount);
2523 }
2524 EXPORT_SYMBOL(__sk_mem_reclaim);
2525
2526 int sk_set_peek_off(struct sock *sk, int val)
2527 {
2528         sk->sk_peek_off = val;
2529         return 0;
2530 }
2531 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2532
2533 /*
2534  * Set of default routines for initialising struct proto_ops when
2535  * the protocol does not support a particular function. In certain
2536  * cases where it makes no sense for a protocol to have a "do nothing"
2537  * function, some default processing is provided.
2538  */
2539
/* proto_ops default for protocols without bind(): always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
2544 EXPORT_SYMBOL(sock_no_bind);
2545
/* proto_ops default for protocols without connect(): always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len,
                    int flags)
{
        return -EOPNOTSUPP;
}
2551 EXPORT_SYMBOL(sock_no_connect);
2552
/* proto_ops default for protocols without socketpair(): always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
2557 EXPORT_SYMBOL(sock_no_socketpair);
2558
/* proto_ops default for protocols without accept(): always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock,
                   int flags, bool kern)
{
        return -EOPNOTSUPP;
}
2564 EXPORT_SYMBOL(sock_no_accept);
2565
/* proto_ops default for protocols without getname(): always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer)
{
        return -EOPNOTSUPP;
}
2571 EXPORT_SYMBOL(sock_no_getname);
2572
2573 __poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2574 {
2575         return 0;
2576 }
2577 EXPORT_SYMBOL(sock_no_poll);
2578
/* proto_ops default for protocols without ioctl(): always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd,
                  unsigned long arg)
{
        return -EOPNOTSUPP;
}
2583 EXPORT_SYMBOL(sock_no_ioctl);
2584
/* proto_ops default for protocols without listen(): always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
2589 EXPORT_SYMBOL(sock_no_listen);
2590
/* proto_ops default for protocols without shutdown(): always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
2595 EXPORT_SYMBOL(sock_no_shutdown);
2596
2597 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2598                     char __user *optval, unsigned int optlen)
2599 {
2600         return -EOPNOTSUPP;
2601 }
2602 EXPORT_SYMBOL(sock_no_setsockopt);
2603
2604 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2605                     char __user *optval, int __user *optlen)
2606 {
2607         return -EOPNOTSUPP;
2608 }
2609 EXPORT_SYMBOL(sock_no_getsockopt);
2610
/* proto_ops default for protocols without sendmsg(): always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}
2615 EXPORT_SYMBOL(sock_no_sendmsg);
2616
/*
 * Default for protocols without a locked sendmsg variant; takes the
 * struct sock directly and always fails with -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m,
                           size_t len)
{
        return -EOPNOTSUPP;
}
2621 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2622
/* proto_ops default for protocols without recvmsg(): always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}
2628 EXPORT_SYMBOL(sock_no_recvmsg);
2629
/*
 * proto_ops default for protocols without mmap(). Unlike the other
 * defaults it returns -ENODEV, mirroring the error code used when an
 * mmap method is missing.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
                 struct vm_area_struct *vma)
{
        return -ENODEV;
}
2635 EXPORT_SYMBOL(sock_no_mmap);
2636
2637 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2638 {
2639         ssize_t res;
2640         struct msghdr msg = {.msg_flags = flags};
2641         struct kvec iov;
2642         char *kaddr = kmap(page);
2643         iov.iov_base = kaddr + offset;
2644         iov.iov_len = size;
2645         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2646         kunmap(page);
2647         return res;
2648 }
2649 EXPORT_SYMBOL(sock_no_sendpage);
2650
2651 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2652                                 int offset, size_t size, int flags)
2653 {
2654         ssize_t res;
2655         struct msghdr msg = {.msg_flags = flags};
2656         struct kvec iov;
2657         char *kaddr = kmap(page);
2658
2659         iov.iov_base = kaddr + offset;
2660         iov.iov_len = size;
2661         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2662         kunmap(page);
2663         return res;
2664 }
2665 EXPORT_SYMBOL(sock_no_sendpage_locked);
2666
2667 /*
2668  *      Default Socket Callbacks
2669  */
2670
2671 static void sock_def_wakeup(struct sock *sk)
2672 {
2673         struct socket_wq *wq;
2674
2675         rcu_read_lock();
2676         wq = rcu_dereference(sk->sk_wq);
2677         if (skwq_has_sleeper(wq))
2678                 wake_up_interruptible_all(&wq->wait);
2679         rcu_read_unlock();
2680 }
2681
2682 static void sock_def_error_report(struct sock *sk)
2683 {
2684         struct socket_wq *wq;
2685
2686         rcu_read_lock();
2687         wq = rcu_dereference(sk->sk_wq);
2688         if (skwq_has_sleeper(wq))
2689                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2690         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2691         rcu_read_unlock();
2692 }
2693
2694 static void sock_def_readable(struct sock *sk)
2695 {
2696         struct socket_wq *wq;
2697
2698         rcu_read_lock();
2699         wq = rcu_dereference(sk->sk_wq);
2700         if (skwq_has_sleeper(wq))
2701                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2702                                                 EPOLLRDNORM | EPOLLRDBAND);
2703         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2704         rcu_read_unlock();
2705 }
2706
2707 static void sock_def_write_space(struct sock *sk)
2708 {
2709         struct socket_wq *wq;
2710
2711         rcu_read_lock();
2712
2713         /* Do not wake up a writer until he can make "significant"
2714          * progress.  --DaveM
2715          */
2716         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2717                 wq = rcu_dereference(sk->sk_wq);
2718                 if (skwq_has_sleeper(wq))
2719                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2720                                                 EPOLLWRNORM | EPOLLWRBAND);
2721
2722                 /* Should agree with poll, otherwise some programs break */
2723                 if (sock_writeable(sk))
2724                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2725         }
2726
2727         rcu_read_unlock();
2728 }
2729
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2733
2734 void sk_send_sigurg(struct sock *sk)
2735 {
2736         if (sk->sk_socket && sk->sk_socket->file)
2737                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2738                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2739 }
2740 EXPORT_SYMBOL(sk_send_sigurg);
2741
/*
 * (Re)arm @timer to fire at @expires. A socket reference is taken only
 * when mod_timer() returns 0 -- per the timer API that means the timer
 * was not already pending, so this is a newly armed timer.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
2748 EXPORT_SYMBOL(sk_reset_timer);
2749
/*
 * Cancel @timer. The socket reference is dropped with __sock_put() only
 * when del_timer() returns non-zero, i.e. (per the timer API) when a
 * pending timer was actually removed.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
        if (!del_timer(timer))
                return;

        __sock_put(sk);
}
2755 EXPORT_SYMBOL(sk_stop_timer);
2756
/*
 * Initialise the generic part of @sk and, when @sock is not NULL, link the
 * two objects to each other.
 *
 * Sets buffer sizes from the sysctl defaults, puts the sock in TCP_CLOSE
 * with SOCK_ZAPPED set, installs the default callbacks defined in this
 * file, resets timeouts/pacing/peek state and finally publishes the sock
 * with a reference count of 1.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        sk_init_common(sk);
        sk->sk_send_head        =       NULL;

        timer_setup(&sk->sk_timer, NULL, 0);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        if (sock) {
                /* Attached to a struct socket: inherit type, wait queue
                 * and owner uid (taken from the socket's inode).
                 */
                sk->sk_type     =       sock->type;
                sk->sk_wq       =       sock->wq;
                sock->sk        =       sk;
                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
        } else {
                /* No struct socket: no wait queue, and uid 0 mapped
                 * through the user namespace of the sock's netns.
                 */
                sk->sk_wq       =       NULL;
                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
        }

        /* Kernel and user sockets use separate lockdep classes for
         * sk_callback_lock, both indexed by address family.
         */
        rwlock_init(&sk->sk_callback_lock);
        if (sk->sk_kern_sock)
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_kern_callback_keys + sk->sk_family,
                        af_family_kern_clock_key_strings[sk->sk_family]);
        else
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Default callbacks; see the sock_def_*() helpers above. */
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_frag.page        =       NULL;
        sk->sk_frag.offset      =       0;
        sk->sk_peek_off         =       -1;

        sk->sk_peer_pid         =       NULL;
        sk->sk_peer_cred        =       NULL;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;
        atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id          =       0;
        sk->sk_ll_usec          =       sysctl_net_busy_read;
#endif

        /* ~0U: pacing rates start at the maximum unsigned value. */
        sk->sk_max_pacing_rate = ~0U;
        sk->sk_pacing_rate = ~0U;
        sk->sk_pacing_shift = 10;
        sk->sk_incoming_cpu = -1;
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
2831 EXPORT_SYMBOL(sock_init_data);
2832
/*
 * Take ownership of @sk on behalf of the calling process; may sleep.
 * @subclass is passed to lockdep's mutex_acquire() annotation.
 *
 * Ownership is recorded in sk_lock.owned under sk_lock.slock. If the sock
 * is already owned, __lock_sock() is called first (presumably it waits for
 * the owner to release -- its body is not visible here).
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        if (sk->sk_lock.owned)
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /* Drop only the spinlock here; BH stays disabled until the
         * lockdep annotation below has been made.
         */
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
        local_bh_enable();
}
2847 EXPORT_SYMBOL(lock_sock_nested);
2848
/*
 * Give up ownership of @sk.
 *
 * Under sk_lock.slock (BH disabled): run __release_sock() when the backlog
 * has a tail, call the protocol's release_cb() if it has one, clear the
 * ownership and wake anybody waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        /* A non-NULL tail means packets were queued to the backlog. */
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        /* Warning : release_cb() might need to release sk ownership,
         * ie call sock_release_ownership(sk) before us.
         */
        if (sk->sk_prot->release_cb)
                sk->sk_prot->release_cb(sk);

        sock_release_ownership(sk);
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
2866 EXPORT_SYMBOL(release_sock);
2867
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * Returns true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        if (!sk->sk_lock.owned)
                /*
                 * Fast path: return with the spinlock still held and BH
                 * still disabled, as described above.
                 */
                return false;

        /* Slow path: same sequence as lock_sock_nested() with subclass 0. */
        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
        local_bh_enable();
        return true;
}
2902 EXPORT_SYMBOL(lock_sock_fast);
2903
2904 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2905 {
2906         struct timeval tv;
2907         if (!sock_flag(sk, SOCK_TIMESTAMP))
2908                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2909         tv = ktime_to_timeval(sk->sk_stamp);
2910         if (tv.tv_sec == -1)
2911                 return -ENOENT;
2912         if (tv.tv_sec == 0) {
2913                 sk->sk_stamp = ktime_get_real();
2914                 tv = ktime_to_timeval(sk->sk_stamp);
2915         }
2916         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2917 }
2918 EXPORT_SYMBOL(sock_get_timestamp);
2919
2920 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2921 {
2922         struct timespec ts;
2923         if (!sock_flag(sk, SOCK_TIMESTAMP))
2924                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2925         ts = ktime_to_timespec(sk->sk_stamp);
2926         if (ts.tv_sec == -1)
2927                 return -ENOENT;
2928         if (ts.tv_sec == 0) {
2929                 sk->sk_stamp = ktime_get_real();
2930                 ts = ktime_to_timespec(sk->sk_stamp);
2931         }
2932         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2933 }
2934 EXPORT_SYMBOL(sock_get_timestampns);
2935
2936 void sock_enable_timestamp(struct sock *sk, int flag)
2937 {
2938         if (!sock_flag(sk, flag)) {
2939                 unsigned long previous_flags = sk->sk_flags;
2940
2941                 sock_set_flag(sk, flag);
2942                 /*
2943                  * we just set one of the two flags which require net
2944                  * time stamping, but time stamping might have been on
2945                  * already because of the other one
2946                  */
2947                 if (sock_needs_netstamp(sk) &&
2948                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2949                         net_enable_timestamp();
2950         }
2951 }
2952
2953 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2954                        int level, int type)
2955 {
2956         struct sock_exterr_skb *serr;
2957         struct sk_buff *skb;
2958         int copied, err;
2959
2960         err = -EAGAIN;
2961         skb = sock_dequeue_err_skb(sk);
2962         if (skb == NULL)
2963                 goto out;
2964
2965         copied = skb->len;
2966         if (copied > len) {
2967                 msg->msg_flags |= MSG_TRUNC;
2968                 copied = len;
2969         }
2970         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2971         if (err)
2972                 goto out_free_skb;
2973
2974         sock_recv_timestamp(msg, sk, skb);
2975
2976         serr = SKB_EXT_ERR(skb);
2977         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2978
2979         msg->msg_flags |= MSG_ERRQUEUE;
2980         err = copied;
2981
2982 out_free_skb:
2983         kfree_skb(skb);
2984 out:
2985         return err;
2986 }
2987 EXPORT_SYMBOL(sock_recv_errqueue);
2988
2989 /*
2990  *      Get a socket option on an socket.
2991  *
2992  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2993  *      asynchronous errors should be reported by getsockopt. We assume
2994  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2995  */
2996 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2997                            char __user *optval, int __user *optlen)
2998 {
2999         struct sock *sk = sock->sk;
3000
3001         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3002 }
3003 EXPORT_SYMBOL(sock_common_getsockopt);
3004
3005 #ifdef CONFIG_COMPAT
3006 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3007                                   char __user *optval, int __user *optlen)
3008 {
3009         struct sock *sk = sock->sk;
3010
3011         if (sk->sk_prot->compat_getsockopt != NULL)
3012                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3013                                                       optval, optlen);
3014         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3015 }
3016 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3017 #endif
3018
3019 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3020                         int flags)
3021 {
3022         struct sock *sk = sock->sk;
3023         int addr_len = 0;
3024         int err;
3025
3026         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3027                                    flags & ~MSG_DONTWAIT, &addr_len);
3028         if (err >= 0)
3029                 msg->msg_namelen = addr_len;
3030         return err;
3031 }
3032 EXPORT_SYMBOL(sock_common_recvmsg);
3033
3034 /*
3035  *      Set socket options on an inet socket.
3036  */
3037 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3038                            char __user *optval, unsigned int optlen)
3039 {
3040         struct sock *sk = sock->sk;
3041
3042         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3043 }
3044 EXPORT_SYMBOL(sock_common_setsockopt);
3045
3046 #ifdef CONFIG_COMPAT
3047 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3048                                   char __user *optval, unsigned int optlen)
3049 {
3050         struct sock *sk = sock->sk;
3051
3052         if (sk->sk_prot->compat_setsockopt != NULL)
3053                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3054                                                       optval, optlen);
3055         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3056 }
3057 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3058 #endif
3059
3060 void sk_common_release(struct sock *sk)
3061 {
3062         if (sk->sk_prot->destroy)
3063                 sk->sk_prot->destroy(sk);
3064
3065         /*
3066          * Observation: when sock_common_release is called, processes have
3067          * no access to socket. But net still has.
3068          * Step one, detach it from networking:
3069          *
3070          * A. Remove from hash tables.
3071          */
3072
3073         sk->sk_prot->unhash(sk);
3074
3075         /*
3076          * In this point socket cannot receive new packets, but it is possible
3077          * that some packets are in flight because some CPU runs receiver and
3078          * did hash table lookup before we unhashed socket. They will achieve
3079          * receive queue and will be purged by socket destructor.
3080          *
3081          * Also we still have packets pending on receive queue and probably,
3082          * our own packets waiting in device queues. sock_destroy will drain
3083          * receive queue, but transmitted packets will delay socket destruction
3084          * until the last reference will be released.
3085          */
3086
3087         sock_orphan(sk);
3088
3089         xfrm_sk_free_policy(sk);
3090
3091         sk_refcnt_debug_release(sk);
3092
3093         sock_put(sk);
3094 }
3095 EXPORT_SYMBOL(sk_common_release);
3096
3097 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3098 {
3099         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3100
3101         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3102         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3103         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3104         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3105         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3106         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3107         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3108         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3109         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3110 }
3111
3112 #ifdef CONFIG_PROC_FS
/* Number of per-protocol "in use" counter slots. */
#define PROTO_INUSE_NR  64      /* should be enough for the first time */
/* One counter per protocol slot; allocated per CPU for every netns. */
struct prot_inuse {
        int val[PROTO_INUSE_NR];
};

/* Bitmap of counter slots currently handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3119
3120 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3121 {
3122         __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3123 }
3124 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3125
3126 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3127 {
3128         int cpu, idx = prot->inuse_idx;
3129         int res = 0;
3130
3131         for_each_possible_cpu(cpu)
3132                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3133
3134         return res >= 0 ? res : 0;
3135 }
3136 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3137
3138 static void sock_inuse_add(struct net *net, int val)
3139 {
3140         this_cpu_add(*net->core.sock_inuse, val);
3141 }
3142
3143 int sock_inuse_get(struct net *net)
3144 {
3145         int cpu, res = 0;
3146
3147         for_each_possible_cpu(cpu)
3148                 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3149
3150         return res;
3151 }
3152
3153 EXPORT_SYMBOL_GPL(sock_inuse_get);
3154
3155 static int __net_init sock_inuse_init_net(struct net *net)
3156 {
3157         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3158         if (net->core.prot_inuse == NULL)
3159                 return -ENOMEM;
3160
3161         net->core.sock_inuse = alloc_percpu(int);
3162         if (net->core.sock_inuse == NULL)
3163                 goto out;
3164
3165         return 0;
3166
3167 out:
3168         free_percpu(net->core.prot_inuse);
3169         return -ENOMEM;
3170 }
3171
3172 static void __net_exit sock_inuse_exit_net(struct net *net)
3173 {
3174         free_percpu(net->core.prot_inuse);
3175         free_percpu(net->core.sock_inuse);
3176 }
3177
3178 static struct pernet_operations net_inuse_ops = {
3179         .init = sock_inuse_init_net,
3180         .exit = sock_inuse_exit_net,
3181 };
3182
3183 static __init int net_inuse_init(void)
3184 {
3185         if (register_pernet_subsys(&net_inuse_ops))
3186                 panic("Cannot initialize net inuse counters");
3187
3188         return 0;
3189 }
3190
3191 core_initcall(net_inuse_init);
3192
3193 static void assign_proto_idx(struct proto *prot)
3194 {
3195         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3196
3197         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3198                 pr_err("PROTO_INUSE_NR exhausted\n");
3199                 return;
3200         }
3201
3202         set_bit(prot->inuse_idx, proto_inuse_idx);
3203 }
3204
3205 static void release_proto_idx(struct proto *prot)
3206 {
3207         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3208                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3209 }
3210 #else
/* !CONFIG_PROC_FS: no in-use counters, so there is no slot to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}

/* !CONFIG_PROC_FS: nothing was assigned, nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}

/* !CONFIG_PROC_FS: socket in-use accounting is compiled out. */
static void sock_inuse_add(struct net *net, int val)
{
}
3222 #endif
3223
3224 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3225 {
3226         if (!rsk_prot)
3227                 return;
3228         kfree(rsk_prot->slab_name);
3229         rsk_prot->slab_name = NULL;
3230         kmem_cache_destroy(rsk_prot->slab);
3231         rsk_prot->slab = NULL;
3232 }
3233
3234 static int req_prot_init(const struct proto *prot)
3235 {
3236         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3237
3238         if (!rsk_prot)
3239                 return 0;
3240
3241         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3242                                         prot->name);
3243         if (!rsk_prot->slab_name)
3244                 return -ENOMEM;
3245
3246         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3247                                            rsk_prot->obj_size, 0,
3248                                            prot->slab_flags, NULL);
3249
3250         if (!rsk_prot->slab) {
3251                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3252                         prot->name);
3253                 return -ENOMEM;
3254         }
3255         return 0;
3256 }
3257
3258 int proto_register(struct proto *prot, int alloc_slab)
3259 {
3260         if (alloc_slab) {
3261                 prot->slab = kmem_cache_create_usercopy(prot->name,
3262                                         prot->obj_size, 0,
3263                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
3264                                         prot->useroffset, prot->usersize,
3265                                         NULL);
3266
3267                 if (prot->slab == NULL) {
3268                         pr_crit("%s: Can't create sock SLAB cache!\n",
3269                                 prot->name);
3270                         goto out;
3271                 }
3272
3273                 if (req_prot_init(prot))
3274                         goto out_free_request_sock_slab;
3275
3276                 if (prot->twsk_prot != NULL) {
3277                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3278
3279                         if (prot->twsk_prot->twsk_slab_name == NULL)
3280                                 goto out_free_request_sock_slab;
3281
3282                         prot->twsk_prot->twsk_slab =
3283                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3284                                                   prot->twsk_prot->twsk_obj_size,
3285                                                   0,
3286                                                   prot->slab_flags,
3287                                                   NULL);
3288                         if (prot->twsk_prot->twsk_slab == NULL)
3289                                 goto out_free_timewait_sock_slab_name;
3290                 }
3291         }
3292
3293         mutex_lock(&proto_list_mutex);
3294         list_add(&prot->node, &proto_list);
3295         assign_proto_idx(prot);
3296         mutex_unlock(&proto_list_mutex);
3297         return 0;
3298
3299 out_free_timewait_sock_slab_name:
3300         kfree(prot->twsk_prot->twsk_slab_name);
3301 out_free_request_sock_slab:
3302         req_prot_cleanup(prot->rsk_prot);
3303
3304         kmem_cache_destroy(prot->slab);
3305         prot->slab = NULL;
3306 out:
3307         return -ENOBUFS;
3308 }
3309 EXPORT_SYMBOL(proto_register);
3310
3311 void proto_unregister(struct proto *prot)
3312 {
3313         mutex_lock(&proto_list_mutex);
3314         release_proto_idx(prot);
3315         list_del(&prot->node);
3316         mutex_unlock(&proto_list_mutex);
3317
3318         kmem_cache_destroy(prot->slab);
3319         prot->slab = NULL;
3320
3321         req_prot_cleanup(prot->rsk_prot);
3322
3323         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3324                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3325                 kfree(prot->twsk_prot->twsk_slab_name);
3326                 prot->twsk_prot->twsk_slab = NULL;
3327         }
3328 }
3329 EXPORT_SYMBOL(proto_unregister);
3330
3331 int sock_load_diag_module(int family, int protocol)
3332 {
3333         if (!protocol) {
3334                 if (!sock_is_registered(family))
3335                         return -ENOENT;
3336
3337                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3338                                       NETLINK_SOCK_DIAG, family);
3339         }
3340
3341 #ifdef CONFIG_INET
3342         if (family == AF_INET &&
3343             !rcu_access_pointer(inet_protos[protocol]))
3344                 return -ENOENT;
3345 #endif
3346
3347         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3348                               NETLINK_SOCK_DIAG, family, protocol);
3349 }
3350 EXPORT_SYMBOL(sock_load_diag_module);
3351
3352 #ifdef CONFIG_PROC_FS
3353 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3354         __acquires(proto_list_mutex)
3355 {
3356         mutex_lock(&proto_list_mutex);
3357         return seq_list_start_head(&proto_list, *pos);
3358 }
3359
3360 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3361 {
3362         return seq_list_next(v, &proto_list, pos);
3363 }
3364
3365 static void proto_seq_stop(struct seq_file *seq, void *v)
3366         __releases(proto_list_mutex)
3367 {
3368         mutex_unlock(&proto_list_mutex);
3369 }
3370
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
3375 static long sock_prot_memory_allocated(struct proto *proto)
3376 {
3377         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3378 }
3379
3380 static char *sock_prot_memory_pressure(struct proto *proto)
3381 {
3382         return proto->memory_pressure != NULL ?
3383         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3384 }
3385
/*
 * Print one protocol line: name, object size, sockets in use in the
 * reader's netns, memory allocated, memory pressure, max header, whether a
 * slab exists and the owning module, followed by one y/n column per
 * struct proto method. The argument order must match the column order of
 * the header printed by proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   /* 19 method columns, one per %2c above */
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->sendpage),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}
3419
3420 static int proto_seq_show(struct seq_file *seq, void *v)
3421 {
3422         if (v == &proto_list)
3423                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3424                            "protocol",
3425                            "size",
3426                            "sockets",
3427                            "memory",
3428                            "press",
3429                            "maxhdr",
3430                            "slab",
3431                            "module",
3432                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3433         else
3434                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3435         return 0;
3436 }
3437
/*
 * seq_file iterator callbacks used by proto_seq_open() for the "protocols"
 * proc file: start/next/stop are the proto_seq_* iteration helpers and
 * show renders a single record through proto_seq_show().
 */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};
3444
3445 static int proto_seq_open(struct inode *inode, struct file *file)
3446 {
3447         return seq_open_net(inode, file, &proto_seq_ops,
3448                             sizeof(struct seq_net_private));
3449 }
3450
/*
 * File operations for the "protocols" proc entry.  Opening goes through
 * proto_seq_open(); reading and seeking use the generic seq_file helpers,
 * and release is seq_release_net(), pairing with the seq_open_net() call
 * made at open time.
 */
static const struct file_operations proto_seq_fops = {
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
3457
3458 static __net_init int proto_init_net(struct net *net)
3459 {
3460         if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops))
3461                 return -ENOMEM;
3462
3463         return 0;
3464 }
3465
/*
 * Per-namespace teardown: remove the "protocols" entry from net->proc_net,
 * i.e. the entry that proto_init_net() creates.
 */
static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}
3470
3471
/*
 * Per-network-namespace hooks registered by proto_init(): .init creates the
 * "protocols" proc entry and .exit removes it again.
 */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};
3476
/*
 * Init-time hook: register proto_net_ops as a pernet subsystem and return
 * whatever register_pernet_subsys() reports.
 */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

/* Have proto_init() invoked through the subsys initcall mechanism. */
subsys_initcall(proto_init);
3483
3484 #endif /* PROC_FS */
3485
3486 #ifdef CONFIG_NET_RX_BUSY_POLL
3487 bool sk_busy_loop_end(void *p, unsigned long start_time)
3488 {
3489         struct sock *sk = p;
3490
3491         return !skb_queue_empty(&sk->sk_receive_queue) ||
3492                sk_busy_loop_timeout(sk, start_time);
3493 }
3494 EXPORT_SYMBOL(sk_busy_loop_end);
3495 #endif /* CONFIG_NET_RX_BUSY_POLL */