net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <[email protected]>
  12  *              Florian La Roche, <[email protected]>
  13  *              Alan Cox, <[email protected]>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/sched/mm.h>
 106 #include <linux/timer.h>
 107 #include <linux/string.h>
 108 #include <linux/sockios.h>
 109 #include <linux/net.h>
 110 #include <linux/mm.h>
 111 #include <linux/slab.h>
 112 #include <linux/interrupt.h>
 113 #include <linux/poll.h>
 114 #include <linux/tcp.h>
 115 #include <linux/init.h>
 116 #include <linux/highmem.h>
 117 #include <linux/user_namespace.h>
 118 #include <linux/static_key.h>
 119 #include <linux/memcontrol.h>
 120 #include <linux/prefetch.h>
 121
 122 #include <linux/uaccess.h>
 123
 124 #include <linux/netdevice.h>
 125 #include <net/protocol.h>
 126 #include <linux/skbuff.h>
 127 #include <net/net_namespace.h>
 128 #include <net/request_sock.h>
 129 #include <net/sock.h>
 130 #include <linux/net_tstamp.h>
 131 #include <net/xfrm.h>
 132 #include <linux/ipsec.h>
 133 #include <net/cls_cgroup.h>
 134 #include <net/netprio_cgroup.h>
 135 #include <linux/sock_diag.h>
 136
 137 #include <linux/filter.h>
 138 #include <net/sock_reuseport.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144
  145 static DEFINE_MUTEX(proto_list_mutex);	/* NOTE(review): presumably serializes access to proto_list; its users are outside this chunk - confirm */
  146 static LIST_HEAD(proto_list);	/* list head; what gets linked here is not visible in this chunk */
 147
 148 /**
 149  * sk_ns_capable - General socket capability test
 150  * @sk: Socket to use a capability on or through
 151  * @user_ns: The user namespace of the capability to use
 152  * @cap: The capability to use
 153  *
 154  * Test to see if the opener of the socket had when the socket was
 155  * created and the current process has the capability @cap in the user
 156  * namespace @user_ns.
 157  */
 158 bool sk_ns_capable(const struct sock *sk,
 159                    struct user_namespace *user_ns, int cap)
 160 {
 161         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162                 ns_capable(user_ns, cap);
 163 }
 164 EXPORT_SYMBOL(sk_ns_capable);
 165
 166 /**
 167  * sk_capable - Socket global capability test
 168  * @sk: Socket to use a capability on or through
 169  * @cap: The global capability to use
 170  *
 171  * Test to see if the opener of the socket had when the socket was
 172  * created and the current process has the capability @cap in all user
 173  * namespaces.
 174  */
 175 bool sk_capable(const struct sock *sk, int cap)
 176 {
 177         return sk_ns_capable(sk, &init_user_ns, cap);
 178 }
 179 EXPORT_SYMBOL(sk_capable);
 180
 181 /**
 182  * sk_net_capable - Network namespace socket capability test
 183  * @sk: Socket to use a capability on or through
 184  * @cap: The capability to use
 185  *
 186  * Test to see if the opener of the socket had when the socket was created
 187  * and the current process has the capability @cap over the network namespace
 188  * the socket is a member of.
 189  */
 190 bool sk_net_capable(const struct sock *sk, int cap)
 191 {
 192         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193 }
 194 EXPORT_SYMBOL(sk_net_capable);
 195
  196 /*
  197  * Each address family might have different locking rules, so we keep
  198  * lock class keys in arrays with one slot per address family (AF_MAX
  199  * entries each), with separate arrays for internal and userspace sockets.
  200  */
  201 static struct lock_class_key af_family_keys[AF_MAX];
  202 static struct lock_class_key af_family_kern_keys[AF_MAX];	/* NOTE(review): "kern" presumably marks the internal-socket variant - confirm at the users */
  203 static struct lock_class_key af_family_slock_keys[AF_MAX];
  204 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 205
 206 /*
 207  * Make lock validator output more readable. (we pre-construct these
 208  * strings build-time, so that runtime initialization of socket
 209  * locks is fast):
 210  */
 211
 212 #define _sock_locks(x)                                            \
 213   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 214   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 215   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 216   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 217   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 218   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 219   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 220   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 221   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 222   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 223   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 224   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 225   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 226   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 227   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_MAX"
 228
 229 static const char *const af_family_key_strings[AF_MAX+1] = {
 230         _sock_locks("sk_lock-")
 231 };
 232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233         _sock_locks("slock-")
 234 };
 235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236         _sock_locks("clock-")
 237 };
 238
 239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240         _sock_locks("k-sk_lock-")
 241 };
 242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243         _sock_locks("k-slock-")
 244 };
 245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246         _sock_locks("k-clock-")
 247 };
 248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 250   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 251   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 252   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 253   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 254   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 255   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 256   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 257   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 258   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 259   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 260   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 261   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 262   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 263   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 264 };
 265 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 266   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 267   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 268   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 269   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 270   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 271   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 272   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 273   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 274   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 275   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 276   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 277   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 278   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 279   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 280   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 281 };
 282 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 283   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 284   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 285   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 286   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 287   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 288   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 289   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 290   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 291   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 292   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 293   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 294   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 295   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 296   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 297   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 298 };
 299
  300 /*
  301  * sk_callback_lock and sk queues locking rules are per-address-family,
  302  * so split the lock classes by using a per-AF key (AF_MAX slots each):
  303  */
  304 static struct lock_class_key af_callback_keys[AF_MAX];
  305 static struct lock_class_key af_rlock_keys[AF_MAX];	/* NOTE(review): presumably paired with the "rlock-" names - confirm at the users */
  306 static struct lock_class_key af_wlock_keys[AF_MAX];	/* NOTE(review): presumably paired with the "wlock-" names - confirm */
  307 static struct lock_class_key af_elock_keys[AF_MAX];	/* NOTE(review): presumably paired with the "elock-" names - confirm */
  308 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 309
  310 /* Take into consideration the size of the struct sk_buff overhead in the
  311  * determination of these values, since that is non-constant across
  312  * platforms.  This makes socket queueing behavior and performance
  313  * not depend upon such differences.
  314  */
  315 #define _SK_MEM_PACKETS         256	/* multiplier applied to the per-packet cost below */
  316 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)	/* per-packet cost; NOTE(review): presumably the true size of an skb carrying 256 data bytes - confirm */
  317 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  318 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)	/* same value as SK_WMEM_MAX */
 319
  320 /* Run time adjustable parameters. */
  321 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;	/* upper clamp applied to SO_SNDBUF requests in sock_setsockopt() */
  322 EXPORT_SYMBOL(sysctl_wmem_max);
  323 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;	/* upper clamp applied to SO_RCVBUF requests in sock_setsockopt() */
  324 EXPORT_SYMBOL(sysctl_rmem_max);
  325 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;	/* initial value equals the maximum above */
  326 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;	/* initial value equals the maximum above */
  327
  328 /* Maximal space eaten by iovec or ancillary data plus some space */
  329 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
  330 EXPORT_SYMBOL(sysctl_optmem_max);
  331
  332 int sysctl_tstamp_allow_data __read_mostly = 1;	/* enabled by default; consumers are outside this chunk */
  333
  334 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;	/* incremented by sk_set_memalloc(), decremented by sk_clear_memalloc() */
  335 EXPORT_SYMBOL_GPL(memalloc_socks);
 336
  337 /**
  338  * sk_set_memalloc - sets %SOCK_MEMALLOC
  339  * @sk: socket to set it on
  340  *
  341  * Set %SOCK_MEMALLOC on @sk for access to emergency reserves, OR
  342  * __GFP_MEMALLOC into sk_allocation and bump memalloc_socks.  The admin
  343  * is responsible for adjusting min_free_kbytes to meet the requirements.
  344  */
  345 void sk_set_memalloc(struct sock *sk)
  346 {
  347         sock_set_flag(sk, SOCK_MEMALLOC);
  348         sk->sk_allocation |= __GFP_MEMALLOC;	/* later allocations made with this mask carry the flag */
  349         static_key_slow_inc(&memalloc_socks);	/* one increment per call; sk_clear_memalloc() decrements */
  350 }
  351 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 352
  353 void sk_clear_memalloc(struct sock *sk)
  354 {
              /*
               * Reverse the three steps of sk_set_memalloc(): clear the
               * SOCK_MEMALLOC flag, strip __GFP_MEMALLOC from the socket's
               * allocation mask and decrement the memalloc_socks static key.
               */
  355         sock_reset_flag(sk, SOCK_MEMALLOC);
  356         sk->sk_allocation &= ~__GFP_MEMALLOC;
  357         static_key_slow_dec(&memalloc_socks);
  358
  359         /*
  360          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
  361          * progress of swapping. SOCK_MEMALLOC may be cleared while
  362          * it has rmem allocations due to the last swapfile being deactivated
  363          * but there is a risk that the socket is unusable due to exceeding
  364          * the rmem limits. Reclaim the reserves and obey rmem limits again.
  365          */
  366         sk_mem_reclaim(sk);
  367 }
  368 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 369
  370 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
  371 {
              /*
               * Hand @skb to the socket's sk_backlog_rcv callback and return
               * whatever the callback returns.  The call is bracketed by
               * memalloc_noreclaim_save()/memalloc_noreclaim_restore().
               */
  372         int ret;
  373         unsigned int noreclaim_flag;
  374
  375         /* these should have been dropped before queueing */
  376         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));	/* only valid for sockets that have SOCK_MEMALLOC set */
  377
  378         noreclaim_flag = memalloc_noreclaim_save();
  379         ret = sk->sk_backlog_rcv(sk, skb);
  380         memalloc_noreclaim_restore(noreclaim_flag);	/* put back the state saved above */
  381
  382         return ret;
  383 }
  384 EXPORT_SYMBOL(__sk_backlog_rcv);
 385
 386 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 387 {
 388         struct timeval tv;
 389
 390         if (optlen < sizeof(tv))
 391                 return -EINVAL;
 392         if (copy_from_user(&tv, optval, sizeof(tv)))
 393                 return -EFAULT;
 394         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 395                 return -EDOM;
 396
 397         if (tv.tv_sec < 0) {
 398                 static int warned __read_mostly;
 399
 400                 *timeo_p = 0;
 401                 if (warned < 10 && net_ratelimit()) {
 402                         warned++;
 403                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 404                                 __func__, current->comm, task_pid_nr(current));
 405                 }
 406                 return 0;
 407         }
 408         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 409         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 410                 return 0;
 411         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 412                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 413         return 0;
 414 }
 415
 416 static void sock_warn_obsolete_bsdism(const char *name)
 417 {
 418         static int warned;
 419         static char warncomm[TASK_COMM_LEN];
 420         if (strcmp(warncomm, current->comm) && warned < 5) {
 421                 strcpy(warncomm,  current->comm);
 422                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 423                         warncomm, name);
 424                 warned++;
 425         }
 426 }
 427
 428 static bool sock_needs_netstamp(const struct sock *sk)
 429 {
 430         switch (sk->sk_family) {
 431         case AF_UNSPEC:
 432         case AF_UNIX:
 433                 return false;
 434         default:
 435                 return true;
 436         }
 437 }
 438
 439 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 440 {
 441         if (sk->sk_flags & flags) {
 442                 sk->sk_flags &= ~flags;
 443                 if (sock_needs_netstamp(sk) &&
 444                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 445                         net_disable_timestamp();
 446         }
 447 }
 448
 449
 450 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 451 {
 452         unsigned long flags;
 453         struct sk_buff_head *list = &sk->sk_receive_queue;
 454
 455         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 456                 atomic_inc(&sk->sk_drops);
 457                 trace_sock_rcvqueue_full(sk, skb);
 458                 return -ENOMEM;
 459         }
 460
 461         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 462                 atomic_inc(&sk->sk_drops);
 463                 return -ENOBUFS;
 464         }
 465
 466         skb->dev = NULL;
 467         skb_set_owner_r(skb, sk);
 468
 469         /* we escape from rcu protected region, make sure we dont leak
 470          * a norefcounted dst
 471          */
 472         skb_dst_force(skb);
 473
 474         spin_lock_irqsave(&list->lock, flags);
 475         sock_skb_set_dropcount(sk, skb);
 476         __skb_queue_tail(list, skb);
 477         spin_unlock_irqrestore(&list->lock, flags);
 478
 479         if (!sock_flag(sk, SOCK_DEAD))
 480                 sk->sk_data_ready(sk);
 481         return 0;
 482 }
 483 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 484
 485 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 486 {
 487         int err;
 488
 489         err = sk_filter(sk, skb);
 490         if (err)
 491                 return err;
 492
 493         return __sock_queue_rcv_skb(sk, skb);
 494 }
 495 EXPORT_SYMBOL(sock_queue_rcv_skb);
 496
  497 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
  498                      const int nested, unsigned int trim_cap, bool refcounted)
  499 {
              /*
               * Filter @skb, then either process it at once through
               * sk_backlog_rcv() (socket not owned by a user context) or add
               * it to the socket backlog.  @nested only selects
               * bh_lock_sock_nested() over bh_lock_sock().  When @refcounted
               * is true, sock_put() is called on @sk on every exit path.
               *
               * Returns the sk_backlog_rcv() result when that path runs.  On
               * every other path, including the ones that free @skb, rc is
               * never reassigned, so NET_RX_SUCCESS is returned.
               */
  500         int rc = NET_RX_SUCCESS;
  501
  502         if (sk_filter_trim_cap(sk, skb, trim_cap))
  503                 goto discard_and_relse;
  504
  505         skb->dev = NULL;
  506
  507         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
  508                 atomic_inc(&sk->sk_drops);
  509                 goto discard_and_relse;
  510         }
  511         if (nested)
  512                 bh_lock_sock_nested(sk);
  513         else
  514                 bh_lock_sock(sk);
  515         if (!sock_owned_by_user(sk)) {
  516                 /*
  517                  * trylock + unlock semantics:
  518                  */
  519                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
  520
  521                 rc = sk_backlog_rcv(sk, skb);
  522
  523                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
  524         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
  525                 bh_unlock_sock(sk);	/* backlog add failed: unlock here because the jump below skips the common unlock */
  526                 atomic_inc(&sk->sk_drops);
  527                 goto discard_and_relse;
  528         }
  529
  530         bh_unlock_sock(sk);
  531 out:
  532         if (refcounted)
  533                 sock_put(sk);
  534         return rc;
  535 discard_and_relse:
  536         kfree_skb(skb);	/* drop paths free the skb, then share the sock_put()/return code at "out" */
  537         goto out;
  538 }
  539 EXPORT_SYMBOL(__sk_receive_skb);
 540
 541 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 542 {
 543         struct dst_entry *dst = __sk_dst_get(sk);
 544
 545         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 546                 sk_tx_queue_clear(sk);
 547                 sk->sk_dst_pending_confirm = 0;
 548                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 549                 dst_release(dst);
 550                 return NULL;
 551         }
 552
 553         return dst;
 554 }
 555 EXPORT_SYMBOL(__sk_dst_check);
 556
 557 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 558 {
 559         struct dst_entry *dst = sk_dst_get(sk);
 560
 561         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 562                 sk_dst_reset(sk);
 563                 dst_release(dst);
 564                 return NULL;
 565         }
 566
 567         return dst;
 568 }
 569 EXPORT_SYMBOL(sk_dst_check);
 570
 571 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 572                                 int optlen)
 573 {
 574         int ret = -ENOPROTOOPT;
 575 #ifdef CONFIG_NETDEVICES
 576         struct net *net = sock_net(sk);
 577         char devname[IFNAMSIZ];
 578         int index;
 579
 580         /* Sorry... */
 581         ret = -EPERM;
 582         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 583                 goto out;
 584
 585         ret = -EINVAL;
 586         if (optlen < 0)
 587                 goto out;
 588
 589         /* Bind this socket to a particular device like "eth0",
 590          * as specified in the passed interface name. If the
 591          * name is "" or the option length is zero the socket
 592          * is not bound.
 593          */
 594         if (optlen > IFNAMSIZ - 1)
 595                 optlen = IFNAMSIZ - 1;
 596         memset(devname, 0, sizeof(devname));
 597
 598         ret = -EFAULT;
 599         if (copy_from_user(devname, optval, optlen))
 600                 goto out;
 601
 602         index = 0;
 603         if (devname[0] != '\0') {
 604                 struct net_device *dev;
 605
 606                 rcu_read_lock();
 607                 dev = dev_get_by_name_rcu(net, devname);
 608                 if (dev)
 609                         index = dev->ifindex;
 610                 rcu_read_unlock();
 611                 ret = -ENODEV;
 612                 if (!dev)
 613                         goto out;
 614         }
 615
 616         lock_sock(sk);
 617         sk->sk_bound_dev_if = index;
 618         sk_dst_reset(sk);
 619         release_sock(sk);
 620
 621         ret = 0;
 622
 623 out:
 624 #endif
 625
 626         return ret;
 627 }
 628
 629 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 630                                 int __user *optlen, int len)
 631 {
 632         int ret = -ENOPROTOOPT;
 633 #ifdef CONFIG_NETDEVICES
 634         struct net *net = sock_net(sk);
 635         char devname[IFNAMSIZ];
 636
 637         if (sk->sk_bound_dev_if == 0) {
 638                 len = 0;
 639                 goto zero;
 640         }
 641
 642         ret = -EINVAL;
 643         if (len < IFNAMSIZ)
 644                 goto out;
 645
 646         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 647         if (ret)
 648                 goto out;
 649
 650         len = strlen(devname) + 1;
 651
 652         ret = -EFAULT;
 653         if (copy_to_user(optval, devname, len))
 654                 goto out;
 655
 656 zero:
 657         ret = -EFAULT;
 658         if (put_user(len, optlen))
 659                 goto out;
 660
 661         ret = 0;
 662
 663 out:
 664 #endif
 665
 666         return ret;
 667 }
 668
 669 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 670 {
 671         if (valbool)
 672                 sock_set_flag(sk, bit);
 673         else
 674                 sock_reset_flag(sk, bit);
 675 }
 676
 677 bool sk_mc_loop(struct sock *sk)
 678 {
 679         if (dev_recursion_level())
 680                 return false;
 681         if (!sk)
 682                 return true;
 683         switch (sk->sk_family) {
 684         case AF_INET:
 685                 return inet_sk(sk)->mc_loop;
 686 #if IS_ENABLED(CONFIG_IPV6)
 687         case AF_INET6:
 688                 return inet6_sk(sk)->mc_loop;
 689 #endif
 690         }
 691         WARN_ON(1);
 692         return true;
 693 }
 694 EXPORT_SYMBOL(sk_mc_loop);
 695
 696 /*
 697  *      This is meant for all protocols to use and covers goings on
 698  *      at the socket level. Everything here is generic.
 699  */
 700
 701 int sock_setsockopt(struct socket *sock, int level, int optname,
 702                     char __user *optval, unsigned int optlen)
 703 {
 704         struct sock *sk = sock->sk;
 705         int val;
 706         int valbool;
 707         struct linger ling;
 708         int ret = 0;
 709
 710         /*
 711          *      Options without arguments
 712          */
 713
 714         if (optname == SO_BINDTODEVICE)
 715                 return sock_setbindtodevice(sk, optval, optlen);
 716
 717         if (optlen < sizeof(int))
 718                 return -EINVAL;
 719
 720         if (get_user(val, (int __user *)optval))
 721                 return -EFAULT;
 722
 723         valbool = val ? 1 : 0;
 724
 725         lock_sock(sk);
 726
 727         switch (optname) {
 728         case SO_DEBUG:
 729                 if (val && !capable(CAP_NET_ADMIN))
 730                         ret = -EACCES;
 731                 else
 732                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 733                 break;
 734         case SO_REUSEADDR:
 735                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 736                 break;
 737         case SO_REUSEPORT:
 738                 sk->sk_reuseport = valbool;
 739                 break;
 740         case SO_TYPE:
 741         case SO_PROTOCOL:
 742         case SO_DOMAIN:
 743         case SO_ERROR:
 744                 ret = -ENOPROTOOPT;
 745                 break;
 746         case SO_DONTROUTE:
 747                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 748                 break;
 749         case SO_BROADCAST:
 750                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 751                 break;
 752         case SO_SNDBUF:
 753                 /* Don't error on this BSD doesn't and if you think
 754                  * about it this is right. Otherwise apps have to
 755                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 756                  * are treated in BSD as hints
 757                  */
 758                 val = min_t(u32, val, sysctl_wmem_max);
 759 set_sndbuf:
 760                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 761                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 762                 /* Wake up sending tasks if we upped the value. */
 763                 sk->sk_write_space(sk);
 764                 break;
 765
 766         case SO_SNDBUFFORCE:
 767                 if (!capable(CAP_NET_ADMIN)) {
 768                         ret = -EPERM;
 769                         break;
 770                 }
 771                 goto set_sndbuf;
 772
 773         case SO_RCVBUF:
 774                 /* Don't error on this BSD doesn't and if you think
 775                  * about it this is right. Otherwise apps have to
 776                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 777                  * are treated in BSD as hints
 778                  */
 779                 val = min_t(u32, val, sysctl_rmem_max);
 780 set_rcvbuf:
 781                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 782                 /*
 783                  * We double it on the way in to account for
 784                  * "struct sk_buff" etc. overhead.   Applications
 785                  * assume that the SO_RCVBUF setting they make will
 786                  * allow that much actual data to be received on that
 787                  * socket.
 788                  *
 789                  * Applications are unaware that "struct sk_buff" and
 790                  * other overheads allocate from the receive buffer
 791                  * during socket buffer allocation.
 792                  *
 793                  * And after considering the possible alternatives,
 794                  * returning the value we actually used in getsockopt
 795                  * is the most desirable behavior.
 796                  */
 797                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 798                 break;
 799
 800         case SO_RCVBUFFORCE:
 801                 if (!capable(CAP_NET_ADMIN)) {
 802                         ret = -EPERM;
 803                         break;
 804                 }
 805                 goto set_rcvbuf;
 806
 807         case SO_KEEPALIVE:
 808                 if (sk->sk_prot->keepalive)
 809                         sk->sk_prot->keepalive(sk, valbool);
 810                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 811                 break;
 812
 813         case SO_OOBINLINE:
 814                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 815                 break;
 816
 817         case SO_NO_CHECK:
 818                 sk->sk_no_check_tx = valbool;
 819                 break;
 820
 821         case SO_PRIORITY:
 822                 if ((val >= 0 && val <= 6) ||
 823                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 824                         sk->sk_priority = val;
 825                 else
 826                         ret = -EPERM;
 827                 break;
 828
 829         case SO_LINGER:
 830                 if (optlen < sizeof(ling)) {
 831                         ret = -EINVAL;  /* 1003.1g */
 832                         break;
 833                 }
 834                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 835                         ret = -EFAULT;
 836                         break;
 837                 }
 838                 if (!ling.l_onoff)
 839                         sock_reset_flag(sk, SOCK_LINGER);
 840                 else {
 841 #if (BITS_PER_LONG == 32)
 842                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 843                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 844                         else
 845 #endif
 846                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 847                         sock_set_flag(sk, SOCK_LINGER);
 848                 }
 849                 break;
 850
 851         case SO_BSDCOMPAT:
 852                 sock_warn_obsolete_bsdism("setsockopt");
 853                 break;
 854
 855         case SO_PASSCRED:
 856                 if (valbool)
 857                         set_bit(SOCK_PASSCRED, &sock->flags);
 858                 else
 859                         clear_bit(SOCK_PASSCRED, &sock->flags);
 860                 break;
 861
 862         case SO_TIMESTAMP:
 863         case SO_TIMESTAMPNS:
 864                 if (valbool)  {
 865                         if (optname == SO_TIMESTAMP)
 866                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 867                         else
 868                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 869                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 870                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 871                 } else {
 872                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 873                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 874                 }
 875                 break;
 876
 877         case SO_TIMESTAMPING:
 878                 if (val & ~SOF_TIMESTAMPING_MASK) {
 879                         ret = -EINVAL;
 880                         break;
 881                 }
 882
 883                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 884                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 885                         if (sk->sk_protocol == IPPROTO_TCP &&
 886                             sk->sk_type == SOCK_STREAM) {
 887                                 if ((1 << sk->sk_state) &
 888                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 889                                         ret = -EINVAL;
 890                                         break;
 891                                 }
 892                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 893                         } else {
 894                                 sk->sk_tskey = 0;
 895                         }
 896                 }
 897
 898                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 899                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 900                         ret = -EINVAL;
 901                         break;
 902                 }
 903
 904                 sk->sk_tsflags = val;
 905                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 906                         sock_enable_timestamp(sk,
 907                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 908                 else
 909                         sock_disable_timestamp(sk,
 910                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 911                 break;
 912
 913         case SO_RCVLOWAT:
 914                 if (val < 0)
 915                         val = INT_MAX;
 916                 sk->sk_rcvlowat = val ? : 1;
 917                 break;
 918
 919         case SO_RCVTIMEO:
 920                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 921                 break;
 922
 923         case SO_SNDTIMEO:
 924                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 925                 break;
 926
 927         case SO_ATTACH_FILTER:
 928                 ret = -EINVAL;
 929                 if (optlen == sizeof(struct sock_fprog)) {
 930                         struct sock_fprog fprog;
 931
 932                         ret = -EFAULT;
 933                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 934                                 break;
 935
 936                         ret = sk_attach_filter(&fprog, sk);
 937                 }
 938                 break;
 939
 940         case SO_ATTACH_BPF:
 941                 ret = -EINVAL;
 942                 if (optlen == sizeof(u32)) {
 943                         u32 ufd;
 944
 945                         ret = -EFAULT;
 946                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 947                                 break;
 948
 949                         ret = sk_attach_bpf(ufd, sk);
 950                 }
 951                 break;
 952
 953         case SO_ATTACH_REUSEPORT_CBPF:
 954                 ret = -EINVAL;
 955                 if (optlen == sizeof(struct sock_fprog)) {
 956                         struct sock_fprog fprog;
 957
 958                         ret = -EFAULT;
 959                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 960                                 break;
 961
 962                         ret = sk_reuseport_attach_filter(&fprog, sk);
 963                 }
 964                 break;
 965
 966         case SO_ATTACH_REUSEPORT_EBPF:
 967                 ret = -EINVAL;
 968                 if (optlen == sizeof(u32)) {
 969                         u32 ufd;
 970
 971                         ret = -EFAULT;
 972                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 973                                 break;
 974
 975                         ret = sk_reuseport_attach_bpf(ufd, sk);
 976                 }
 977                 break;
 978
 979         case SO_DETACH_FILTER:
 980                 ret = sk_detach_filter(sk);
 981                 break;
 982
 983         case SO_LOCK_FILTER:
 984                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 985                         ret = -EPERM;
 986                 else
 987                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 988                 break;
 989
 990         case SO_PASSSEC:
 991                 if (valbool)
 992                         set_bit(SOCK_PASSSEC, &sock->flags);
 993                 else
 994                         clear_bit(SOCK_PASSSEC, &sock->flags);
 995                 break;
 996         case SO_MARK:
 997                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 998                         ret = -EPERM;
 999                 else
1000                         sk->sk_mark = val;
1001                 break;
1002
1003         case SO_RXQ_OVFL:
1004                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1005                 break;
1006
1007         case SO_WIFI_STATUS:
1008                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1009                 break;
1010
1011         case SO_PEEK_OFF:
1012                 if (sock->ops->set_peek_off)
1013                         ret = sock->ops->set_peek_off(sk, val);
1014                 else
1015                         ret = -EOPNOTSUPP;
1016                 break;
1017
1018         case SO_NOFCS:
1019                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1020                 break;
1021
1022         case SO_SELECT_ERR_QUEUE:
1023                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1024                 break;
1025
1026 #ifdef CONFIG_NET_RX_BUSY_POLL
1027         case SO_BUSY_POLL:
1028                 /* allow unprivileged users to decrease the value */
1029                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1030                         ret = -EPERM;
1031                 else {
1032                         if (val < 0)
1033                                 ret = -EINVAL;
1034                         else
1035                                 sk->sk_ll_usec = val;
1036                 }
1037                 break;
1038 #endif
1039
1040         case SO_MAX_PACING_RATE:
1041                 sk->sk_max_pacing_rate = val;
1042                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1043                                          sk->sk_max_pacing_rate);
1044                 break;
1045
1046         case SO_INCOMING_CPU:
1047                 sk->sk_incoming_cpu = val;
1048                 break;
1049
1050         case SO_CNX_ADVICE:
1051                 if (val == 1)
1052                         dst_negative_advice(sk);
1053                 break;
1054         default:
1055                 ret = -ENOPROTOOPT;
1056                 break;
1057         }
1058         release_sock(sk);
1059         return ret;
1060 }
1061 EXPORT_SYMBOL(sock_setsockopt);
1062
1063
1064 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1065                           struct ucred *ucred)
1066 {
1067         ucred->pid = pid_vnr(pid);
1068         ucred->uid = ucred->gid = -1;
1069         if (cred) {
1070                 struct user_namespace *current_ns = current_user_ns();
1071
1072                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1073                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1074         }
1075 }
1076
1077 int sock_getsockopt(struct socket *sock, int level, int optname,
1078                     char __user *optval, int __user *optlen)
1079 {
1080         struct sock *sk = sock->sk;
1081
1082         union {
1083                 int val;
1084                 u64 val64;
1085                 struct linger ling;
1086                 struct timeval tm;
1087         } v;
1088
1089         int lv = sizeof(int);
1090         int len;
1091
1092         if (get_user(len, optlen))
1093                 return -EFAULT;
1094         if (len < 0)
1095                 return -EINVAL;
1096
1097         memset(&v, 0, sizeof(v));
1098
1099         switch (optname) {
1100         case SO_DEBUG:
1101                 v.val = sock_flag(sk, SOCK_DBG);
1102                 break;
1103
1104         case SO_DONTROUTE:
1105                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1106                 break;
1107
1108         case SO_BROADCAST:
1109                 v.val = sock_flag(sk, SOCK_BROADCAST);
1110                 break;
1111
1112         case SO_SNDBUF:
1113                 v.val = sk->sk_sndbuf;
1114                 break;
1115
1116         case SO_RCVBUF:
1117                 v.val = sk->sk_rcvbuf;
1118                 break;
1119
1120         case SO_REUSEADDR:
1121                 v.val = sk->sk_reuse;
1122                 break;
1123
1124         case SO_REUSEPORT:
1125                 v.val = sk->sk_reuseport;
1126                 break;
1127
1128         case SO_KEEPALIVE:
1129                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1130                 break;
1131
1132         case SO_TYPE:
1133                 v.val = sk->sk_type;
1134                 break;
1135
1136         case SO_PROTOCOL:
1137                 v.val = sk->sk_protocol;
1138                 break;
1139
1140         case SO_DOMAIN:
1141                 v.val = sk->sk_family;
1142                 break;
1143
1144         case SO_ERROR:
1145                 v.val = -sock_error(sk);
1146                 if (v.val == 0)
1147                         v.val = xchg(&sk->sk_err_soft, 0);
1148                 break;
1149
1150         case SO_OOBINLINE:
1151                 v.val = sock_flag(sk, SOCK_URGINLINE);
1152                 break;
1153
1154         case SO_NO_CHECK:
1155                 v.val = sk->sk_no_check_tx;
1156                 break;
1157
1158         case SO_PRIORITY:
1159                 v.val = sk->sk_priority;
1160                 break;
1161
1162         case SO_LINGER:
1163                 lv              = sizeof(v.ling);
1164                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1165                 v.ling.l_linger = sk->sk_lingertime / HZ;
1166                 break;
1167
1168         case SO_BSDCOMPAT:
1169                 sock_warn_obsolete_bsdism("getsockopt");
1170                 break;
1171
1172         case SO_TIMESTAMP:
1173                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1174                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1175                 break;
1176
1177         case SO_TIMESTAMPNS:
1178                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1179                 break;
1180
1181         case SO_TIMESTAMPING:
1182                 v.val = sk->sk_tsflags;
1183                 break;
1184
1185         case SO_RCVTIMEO:
1186                 lv = sizeof(struct timeval);
1187                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1188                         v.tm.tv_sec = 0;
1189                         v.tm.tv_usec = 0;
1190                 } else {
1191                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1192                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1193                 }
1194                 break;
1195
1196         case SO_SNDTIMEO:
1197                 lv = sizeof(struct timeval);
1198                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1199                         v.tm.tv_sec = 0;
1200                         v.tm.tv_usec = 0;
1201                 } else {
1202                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1203                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1204                 }
1205                 break;
1206
1207         case SO_RCVLOWAT:
1208                 v.val = sk->sk_rcvlowat;
1209                 break;
1210
1211         case SO_SNDLOWAT:
1212                 v.val = 1;
1213                 break;
1214
1215         case SO_PASSCRED:
1216                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1217                 break;
1218
1219         case SO_PEERCRED:
1220         {
1221                 struct ucred peercred;
1222                 if (len > sizeof(peercred))
1223                         len = sizeof(peercred);
1224                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1225                 if (copy_to_user(optval, &peercred, len))
1226                         return -EFAULT;
1227                 goto lenout;
1228         }
1229
1230         case SO_PEERNAME:
1231         {
1232                 char address[128];
1233
1234                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1235                         return -ENOTCONN;
1236                 if (lv < len)
1237                         return -EINVAL;
1238                 if (copy_to_user(optval, address, len))
1239                         return -EFAULT;
1240                 goto lenout;
1241         }
1242
1243         /* Dubious BSD thing... Probably nobody even uses it, but
1244          * the UNIX standard wants it for whatever reason... -DaveM
1245          */
1246         case SO_ACCEPTCONN:
1247                 v.val = sk->sk_state == TCP_LISTEN;
1248                 break;
1249
1250         case SO_PASSSEC:
1251                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1252                 break;
1253
1254         case SO_PEERSEC:
1255                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1256
1257         case SO_MARK:
1258                 v.val = sk->sk_mark;
1259                 break;
1260
1261         case SO_RXQ_OVFL:
1262                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1263                 break;
1264
1265         case SO_WIFI_STATUS:
1266                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1267                 break;
1268
1269         case SO_PEEK_OFF:
1270                 if (!sock->ops->set_peek_off)
1271                         return -EOPNOTSUPP;
1272
1273                 v.val = sk->sk_peek_off;
1274                 break;
1275         case SO_NOFCS:
1276                 v.val = sock_flag(sk, SOCK_NOFCS);
1277                 break;
1278
1279         case SO_BINDTODEVICE:
1280                 return sock_getbindtodevice(sk, optval, optlen, len);
1281
1282         case SO_GET_FILTER:
1283                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1284                 if (len < 0)
1285                         return len;
1286
1287                 goto lenout;
1288
1289         case SO_LOCK_FILTER:
1290                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1291                 break;
1292
1293         case SO_BPF_EXTENSIONS:
1294                 v.val = bpf_tell_extensions();
1295                 break;
1296
1297         case SO_SELECT_ERR_QUEUE:
1298                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1299                 break;
1300
1301 #ifdef CONFIG_NET_RX_BUSY_POLL
1302         case SO_BUSY_POLL:
1303                 v.val = sk->sk_ll_usec;
1304                 break;
1305 #endif
1306
1307         case SO_MAX_PACING_RATE:
1308                 v.val = sk->sk_max_pacing_rate;
1309                 break;
1310
1311         case SO_INCOMING_CPU:
1312                 v.val = sk->sk_incoming_cpu;
1313                 break;
1314
1315         case SO_MEMINFO:
1316         {
1317                 u32 meminfo[SK_MEMINFO_VARS];
1318
1319                 if (get_user(len, optlen))
1320                         return -EFAULT;
1321
1322                 sk_get_meminfo(sk, meminfo);
1323
1324                 len = min_t(unsigned int, len, sizeof(meminfo));
1325                 if (copy_to_user(optval, &meminfo, len))
1326                         return -EFAULT;
1327
1328                 goto lenout;
1329         }
1330
1331 #ifdef CONFIG_NET_RX_BUSY_POLL
1332         case SO_INCOMING_NAPI_ID:
1333                 v.val = READ_ONCE(sk->sk_napi_id);
1334
1335                 /* aggregate non-NAPI IDs down to 0 */
1336                 if (v.val < MIN_NAPI_ID)
1337                         v.val = 0;
1338
1339                 break;
1340 #endif
1341
1342         case SO_COOKIE:
1343                 lv = sizeof(u64);
1344                 if (len < lv)
1345                         return -EINVAL;
1346                 v.val64 = sock_gen_cookie(sk);
1347                 break;
1348
1349         default:
1350                 /* We implement the SO_SNDLOWAT etc to not be settable
1351                  * (1003.1g 7).
1352                  */
1353                 return -ENOPROTOOPT;
1354         }
1355
1356         if (len > lv)
1357                 len = lv;
1358         if (copy_to_user(optval, &v, len))
1359                 return -EFAULT;
1360 lenout:
1361         if (put_user(len, optlen))
1362                 return -EFAULT;
1363         return 0;
1364 }
1365
1366 /*
1367  * Initialize an sk_lock.
1368  *
1369  * (We also register the sk_lock with the lock validator.)
1370  */
1371 static inline void sock_lock_init(struct sock *sk)
1372 {
1373         if (sk->sk_kern_sock)
1374                 sock_lock_init_class_and_name(
1375                         sk,
1376                         af_family_kern_slock_key_strings[sk->sk_family],
1377                         af_family_kern_slock_keys + sk->sk_family,
1378                         af_family_kern_key_strings[sk->sk_family],
1379                         af_family_kern_keys + sk->sk_family);
1380         else
1381                 sock_lock_init_class_and_name(
1382                         sk,
1383                         af_family_slock_key_strings[sk->sk_family],
1384                         af_family_slock_keys + sk->sk_family,
1385                         af_family_key_strings[sk->sk_family],
1386                         af_family_keys + sk->sk_family);
1387 }
1388
1389 /*
1390  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1391  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1392  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1393  */
1394 static void sock_copy(struct sock *nsk, const struct sock *osk)
1395 {
1396 #ifdef CONFIG_SECURITY_NETWORK
1397         void *sptr = nsk->sk_security;
1398 #endif
1399         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1400
1401         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1402                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1403
1404 #ifdef CONFIG_SECURITY_NETWORK
1405         nsk->sk_security = sptr;
1406         security_sk_clone(osk, nsk);
1407 #endif
1408 }
1409
1410 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1411                 int family)
1412 {
1413         struct sock *sk;
1414         struct kmem_cache *slab;
1415
1416         slab = prot->slab;
1417         if (slab != NULL) {
1418                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1419                 if (!sk)
1420                         return sk;
1421                 if (priority & __GFP_ZERO)
1422                         sk_prot_clear_nulls(sk, prot->obj_size);
1423         } else
1424                 sk = kmalloc(prot->obj_size, priority);
1425
1426         if (sk != NULL) {
1427                 kmemcheck_annotate_bitfield(sk, flags);
1428
1429                 if (security_sk_alloc(sk, family, priority))
1430                         goto out_free;
1431
1432                 if (!try_module_get(prot->owner))
1433                         goto out_free_sec;
1434                 sk_tx_queue_clear(sk);
1435         }
1436
1437         return sk;
1438
1439 out_free_sec:
1440         security_sk_free(sk);
1441 out_free:
1442         if (slab != NULL)
1443                 kmem_cache_free(slab, sk);
1444         else
1445                 kfree(sk);
1446         return NULL;
1447 }
1448
1449 static void sk_prot_free(struct proto *prot, struct sock *sk)
1450 {
1451         struct kmem_cache *slab;
1452         struct module *owner;
1453
1454         owner = prot->owner;
1455         slab = prot->slab;
1456
1457         cgroup_sk_free(&sk->sk_cgrp_data);
1458         mem_cgroup_sk_free(sk);
1459         security_sk_free(sk);
1460         if (slab != NULL)
1461                 kmem_cache_free(slab, sk);
1462         else
1463                 kfree(sk);
1464         module_put(owner);
1465 }
1466
1467 /**
1468  *      sk_alloc - All socket objects are allocated here
1469  *      @net: the applicable net namespace
1470  *      @family: protocol family
1471  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1472  *      @prot: struct proto associated with this new sock instance
1473  *      @kern: is this to be a kernel socket?
1474  */
1475 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1476                       struct proto *prot, int kern)
1477 {
1478         struct sock *sk;
1479
1480         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1481         if (sk) {
1482                 sk->sk_family = family;
1483                 /*
1484                  * See comment in struct sock definition to understand
1485                  * why we need sk_prot_creator -acme
1486                  */
1487                 sk->sk_prot = sk->sk_prot_creator = prot;
1488                 sk->sk_kern_sock = kern;
1489                 sock_lock_init(sk);
1490                 sk->sk_net_refcnt = kern ? 0 : 1;
1491                 if (likely(sk->sk_net_refcnt))
1492                         get_net(net);
1493                 sock_net_set(sk, net);
1494                 atomic_set(&sk->sk_wmem_alloc, 1);
1495
1496                 mem_cgroup_sk_alloc(sk);
1497                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1498                 sock_update_classid(&sk->sk_cgrp_data);
1499                 sock_update_netprioidx(&sk->sk_cgrp_data);
1500         }
1501
1502         return sk;
1503 }
1504 EXPORT_SYMBOL(sk_alloc);
1505
1506 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1507  * grace period. This is the case for UDP sockets and TCP listeners.
1508  */
1509 static void __sk_destruct(struct rcu_head *head)
1510 {
1511         struct sock *sk = container_of(head, struct sock, sk_rcu);
1512         struct sk_filter *filter;
1513
1514         if (sk->sk_destruct)
1515                 sk->sk_destruct(sk);
1516
1517         filter = rcu_dereference_check(sk->sk_filter,
1518                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1519         if (filter) {
1520                 sk_filter_uncharge(sk, filter);
1521                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1522         }
1523         if (rcu_access_pointer(sk->sk_reuseport_cb))
1524                 reuseport_detach_sock(sk);
1525
1526         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1527
1528         if (atomic_read(&sk->sk_omem_alloc))
1529                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1530                          __func__, atomic_read(&sk->sk_omem_alloc));
1531
1532         if (sk->sk_frag.page) {
1533                 put_page(sk->sk_frag.page);
1534                 sk->sk_frag.page = NULL;
1535         }
1536
1537         if (sk->sk_peer_cred)
1538                 put_cred(sk->sk_peer_cred);
1539         put_pid(sk->sk_peer_pid);
1540         if (likely(sk->sk_net_refcnt))
1541                 put_net(sock_net(sk));
1542         sk_prot_free(sk->sk_prot_creator, sk);
1543 }
1544
1545 void sk_destruct(struct sock *sk)
1546 {
1547         if (sock_flag(sk, SOCK_RCU_FREE))
1548                 call_rcu(&sk->sk_rcu, __sk_destruct);
1549         else
1550                 __sk_destruct(&sk->sk_rcu);
1551 }
1552
1553 static void __sk_free(struct sock *sk)
1554 {
1555         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1556                 sock_diag_broadcast_destroy(sk);
1557         else
1558                 sk_destruct(sk);
1559 }
1560
1561 void sk_free(struct sock *sk)
1562 {
1563         /*
1564          * We subtract one from sk_wmem_alloc and can know if
1565          * some packets are still in some tx queue.
1566          * If not null, sock_wfree() will call __sk_free(sk) later
1567          */
1568         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1569                 __sk_free(sk);
1570 }
1571 EXPORT_SYMBOL(sk_free);
1572
1573 static void sk_init_common(struct sock *sk)
1574 {
1575         skb_queue_head_init(&sk->sk_receive_queue);
1576         skb_queue_head_init(&sk->sk_write_queue);
1577         skb_queue_head_init(&sk->sk_error_queue);
1578
1579         rwlock_init(&sk->sk_callback_lock);
1580         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1581                         af_rlock_keys + sk->sk_family,
1582                         af_family_rlock_key_strings[sk->sk_family]);
1583         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1584                         af_wlock_keys + sk->sk_family,
1585                         af_family_wlock_key_strings[sk->sk_family]);
1586         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1587                         af_elock_keys + sk->sk_family,
1588                         af_family_elock_key_strings[sk->sk_family]);
1589         lockdep_set_class_and_name(&sk->sk_callback_lock,
1590                         af_callback_keys + sk->sk_family,
1591                         af_family_clock_key_strings[sk->sk_family]);
1592 }
1593
1594 /**
1595  *      sk_clone_lock - clone a socket, and lock its clone
1596  *      @sk: the socket to clone
1597  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1598  *
1599  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1600  */
1601 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1602 {
1603         struct sock *newsk;
1604         bool is_charged = true;
1605
1606         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1607         if (newsk != NULL) {
1608                 struct sk_filter *filter;
1609
1610                 sock_copy(newsk, sk);
1611
1612                 /* SANITY */
1613                 if (likely(newsk->sk_net_refcnt))
1614                         get_net(sock_net(newsk));
1615                 sk_node_init(&newsk->sk_node);
1616                 sock_lock_init(newsk);
1617                 bh_lock_sock(newsk);
1618                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1619                 newsk->sk_backlog.len = 0;
1620
1621                 atomic_set(&newsk->sk_rmem_alloc, 0);
1622                 /*
1623                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1624                  */
1625                 atomic_set(&newsk->sk_wmem_alloc, 1);
1626                 atomic_set(&newsk->sk_omem_alloc, 0);
1627                 sk_init_common(newsk);
1628
1629                 newsk->sk_dst_cache     = NULL;
1630                 newsk->sk_dst_pending_confirm = 0;
1631                 newsk->sk_wmem_queued   = 0;
1632                 newsk->sk_forward_alloc = 0;
1633                 atomic_set(&newsk->sk_drops, 0);
1634                 newsk->sk_send_head     = NULL;
1635                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1636
1637                 sock_reset_flag(newsk, SOCK_DONE);
1638
1639                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1640                 if (filter != NULL)
1641                         /* though it's an empty new sock, the charging may fail
1642                          * if sysctl_optmem_max was changed between creation of
1643                          * original socket and cloning
1644                          */
1645                         is_charged = sk_filter_charge(newsk, filter);
1646
1647                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1648                         /* We need to make sure that we don't uncharge the new
1649                          * socket if we couldn't charge it in the first place
1650                          * as otherwise we uncharge the parent's filter.
1651                          */
1652                         if (!is_charged)
1653                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1654                         sk_free_unlock_clone(newsk);
1655                         newsk = NULL;
1656                         goto out;
1657                 }
1658                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1659
1660                 newsk->sk_err      = 0;
1661                 newsk->sk_err_soft = 0;
1662                 newsk->sk_priority = 0;
1663                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1664                 atomic64_set(&newsk->sk_cookie, 0);
1665
1666                 mem_cgroup_sk_alloc(newsk);
1667                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1668
1669                 /*
1670                  * Before updating sk_refcnt, we must commit prior changes to memory
1671                  * (Documentation/RCU/rculist_nulls.txt for details)
1672                  */
1673                 smp_wmb();
1674                 atomic_set(&newsk->sk_refcnt, 2);
1675
1676                 /*
1677                  * Increment the counter in the same struct proto as the master
1678                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1679                  * is the same as sk->sk_prot->socks, as this field was copied
1680                  * with memcpy).
1681                  *
1682                  * This _changes_ the previous behaviour, where
1683                  * tcp_create_openreq_child always was incrementing the
1684                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1685                  * to be taken into account in all callers. -acme
1686                  */
1687                 sk_refcnt_debug_inc(newsk);
1688                 sk_set_socket(newsk, NULL);
1689                 newsk->sk_wq = NULL;
1690
1691                 if (newsk->sk_prot->sockets_allocated)
1692                         sk_sockets_allocated_inc(newsk);
1693
1694                 if (sock_needs_netstamp(sk) &&
1695                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1696                         net_enable_timestamp();
1697         }
1698 out:
1699         return newsk;
1700 }
1701 EXPORT_SYMBOL_GPL(sk_clone_lock);
1702
1703 void sk_free_unlock_clone(struct sock *sk)
1704 {
1705         /* It is still raw copy of parent, so invalidate
1706          * destructor and make plain sk_free() */
1707         sk->sk_destruct = NULL;
1708         bh_unlock_sock(sk);
1709         sk_free(sk);
1710 }
1711 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1712
1713 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1714 {
1715         u32 max_segs = 1;
1716
1717         sk_dst_set(sk, dst);
1718         sk->sk_route_caps = dst->dev->features;
1719         if (sk->sk_route_caps & NETIF_F_GSO)
1720                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1721         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1722         if (sk_can_gso(sk)) {
1723                 if (dst->header_len) {
1724                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1725                 } else {
1726                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1727                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1728                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1729                 }
1730         }
1731         sk->sk_gso_max_segs = max_segs;
1732 }
1733 EXPORT_SYMBOL_GPL(sk_setup_caps);
1734
1735 /*
1736  *      Simple resource managers for sockets.
1737  */
1738
1739
1740 /*
1741  * Write buffer destructor automatically called from kfree_skb.
1742  */
1743 void sock_wfree(struct sk_buff *skb)
1744 {
1745         struct sock *sk = skb->sk;
1746         unsigned int len = skb->truesize;
1747
1748         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1749                 /*
1750                  * Keep a reference on sk_wmem_alloc, this will be released
1751                  * after sk_write_space() call
1752                  */
1753                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1754                 sk->sk_write_space(sk);
1755                 len = 1;
1756         }
1757         /*
1758          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1759          * could not do because of in-flight packets
1760          */
1761         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1762                 __sk_free(sk);
1763 }
1764 EXPORT_SYMBOL(sock_wfree);
1765
1766 /* This variant of sock_wfree() is used by TCP,
1767  * since it sets SOCK_USE_WRITE_QUEUE.
1768  */
1769 void __sock_wfree(struct sk_buff *skb)
1770 {
1771         struct sock *sk = skb->sk;
1772
1773         if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1774                 __sk_free(sk);
1775 }
1776
1777 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1778 {
1779         skb_orphan(skb);
1780         skb->sk = sk;
1781 #ifdef CONFIG_INET
1782         if (unlikely(!sk_fullsock(sk))) {
1783                 skb->destructor = sock_edemux;
1784                 sock_hold(sk);
1785                 return;
1786         }
1787 #endif
1788         skb->destructor = sock_wfree;
1789         skb_set_hash_from_sk(skb, sk);
1790         /*
1791          * We used to take a refcount on sk, but following operation
1792          * is enough to guarantee sk_free() wont free this sock until
1793          * all in-flight packets are completed
1794          */
1795         atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1796 }
1797 EXPORT_SYMBOL(skb_set_owner_w);
1798
1799 /* This helper is used by netem, as it can hold packets in its
1800  * delay queue. We want to allow the owner socket to send more
1801  * packets, as if they were already TX completed by a typical driver.
1802  * But we also want to keep skb->sk set because some packet schedulers
1803  * rely on it (sch_fq for example).
1804  */
1805 void skb_orphan_partial(struct sk_buff *skb)
1806 {
1807         if (skb_is_tcp_pure_ack(skb))
1808                 return;
1809
1810         if (skb->destructor == sock_wfree
1811 #ifdef CONFIG_INET
1812             || skb->destructor == tcp_wfree
1813 #endif
1814                 ) {
1815                 struct sock *sk = skb->sk;
1816
1817                 if (atomic_inc_not_zero(&sk->sk_refcnt)) {
1818                         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1819                         skb->destructor = sock_efree;
1820                 }
1821         } else {
1822                 skb_orphan(skb);
1823         }
1824 }
1825 EXPORT_SYMBOL(skb_orphan_partial);
1826
1827 /*
1828  * Read buffer destructor automatically called from kfree_skb.
1829  */
1830 void sock_rfree(struct sk_buff *skb)
1831 {
1832         struct sock *sk = skb->sk;
1833         unsigned int len = skb->truesize;
1834
1835         atomic_sub(len, &sk->sk_rmem_alloc);
1836         sk_mem_uncharge(sk, len);
1837 }
1838 EXPORT_SYMBOL(sock_rfree);
1839
1840 /*
1841  * Buffer destructor for skbs that are not used directly in read or write
1842  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1843  */
1844 void sock_efree(struct sk_buff *skb)
1845 {
1846         sock_put(skb->sk);
1847 }
1848 EXPORT_SYMBOL(sock_efree);
1849
1850 kuid_t sock_i_uid(struct sock *sk)
1851 {
1852         kuid_t uid;
1853
1854         read_lock_bh(&sk->sk_callback_lock);
1855         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1856         read_unlock_bh(&sk->sk_callback_lock);
1857         return uid;
1858 }
1859 EXPORT_SYMBOL(sock_i_uid);
1860
1861 unsigned long sock_i_ino(struct sock *sk)
1862 {
1863         unsigned long ino;
1864
1865         read_lock_bh(&sk->sk_callback_lock);
1866         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1867         read_unlock_bh(&sk->sk_callback_lock);
1868         return ino;
1869 }
1870 EXPORT_SYMBOL(sock_i_ino);
1871
1872 /*
1873  * Allocate a skb from the socket's send buffer.
1874  */
1875 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1876                              gfp_t priority)
1877 {
1878         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1879                 struct sk_buff *skb = alloc_skb(size, priority);
1880                 if (skb) {
1881                         skb_set_owner_w(skb, sk);
1882                         return skb;
1883                 }
1884         }
1885         return NULL;
1886 }
1887 EXPORT_SYMBOL(sock_wmalloc);
1888
1889 /*
1890  * Allocate a memory block from the socket's option memory buffer.
1891  */
1892 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1893 {
1894         if ((unsigned int)size <= sysctl_optmem_max &&
1895             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1896                 void *mem;
1897                 /* First do the add, to avoid the race if kmalloc
1898                  * might sleep.
1899                  */
1900                 atomic_add(size, &sk->sk_omem_alloc);
1901                 mem = kmalloc(size, priority);
1902                 if (mem)
1903                         return mem;
1904                 atomic_sub(size, &sk->sk_omem_alloc);
1905         }
1906         return NULL;
1907 }
1908 EXPORT_SYMBOL(sock_kmalloc);
1909
1910 /* Free an option memory block. Note, we actually want the inline
1911  * here as this allows gcc to detect the nullify and fold away the
1912  * condition entirely.
1913  */
1914 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1915                                   const bool nullify)
1916 {
1917         if (WARN_ON_ONCE(!mem))
1918                 return;
1919         if (nullify)
1920                 kzfree(mem);
1921         else
1922                 kfree(mem);
1923         atomic_sub(size, &sk->sk_omem_alloc);
1924 }
1925
1926 void sock_kfree_s(struct sock *sk, void *mem, int size)
1927 {
1928         __sock_kfree_s(sk, mem, size, false);
1929 }
1930 EXPORT_SYMBOL(sock_kfree_s);
1931
1932 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1933 {
1934         __sock_kfree_s(sk, mem, size, true);
1935 }
1936 EXPORT_SYMBOL(sock_kzfree_s);
1937
1938 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1939    I think, these locks should be removed for datagram sockets.
1940  */
1941 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1942 {
1943         DEFINE_WAIT(wait);
1944
1945         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1946         for (;;) {
1947                 if (!timeo)
1948                         break;
1949                 if (signal_pending(current))
1950                         break;
1951                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1952                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1953                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1954                         break;
1955                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1956                         break;
1957                 if (sk->sk_err)
1958                         break;
1959                 timeo = schedule_timeout(timeo);
1960         }
1961         finish_wait(sk_sleep(sk), &wait);
1962         return timeo;
1963 }
1964
1965
1966 /*
1967  *      Generic send/receive buffer handlers
1968  */
1969
1970 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1971                                      unsigned long data_len, int noblock,
1972                                      int *errcode, int max_page_order)
1973 {
1974         struct sk_buff *skb;
1975         long timeo;
1976         int err;
1977
1978         timeo = sock_sndtimeo(sk, noblock);
1979         for (;;) {
1980                 err = sock_error(sk);
1981                 if (err != 0)
1982                         goto failure;
1983
1984                 err = -EPIPE;
1985                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1986                         goto failure;
1987
1988                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1989                         break;
1990
1991                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1992                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1993                 err = -EAGAIN;
1994                 if (!timeo)
1995                         goto failure;
1996                 if (signal_pending(current))
1997                         goto interrupted;
1998                 timeo = sock_wait_for_wmem(sk, timeo);
1999         }
2000         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2001                                    errcode, sk->sk_allocation);
2002         if (skb)
2003                 skb_set_owner_w(skb, sk);
2004         return skb;
2005
2006 interrupted:
2007         err = sock_intr_errno(timeo);
2008 failure:
2009         *errcode = err;
2010         return NULL;
2011 }
2012 EXPORT_SYMBOL(sock_alloc_send_pskb);
2013
/*
 * Linear-only front end to sock_alloc_send_pskb(): @size bytes of header
 * space, no paged data and page order 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2019 EXPORT_SYMBOL(sock_alloc_send_skb);
2020
2021 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2022                      struct sockcm_cookie *sockc)
2023 {
2024         u32 tsflags;
2025
2026         switch (cmsg->cmsg_type) {
2027         case SO_MARK:
2028                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2029                         return -EPERM;
2030                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2031                         return -EINVAL;
2032                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2033                 break;
2034         case SO_TIMESTAMPING:
2035                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2036                         return -EINVAL;
2037
2038                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2039                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2040                         return -EINVAL;
2041
2042                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2043                 sockc->tsflags |= tsflags;
2044                 break;
2045         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2046         case SCM_RIGHTS:
2047         case SCM_CREDENTIALS:
2048                 break;
2049         default:
2050                 return -EINVAL;
2051         }
2052         return 0;
2053 }
2054 EXPORT_SYMBOL(__sock_cmsg_send);
2055
2056 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2057                    struct sockcm_cookie *sockc)
2058 {
2059         struct cmsghdr *cmsg;
2060         int ret;
2061
2062         for_each_cmsghdr(cmsg, msg) {
2063                 if (!CMSG_OK(msg, cmsg))
2064                         return -EINVAL;
2065                 if (cmsg->cmsg_level != SOL_SOCKET)
2066                         continue;
2067                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2068                 if (ret)
2069                         return ret;
2070         }
2071         return 0;
2072 }
2073 EXPORT_SYMBOL(sock_cmsg_send);
2074
2075 /* On 32bit arches, an skb frag is limited to 2^15 */
2076 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2077
2078 /**
2079  * skb_page_frag_refill - check that a page_frag contains enough room
2080  * @sz: minimum size of the fragment we want to get
2081  * @pfrag: pointer to page_frag
2082  * @gfp: priority for memory allocation
2083  *
2084  * Note: While this allocator tries to use high order pages, there is
2085  * no guarantee that allocations succeed. Therefore, @sz MUST be
2086  * less or equal than PAGE_SIZE.
2087  */
2088 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2089 {
2090         if (pfrag->page) {
2091                 if (page_ref_count(pfrag->page) == 1) {
2092                         pfrag->offset = 0;
2093                         return true;
2094                 }
2095                 if (pfrag->offset + sz <= pfrag->size)
2096                         return true;
2097                 put_page(pfrag->page);
2098         }
2099
2100         pfrag->offset = 0;
2101         if (SKB_FRAG_PAGE_ORDER) {
2102                 /* Avoid direct reclaim but allow kswapd to wake */
2103                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2104                                           __GFP_COMP | __GFP_NOWARN |
2105                                           __GFP_NORETRY,
2106                                           SKB_FRAG_PAGE_ORDER);
2107                 if (likely(pfrag->page)) {
2108                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2109                         return true;
2110                 }
2111         }
2112         pfrag->page = alloc_page(gfp);
2113         if (likely(pfrag->page)) {
2114                 pfrag->size = PAGE_SIZE;
2115                 return true;
2116         }
2117         return false;
2118 }
2119 EXPORT_SYMBOL(skb_page_frag_refill);
2120
2121 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2122 {
2123         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2124                 return true;
2125
2126         sk_enter_memory_pressure(sk);
2127         sk_stream_moderate_sndbuf(sk);
2128         return false;
2129 }
2130 EXPORT_SYMBOL(sk_page_frag_refill);
2131
2132 static void __lock_sock(struct sock *sk)
2133         __releases(&sk->sk_lock.slock)
2134         __acquires(&sk->sk_lock.slock)
2135 {
2136         DEFINE_WAIT(wait);
2137
2138         for (;;) {
2139                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2140                                         TASK_UNINTERRUPTIBLE);
2141                 spin_unlock_bh(&sk->sk_lock.slock);
2142                 schedule();
2143                 spin_lock_bh(&sk->sk_lock.slock);
2144                 if (!sock_owned_by_user(sk))
2145                         break;
2146         }
2147         finish_wait(&sk->sk_lock.wq, &wait);
2148 }
2149
2150 static void __release_sock(struct sock *sk)
2151         __releases(&sk->sk_lock.slock)
2152         __acquires(&sk->sk_lock.slock)
2153 {
2154         struct sk_buff *skb, *next;
2155
2156         while ((skb = sk->sk_backlog.head) != NULL) {
2157                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2158
2159                 spin_unlock_bh(&sk->sk_lock.slock);
2160
2161                 do {
2162                         next = skb->next;
2163                         prefetch(next);
2164                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2165                         skb->next = NULL;
2166                         sk_backlog_rcv(sk, skb);
2167
2168                         cond_resched();
2169
2170                         skb = next;
2171                 } while (skb != NULL);
2172
2173                 spin_lock_bh(&sk->sk_lock.slock);
2174         }
2175
2176         /*
2177          * Doing the zeroing here guarantee we can not loop forever
2178          * while a wild producer attempts to flood us.
2179          */
2180         sk->sk_backlog.len = 0;
2181 }
2182
2183 void __sk_flush_backlog(struct sock *sk)
2184 {
2185         spin_lock_bh(&sk->sk_lock.slock);
2186         __release_sock(sk);
2187         spin_unlock_bh(&sk->sk_lock.slock);
2188 }
2189
2190 /**
2191  * sk_wait_data - wait for data to arrive at sk_receive_queue
2192  * @sk:    sock to wait on
2193  * @timeo: for how long
2194  * @skb:   last skb seen on sk_receive_queue
2195  *
2196  * Now socket state including sk->sk_err is changed only under lock,
2197  * hence we may omit checks after joining wait queue.
2198  * We check receive queue before schedule() only as optimization;
2199  * it is very likely that release_sock() added new data.
2200  */
2201 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2202 {
2203         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2204         int rc;
2205
2206         add_wait_queue(sk_sleep(sk), &wait);
2207         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2208         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2209         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2210         remove_wait_queue(sk_sleep(sk), &wait);
2211         return rc;
2212 }
2213 EXPORT_SYMBOL(sk_wait_data);
2214
2215 /**
2216  *      __sk_mem_raise_allocated - increase memory_allocated
2217  *      @sk: socket
2218  *      @size: memory size to allocate
2219  *      @amt: pages to allocate
2220  *      @kind: allocation type
2221  *
2222  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2223  */
2224 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2225 {
2226         struct proto *prot = sk->sk_prot;
2227         long allocated = sk_memory_allocated_add(sk, amt);
2228
2229         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2230             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2231                 goto suppress_allocation;
2232
2233         /* Under limit. */
2234         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2235                 sk_leave_memory_pressure(sk);
2236                 return 1;
2237         }
2238
2239         /* Under pressure. */
2240         if (allocated > sk_prot_mem_limits(sk, 1))
2241                 sk_enter_memory_pressure(sk);
2242
2243         /* Over hard limit. */
2244         if (allocated > sk_prot_mem_limits(sk, 2))
2245                 goto suppress_allocation;
2246
2247         /* guarantee minimum buffer size under pressure */
2248         if (kind == SK_MEM_RECV) {
2249                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2250                         return 1;
2251
2252         } else { /* SK_MEM_SEND */
2253                 if (sk->sk_type == SOCK_STREAM) {
2254                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2255                                 return 1;
2256                 } else if (atomic_read(&sk->sk_wmem_alloc) <
2257                            prot->sysctl_wmem[0])
2258                                 return 1;
2259         }
2260
2261         if (sk_has_memory_pressure(sk)) {
2262                 int alloc;
2263
2264                 if (!sk_under_memory_pressure(sk))
2265                         return 1;
2266                 alloc = sk_sockets_allocated_read_positive(sk);
2267                 if (sk_prot_mem_limits(sk, 2) > alloc *
2268                     sk_mem_pages(sk->sk_wmem_queued +
2269                                  atomic_read(&sk->sk_rmem_alloc) +
2270                                  sk->sk_forward_alloc))
2271                         return 1;
2272         }
2273
2274 suppress_allocation:
2275
2276         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2277                 sk_stream_moderate_sndbuf(sk);
2278
2279                 /* Fail only if socket is _under_ its sndbuf.
2280                  * In this case we cannot block, so that we have to fail.
2281                  */
2282                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2283                         return 1;
2284         }
2285
2286         trace_sock_exceed_buf_limit(sk, prot, allocated);
2287
2288         sk_memory_allocated_sub(sk, amt);
2289
2290         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2291                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2292
2293         return 0;
2294 }
2295 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2296
2297 /**
2298  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2299  *      @sk: socket
2300  *      @size: memory size to allocate
2301  *      @kind: allocation type
2302  *
2303  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2304  *      rmem allocation. This function assumes that protocols which have
2305  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2306  */
2307 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2308 {
2309         int ret, amt = sk_mem_pages(size);
2310
2311         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2312         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2313         if (!ret)
2314                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2315         return ret;
2316 }
2317 EXPORT_SYMBOL(__sk_mem_schedule);
2318
2319 /**
2320  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2321  *      @sk: socket
2322  *      @amount: number of quanta
2323  *
2324  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2325  */
2326 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2327 {
2328         sk_memory_allocated_sub(sk, amount);
2329
2330         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2331                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2332
2333         if (sk_under_memory_pressure(sk) &&
2334             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2335                 sk_leave_memory_pressure(sk);
2336 }
2337 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2338
2339 /**
2340  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2341  *      @sk: socket
2342  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2343  */
2344 void __sk_mem_reclaim(struct sock *sk, int amount)
2345 {
2346         amount >>= SK_MEM_QUANTUM_SHIFT;
2347         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2348         __sk_mem_reduce_allocated(sk, amount);
2349 }
2350 EXPORT_SYMBOL(__sk_mem_reclaim);
2351
2352 int sk_set_peek_off(struct sock *sk, int val)
2353 {
2354         if (val < 0)
2355                 return -EINVAL;
2356
2357         sk->sk_peek_off = val;
2358         return 0;
2359 }
2360 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2361
2362 /*
2363  * Set of default routines for initialising struct proto_ops when
2364  * the protocol does not support a particular function. In certain
2365  * cases where it makes no sense for a protocol to have a "do nothing"
2366  * function, some default processing is provided.
2367  */
2368
/* Default bind() for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	/* Arguments are intentionally ignored. */
	return -EOPNOTSUPP;
}
2373 EXPORT_SYMBOL(sock_no_bind);
2374
/* Default connect() for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	/* Arguments are intentionally ignored. */
	return -EOPNOTSUPP;
}
2380 EXPORT_SYMBOL(sock_no_connect);
2381
/* Default socketpair() for protocols without pair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	/* Arguments are intentionally ignored. */
	return -EOPNOTSUPP;
}
2386 EXPORT_SYMBOL(sock_no_socketpair);
2387
/* Default accept() for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	/* Arguments are intentionally ignored. */
	return -EOPNOTSUPP;
}
2393 EXPORT_SYMBOL(sock_no_accept);
2394
/* Default getname() for protocols without name support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	/* Arguments are intentionally ignored; *len is not written. */
	return -EOPNOTSUPP;
}
2400 EXPORT_SYMBOL(sock_no_getname);
2401
2402 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2403 {
2404         return 0;
2405 }
2406 EXPORT_SYMBOL(sock_no_poll);
2407
/* Default ioctl() for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	/* Arguments are intentionally ignored. */
	return -EOPNOTSUPP;
}
2412 EXPORT_SYMBOL(sock_no_ioctl);
2413
/* Default listen() for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	/* Arguments are intentionally ignored. */
	return -EOPNOTSUPP;
}
2418 EXPORT_SYMBOL(sock_no_listen);
2419
/* Default shutdown() for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	/* Arguments are intentionally ignored. */
	return -EOPNOTSUPP;
}
2424 EXPORT_SYMBOL(sock_no_shutdown);
2425
2426 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2427                     char __user *optval, unsigned int optlen)
2428 {
2429         return -EOPNOTSUPP;
2430 }
2431 EXPORT_SYMBOL(sock_no_setsockopt);
2432
2433 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2434                     char __user *optval, int __user *optlen)
2435 {
2436         return -EOPNOTSUPP;
2437 }
2438 EXPORT_SYMBOL(sock_no_getsockopt);
2439
/* Default sendmsg() for protocols without send support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	/* Arguments are intentionally ignored. */
	return -EOPNOTSUPP;
}
2444 EXPORT_SYMBOL(sock_no_sendmsg);
2445
/* Default recvmsg() for protocols without receive support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	/* Arguments are intentionally ignored. */
	return -EOPNOTSUPP;
}
2451 EXPORT_SYMBOL(sock_no_recvmsg);
2452
/*
 * Default mmap() for protocols without mmap support.  Returns -ENODEV
 * rather than -EOPNOTSUPP, mirroring the error code used for a missing
 * mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Arguments are intentionally ignored. */
	return -ENODEV;
}
2458 EXPORT_SYMBOL(sock_no_mmap);
2459
2460 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2461 {
2462         ssize_t res;
2463         struct msghdr msg = {.msg_flags = flags};
2464         struct kvec iov;
2465         char *kaddr = kmap(page);
2466         iov.iov_base = kaddr + offset;
2467         iov.iov_len = size;
2468         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2469         kunmap(page);
2470         return res;
2471 }
2472 EXPORT_SYMBOL(sock_no_sendpage);
2473
2474 /*
2475  *      Default Socket Callbacks
2476  */
2477
2478 static void sock_def_wakeup(struct sock *sk)
2479 {
2480         struct socket_wq *wq;
2481
2482         rcu_read_lock();
2483         wq = rcu_dereference(sk->sk_wq);
2484         if (skwq_has_sleeper(wq))
2485                 wake_up_interruptible_all(&wq->wait);
2486         rcu_read_unlock();
2487 }
2488
2489 static void sock_def_error_report(struct sock *sk)
2490 {
2491         struct socket_wq *wq;
2492
2493         rcu_read_lock();
2494         wq = rcu_dereference(sk->sk_wq);
2495         if (skwq_has_sleeper(wq))
2496                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2497         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2498         rcu_read_unlock();
2499 }
2500
2501 static void sock_def_readable(struct sock *sk)
2502 {
2503         struct socket_wq *wq;
2504
2505         rcu_read_lock();
2506         wq = rcu_dereference(sk->sk_wq);
2507         if (skwq_has_sleeper(wq))
2508                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2509                                                 POLLRDNORM | POLLRDBAND);
2510         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2511         rcu_read_unlock();
2512 }
2513
2514 static void sock_def_write_space(struct sock *sk)
2515 {
2516         struct socket_wq *wq;
2517
2518         rcu_read_lock();
2519
2520         /* Do not wake up a writer until he can make "significant"
2521          * progress.  --DaveM
2522          */
2523         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2524                 wq = rcu_dereference(sk->sk_wq);
2525                 if (skwq_has_sleeper(wq))
2526                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2527                                                 POLLWRNORM | POLLWRBAND);
2528
2529                 /* Should agree with poll, otherwise some programs break */
2530                 if (sock_writeable(sk))
2531                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2532         }
2533
2534         rcu_read_unlock();
2535 }
2536
/* Default socket destructor: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
	/* no per-socket state to release here */
}
2540
2541 void sk_send_sigurg(struct sock *sk)
2542 {
2543         if (sk->sk_socket && sk->sk_socket->file)
2544                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2545                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2546 }
2547 EXPORT_SYMBOL(sk_send_sigurg);
2548
/*
 * (Re)arm @timer to fire at @expires. When mod_timer() returns zero a
 * reference on @sk is taken with sock_hold().
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int rearmed = mod_timer(timer, expires);

	if (rearmed == 0)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2556
/*
 * Cancel @timer. When del_timer() returns non-zero the reference on @sk
 * is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int removed = del_timer(timer);

	if (removed != 0)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2563
2564 void sock_init_data(struct socket *sock, struct sock *sk)
2565 {
2566         sk_init_common(sk);
2567         sk->sk_send_head        =       NULL;
2568
2569         init_timer(&sk->sk_timer);
2570
2571         sk->sk_allocation       =       GFP_KERNEL;
2572         sk->sk_rcvbuf           =       sysctl_rmem_default;
2573         sk->sk_sndbuf           =       sysctl_wmem_default;
2574         sk->sk_state            =       TCP_CLOSE;
2575         sk_set_socket(sk, sock);
2576
2577         sock_set_flag(sk, SOCK_ZAPPED);
2578
2579         if (sock) {
2580                 sk->sk_type     =       sock->type;
2581                 sk->sk_wq       =       sock->wq;
2582                 sock->sk        =       sk;
2583                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2584         } else {
2585                 sk->sk_wq       =       NULL;
2586                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2587         }
2588
2589         rwlock_init(&sk->sk_callback_lock);
2590         if (sk->sk_kern_sock)
2591                 lockdep_set_class_and_name(
2592                         &sk->sk_callback_lock,
2593                         af_kern_callback_keys + sk->sk_family,
2594                         af_family_kern_clock_key_strings[sk->sk_family]);
2595         else
2596                 lockdep_set_class_and_name(
2597                         &sk->sk_callback_lock,
2598                         af_callback_keys + sk->sk_family,
2599                         af_family_clock_key_strings[sk->sk_family]);
2600
2601         sk->sk_state_change     =       sock_def_wakeup;
2602         sk->sk_data_ready       =       sock_def_readable;
2603         sk->sk_write_space      =       sock_def_write_space;
2604         sk->sk_error_report     =       sock_def_error_report;
2605         sk->sk_destruct         =       sock_def_destruct;
2606
2607         sk->sk_frag.page        =       NULL;
2608         sk->sk_frag.offset      =       0;
2609         sk->sk_peek_off         =       -1;
2610
2611         sk->sk_peer_pid         =       NULL;
2612         sk->sk_peer_cred        =       NULL;
2613         sk->sk_write_pending    =       0;
2614         sk->sk_rcvlowat         =       1;
2615         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2616         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2617
2618         sk->sk_stamp = SK_DEFAULT_STAMP;
2619
2620 #ifdef CONFIG_NET_RX_BUSY_POLL
2621         sk->sk_napi_id          =       0;
2622         sk->sk_ll_usec          =       sysctl_net_busy_read;
2623 #endif
2624
2625         sk->sk_max_pacing_rate = ~0U;
2626         sk->sk_pacing_rate = ~0U;
2627         sk->sk_incoming_cpu = -1;
2628         /*
2629          * Before updating sk_refcnt, we must commit prior changes to memory
2630          * (Documentation/RCU/rculist_nulls.txt for details)
2631          */
2632         smp_wmb();
2633         atomic_set(&sk->sk_refcnt, 1);
2634         atomic_set(&sk->sk_drops, 0);
2635 }
2636 EXPORT_SYMBOL(sock_init_data);
2637
2638 void lock_sock_nested(struct sock *sk, int subclass)
2639 {
2640         might_sleep();
2641         spin_lock_bh(&sk->sk_lock.slock);
2642         if (sk->sk_lock.owned)
2643                 __lock_sock(sk);
2644         sk->sk_lock.owned = 1;
2645         spin_unlock(&sk->sk_lock.slock);
2646         /*
2647          * The sk_lock has mutex_lock() semantics here:
2648          */
2649         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2650         local_bh_enable();
2651 }
2652 EXPORT_SYMBOL(lock_sock_nested);
2653
2654 void release_sock(struct sock *sk)
2655 {
2656         spin_lock_bh(&sk->sk_lock.slock);
2657         if (sk->sk_backlog.tail)
2658                 __release_sock(sk);
2659
2660         /* Warning : release_cb() might need to release sk ownership,
2661          * ie call sock_release_ownership(sk) before us.
2662          */
2663         if (sk->sk_prot->release_cb)
2664                 sk->sk_prot->release_cb(sk);
2665
2666         sock_release_ownership(sk);
2667         if (waitqueue_active(&sk->sk_lock.wq))
2668                 wake_up(&sk->sk_lock.wq);
2669         spin_unlock_bh(&sk->sk_lock.slock);
2670 }
2671 EXPORT_SYMBOL(release_sock);
2672
2673 /**
2674  * lock_sock_fast - fast version of lock_sock
2675  * @sk: socket
2676  *
2677  * This version should be used for very small section, where process wont block
2678  * return false if fast path is taken
2679  *   sk_lock.slock locked, owned = 0, BH disabled
2680  * return true if slow path is taken
2681  *   sk_lock.slock unlocked, owned = 1, BH enabled
2682  */
2683 bool lock_sock_fast(struct sock *sk)
2684 {
2685         might_sleep();
2686         spin_lock_bh(&sk->sk_lock.slock);
2687
2688         if (!sk->sk_lock.owned)
2689                 /*
2690                  * Note : We must disable BH
2691                  */
2692                 return false;
2693
2694         __lock_sock(sk);
2695         sk->sk_lock.owned = 1;
2696         spin_unlock(&sk->sk_lock.slock);
2697         /*
2698          * The sk_lock has mutex_lock() semantics here:
2699          */
2700         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2701         local_bh_enable();
2702         return true;
2703 }
2704 EXPORT_SYMBOL(lock_sock_fast);
2705
2706 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2707 {
2708         struct timeval tv;
2709         if (!sock_flag(sk, SOCK_TIMESTAMP))
2710                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2711         tv = ktime_to_timeval(sk->sk_stamp);
2712         if (tv.tv_sec == -1)
2713                 return -ENOENT;
2714         if (tv.tv_sec == 0) {
2715                 sk->sk_stamp = ktime_get_real();
2716                 tv = ktime_to_timeval(sk->sk_stamp);
2717         }
2718         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2719 }
2720 EXPORT_SYMBOL(sock_get_timestamp);
2721
2722 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2723 {
2724         struct timespec ts;
2725         if (!sock_flag(sk, SOCK_TIMESTAMP))
2726                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2727         ts = ktime_to_timespec(sk->sk_stamp);
2728         if (ts.tv_sec == -1)
2729                 return -ENOENT;
2730         if (ts.tv_sec == 0) {
2731                 sk->sk_stamp = ktime_get_real();
2732                 ts = ktime_to_timespec(sk->sk_stamp);
2733         }
2734         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2735 }
2736 EXPORT_SYMBOL(sock_get_timestampns);
2737
2738 void sock_enable_timestamp(struct sock *sk, int flag)
2739 {
2740         if (!sock_flag(sk, flag)) {
2741                 unsigned long previous_flags = sk->sk_flags;
2742
2743                 sock_set_flag(sk, flag);
2744                 /*
2745                  * we just set one of the two flags which require net
2746                  * time stamping, but time stamping might have been on
2747                  * already because of the other one
2748                  */
2749                 if (sock_needs_netstamp(sk) &&
2750                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2751                         net_enable_timestamp();
2752         }
2753 }
2754
2755 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2756                        int level, int type)
2757 {
2758         struct sock_exterr_skb *serr;
2759         struct sk_buff *skb;
2760         int copied, err;
2761
2762         err = -EAGAIN;
2763         skb = sock_dequeue_err_skb(sk);
2764         if (skb == NULL)
2765                 goto out;
2766
2767         copied = skb->len;
2768         if (copied > len) {
2769                 msg->msg_flags |= MSG_TRUNC;
2770                 copied = len;
2771         }
2772         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2773         if (err)
2774                 goto out_free_skb;
2775
2776         sock_recv_timestamp(msg, sk, skb);
2777
2778         serr = SKB_EXT_ERR(skb);
2779         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2780
2781         msg->msg_flags |= MSG_ERRQUEUE;
2782         err = copied;
2783
2784 out_free_skb:
2785         kfree_skb(skb);
2786 out:
2787         return err;
2788 }
2789 EXPORT_SYMBOL(sock_recv_errqueue);
2790
2791 /*
2792  *      Get a socket option on an socket.
2793  *
2794  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2795  *      asynchronous errors should be reported by getsockopt. We assume
2796  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2797  */
2798 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2799                            char __user *optval, int __user *optlen)
2800 {
2801         struct sock *sk = sock->sk;
2802
2803         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2804 }
2805 EXPORT_SYMBOL(sock_common_getsockopt);
2806
2807 #ifdef CONFIG_COMPAT
2808 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2809                                   char __user *optval, int __user *optlen)
2810 {
2811         struct sock *sk = sock->sk;
2812
2813         if (sk->sk_prot->compat_getsockopt != NULL)
2814                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2815                                                       optval, optlen);
2816         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2817 }
2818 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2819 #endif
2820
2821 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2822                         int flags)
2823 {
2824         struct sock *sk = sock->sk;
2825         int addr_len = 0;
2826         int err;
2827
2828         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2829                                    flags & ~MSG_DONTWAIT, &addr_len);
2830         if (err >= 0)
2831                 msg->msg_namelen = addr_len;
2832         return err;
2833 }
2834 EXPORT_SYMBOL(sock_common_recvmsg);
2835
2836 /*
2837  *      Set socket options on an inet socket.
2838  */
2839 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2840                            char __user *optval, unsigned int optlen)
2841 {
2842         struct sock *sk = sock->sk;
2843
2844         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2845 }
2846 EXPORT_SYMBOL(sock_common_setsockopt);
2847
2848 #ifdef CONFIG_COMPAT
2849 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2850                                   char __user *optval, unsigned int optlen)
2851 {
2852         struct sock *sk = sock->sk;
2853
2854         if (sk->sk_prot->compat_setsockopt != NULL)
2855                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2856                                                       optval, optlen);
2857         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2858 }
2859 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2860 #endif
2861
2862 void sk_common_release(struct sock *sk)
2863 {
2864         if (sk->sk_prot->destroy)
2865                 sk->sk_prot->destroy(sk);
2866
2867         /*
2868          * Observation: when sock_common_release is called, processes have
2869          * no access to socket. But net still has.
2870          * Step one, detach it from networking:
2871          *
2872          * A. Remove from hash tables.
2873          */
2874
2875         sk->sk_prot->unhash(sk);
2876
2877         /*
2878          * In this point socket cannot receive new packets, but it is possible
2879          * that some packets are in flight because some CPU runs receiver and
2880          * did hash table lookup before we unhashed socket. They will achieve
2881          * receive queue and will be purged by socket destructor.
2882          *
2883          * Also we still have packets pending on receive queue and probably,
2884          * our own packets waiting in device queues. sock_destroy will drain
2885          * receive queue, but transmitted packets will delay socket destruction
2886          * until the last reference will be released.
2887          */
2888
2889         sock_orphan(sk);
2890
2891         xfrm_sk_free_policy(sk);
2892
2893         sk_refcnt_debug_release(sk);
2894
2895         sock_put(sk);
2896 }
2897 EXPORT_SYMBOL(sk_common_release);
2898
2899 void sk_get_meminfo(const struct sock *sk, u32 *mem)
2900 {
2901         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
2902
2903         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
2904         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
2905         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
2906         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
2907         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
2908         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
2909         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
2910         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
2911         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
2912 }
2913
2914 #ifdef CONFIG_PROC_FS
2915 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2916 struct prot_inuse {
2917         int val[PROTO_INUSE_NR];
2918 };
2919
2920 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2921
2922 #ifdef CONFIG_NET_NS
2923 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2924 {
2925         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2926 }
2927 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2928
2929 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2930 {
2931         int cpu, idx = prot->inuse_idx;
2932         int res = 0;
2933
2934         for_each_possible_cpu(cpu)
2935                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2936
2937         return res >= 0 ? res : 0;
2938 }
2939 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2940
2941 static int __net_init sock_inuse_init_net(struct net *net)
2942 {
2943         net->core.inuse = alloc_percpu(struct prot_inuse);
2944         return net->core.inuse ? 0 : -ENOMEM;
2945 }
2946
2947 static void __net_exit sock_inuse_exit_net(struct net *net)
2948 {
2949         free_percpu(net->core.inuse);
2950 }
2951
2952 static struct pernet_operations net_inuse_ops = {
2953         .init = sock_inuse_init_net,
2954         .exit = sock_inuse_exit_net,
2955 };
2956
2957 static __init int net_inuse_init(void)
2958 {
2959         if (register_pernet_subsys(&net_inuse_ops))
2960                 panic("Cannot initialize net inuse counters");
2961
2962         return 0;
2963 }
2964
2965 core_initcall(net_inuse_init);
2966 #else
2967 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2968
2969 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2970 {
2971         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2972 }
2973 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2974
2975 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2976 {
2977         int cpu, idx = prot->inuse_idx;
2978         int res = 0;
2979
2980         for_each_possible_cpu(cpu)
2981                 res += per_cpu(prot_inuse, cpu).val[idx];
2982
2983         return res >= 0 ? res : 0;
2984 }
2985 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2986 #endif
2987
2988 static void assign_proto_idx(struct proto *prot)
2989 {
2990         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2991
2992         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2993                 pr_err("PROTO_INUSE_NR exhausted\n");
2994                 return;
2995         }
2996
2997         set_bit(prot->inuse_idx, proto_inuse_idx);
2998 }
2999
3000 static void release_proto_idx(struct proto *prot)
3001 {
3002         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3003                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3004 }
3005 #else
/* No in-use accounting without CONFIG_PROC_FS: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
	/* Intentionally empty. */
}
3009
/* No in-use accounting without CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
	/* Intentionally empty. */
}
3013 #endif
3014
3015 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3016 {
3017         if (!rsk_prot)
3018                 return;
3019         kfree(rsk_prot->slab_name);
3020         rsk_prot->slab_name = NULL;
3021         kmem_cache_destroy(rsk_prot->slab);
3022         rsk_prot->slab = NULL;
3023 }
3024
3025 static int req_prot_init(const struct proto *prot)
3026 {
3027         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3028
3029         if (!rsk_prot)
3030                 return 0;
3031
3032         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3033                                         prot->name);
3034         if (!rsk_prot->slab_name)
3035                 return -ENOMEM;
3036
3037         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3038                                            rsk_prot->obj_size, 0,
3039                                            prot->slab_flags, NULL);
3040
3041         if (!rsk_prot->slab) {
3042                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3043                         prot->name);
3044                 return -ENOMEM;
3045         }
3046         return 0;
3047 }
3048
3049 int proto_register(struct proto *prot, int alloc_slab)
3050 {
3051         if (alloc_slab) {
3052                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3053                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
3054                                         NULL);
3055
3056                 if (prot->slab == NULL) {
3057                         pr_crit("%s: Can't create sock SLAB cache!\n",
3058                                 prot->name);
3059                         goto out;
3060                 }
3061
3062                 if (req_prot_init(prot))
3063                         goto out_free_request_sock_slab;
3064
3065                 if (prot->twsk_prot != NULL) {
3066                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3067
3068                         if (prot->twsk_prot->twsk_slab_name == NULL)
3069                                 goto out_free_request_sock_slab;
3070
3071                         prot->twsk_prot->twsk_slab =
3072                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3073                                                   prot->twsk_prot->twsk_obj_size,
3074                                                   0,
3075                                                   prot->slab_flags,
3076                                                   NULL);
3077                         if (prot->twsk_prot->twsk_slab == NULL)
3078                                 goto out_free_timewait_sock_slab_name;
3079                 }
3080         }
3081
3082         mutex_lock(&proto_list_mutex);
3083         list_add(&prot->node, &proto_list);
3084         assign_proto_idx(prot);
3085         mutex_unlock(&proto_list_mutex);
3086         return 0;
3087
3088 out_free_timewait_sock_slab_name:
3089         kfree(prot->twsk_prot->twsk_slab_name);
3090 out_free_request_sock_slab:
3091         req_prot_cleanup(prot->rsk_prot);
3092
3093         kmem_cache_destroy(prot->slab);
3094         prot->slab = NULL;
3095 out:
3096         return -ENOBUFS;
3097 }
3098 EXPORT_SYMBOL(proto_register);
3099
3100 void proto_unregister(struct proto *prot)
3101 {
3102         mutex_lock(&proto_list_mutex);
3103         release_proto_idx(prot);
3104         list_del(&prot->node);
3105         mutex_unlock(&proto_list_mutex);
3106
3107         kmem_cache_destroy(prot->slab);
3108         prot->slab = NULL;
3109
3110         req_prot_cleanup(prot->rsk_prot);
3111
3112         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3113                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3114                 kfree(prot->twsk_prot->twsk_slab_name);
3115                 prot->twsk_prot->twsk_slab = NULL;
3116         }
3117 }
3118 EXPORT_SYMBOL(proto_unregister);
3119
3120 #ifdef CONFIG_PROC_FS
3121 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3122         __acquires(proto_list_mutex)
3123 {
3124         mutex_lock(&proto_list_mutex);
3125         return seq_list_start_head(&proto_list, *pos);
3126 }
3127
3128 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3129 {
3130         return seq_list_next(v, &proto_list, pos);
3131 }
3132
3133 static void proto_seq_stop(struct seq_file *seq, void *v)
3134         __releases(proto_list_mutex)
3135 {
3136         mutex_unlock(&proto_list_mutex);
3137 }
3138
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method == NULL)
		return 'n';

	return 'y';
}
3143 static long sock_prot_memory_allocated(struct proto *proto)
3144 {
3145         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3146 }
3147
3148 static char *sock_prot_memory_pressure(struct proto *proto)
3149 {
3150         return proto->memory_pressure != NULL ?
3151         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3152 }
3153
3154 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3155 {
3156
3157         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3158                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3159                    proto->name,
3160                    proto->obj_size,
3161                    sock_prot_inuse_get(seq_file_net(seq), proto),
3162                    sock_prot_memory_allocated(proto),
3163                    sock_prot_memory_pressure(proto),
3164                    proto->max_header,
3165                    proto->slab == NULL ? "no" : "yes",
3166                    module_name(proto->owner),
3167                    proto_method_implemented(proto->close),
3168                    proto_method_implemented(proto->connect),
3169                    proto_method_implemented(proto->disconnect),
3170                    proto_method_implemented(proto->accept),
3171                    proto_method_implemented(proto->ioctl),
3172                    proto_method_implemented(proto->init),
3173                    proto_method_implemented(proto->destroy),
3174                    proto_method_implemented(proto->shutdown),
3175                    proto_method_implemented(proto->setsockopt),
3176                    proto_method_implemented(proto->getsockopt),
3177                    proto_method_implemented(proto->sendmsg),
3178                    proto_method_implemented(proto->recvmsg),
3179                    proto_method_implemented(proto->sendpage),
3180                    proto_method_implemented(proto->bind),
3181                    proto_method_implemented(proto->backlog_rcv),
3182                    proto_method_implemented(proto->hash),
3183                    proto_method_implemented(proto->unhash),
3184                    proto_method_implemented(proto->get_port),
3185                    proto_method_implemented(proto->enter_memory_pressure));
3186 }
3187
3188 static int proto_seq_show(struct seq_file *seq, void *v)
3189 {
3190         if (v == &proto_list)
3191                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3192                            "protocol",
3193                            "size",
3194                            "sockets",
3195                            "memory",
3196                            "press",
3197                            "maxhdr",
3198                            "slab",
3199                            "module",
3200                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3201         else
3202                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3203         return 0;
3204 }
3205
3206 static const struct seq_operations proto_seq_ops = {
3207         .start  = proto_seq_start,
3208         .next   = proto_seq_next,
3209         .stop   = proto_seq_stop,
3210         .show   = proto_seq_show,
3211 };
3212
3213 static int proto_seq_open(struct inode *inode, struct file *file)
3214 {
3215         return seq_open_net(inode, file, &proto_seq_ops,
3216                             sizeof(struct seq_net_private));
3217 }
3218
3219 static const struct file_operations proto_seq_fops = {
3220         .owner          = THIS_MODULE,
3221         .open           = proto_seq_open,
3222         .read           = seq_read,
3223         .llseek         = seq_lseek,
3224         .release        = seq_release_net,
3225 };
3226
3227 static __net_init int proto_init_net(struct net *net)
3228 {
3229         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3230                 return -ENOMEM;
3231
3232         return 0;
3233 }
3234
3235 static __net_exit void proto_exit_net(struct net *net)
3236 {
3237         remove_proc_entry("protocols", net->proc_net);
3238 }
3239
3240
3241 static __net_initdata struct pernet_operations proto_net_ops = {
3242         .init = proto_init_net,
3243         .exit = proto_exit_net,
3244 };
3245
3246 static int __init proto_init(void)
3247 {
3248         return register_pernet_subsys(&proto_net_ops);
3249 }
3250
3251 subsys_initcall(proto_init);
3252
3253 #endif /* PROC_FS */
3254
3255 #ifdef CONFIG_NET_RX_BUSY_POLL
3256 bool sk_busy_loop_end(void *p, unsigned long start_time)
3257 {
3258         struct sock *sk = p;
3259
3260         return !skb_queue_empty(&sk->sk_receive_queue) ||
3261                sk_busy_loop_timeout(sk, start_time);
3262 }
3263 EXPORT_SYMBOL(sk_busy_loop_end);
3264 #endif /* CONFIG_NET_RX_BUSY_POLL */