mm/slub.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * SLUB: A slab allocator that limits cache line use instead of queuing
   4  * objects in per cpu and per node lists.
   5  *
   6  * The allocator synchronizes using per slab locks or atomic operations
   7  * and only uses a centralized lock to manage a pool of partial slabs.
   8  *
   9  * (C) 2007 SGI, Christoph Lameter
  10  * (C) 2011 Linux Foundation, Christoph Lameter
  11  */
  12
  13 #include <linux/mm.h>
  14 #include <linux/swap.h> /* mm_account_reclaimed_pages() */
  15 #include <linux/module.h>
  16 #include <linux/bit_spinlock.h>
  17 #include <linux/interrupt.h>
  18 #include <linux/swab.h>
  19 #include <linux/bitops.h>
  20 #include <linux/slab.h>
  21 #include "slab.h"
  22 #include <linux/proc_fs.h>
  23 #include <linux/seq_file.h>
  24 #include <linux/kasan.h>
  25 #include <linux/kmsan.h>
  26 #include <linux/cpu.h>
  27 #include <linux/cpuset.h>
  28 #include <linux/mempolicy.h>
  29 #include <linux/ctype.h>
  30 #include <linux/stackdepot.h>
  31 #include <linux/debugobjects.h>
  32 #include <linux/kallsyms.h>
  33 #include <linux/kfence.h>
  34 #include <linux/memory.h>
  35 #include <linux/math64.h>
  36 #include <linux/fault-inject.h>
  37 #include <linux/kmemleak.h>
  38 #include <linux/stacktrace.h>
  39 #include <linux/prefetch.h>
  40 #include <linux/memcontrol.h>
  41 #include <linux/random.h>
  42 #include <kunit/test.h>
  43 #include <kunit/test-bug.h>
  44 #include <linux/sort.h>
  45
  46 #include <linux/debugfs.h>
  47 #include <trace/events/kmem.h>
  48
  49 #include "internal.h"
  50
  51 /*
  52  * Lock order:
  53  *   1. slab_mutex (Global Mutex)
  54  *   2. node->list_lock (Spinlock)
  55  *   3. kmem_cache->cpu_slab->lock (Local lock)
  56  *   4. slab_lock(slab) (Only on some arches)
  57  *   5. object_map_lock (Only for debugging)
  58  *
  59  *   slab_mutex
  60  *
  61  *   The role of the slab_mutex is to protect the list of all the slabs
  62  *   and to synchronize major metadata changes to slab cache structures.
  63  *   Also synchronizes memory hotplug callbacks.
  64  *
  65  *   slab_lock
  66  *
  67  *   The slab_lock is a wrapper around the page lock, thus it is a bit
  68  *   spinlock.
  69  *
  70  *   The slab_lock is only used on arches that do not have the ability
  71  *   to do a cmpxchg_double. It only protects:
  72  *
  73  *      A. slab->freelist       -> List of free objects in a slab
  74  *      B. slab->inuse          -> Number of objects in use
  75  *      C. slab->objects        -> Number of objects in slab
  76  *      D. slab->frozen         -> frozen state
  77  *
  78  *   Frozen slabs
  79  *
  80  *   If a slab is frozen then it is exempt from list management. It is
  81  *   the cpu slab which is actively allocated from by the processor that
  82  *   froze it and it is not on any list. The processor that froze the
  83  *   slab is the one who can perform list operations on the slab. Other
  84  *   processors may put objects onto the freelist but the processor that
  85  *   froze the slab is the only one that can retrieve the objects from the
  86  *   slab's freelist.
  87  *
  88  *   CPU partial slabs
  89  *
  90  *   The partially empty slabs cached on the CPU partial list are used
  91  *   for performance reasons, which speeds up the allocation process.
  92  *   These slabs are not frozen, but are also exempt from list management,
  93  *   by clearing the PG_workingset flag when moving out of the node
  94  *   partial list. Please see __slab_free() for more details.
  95  *
  96  *   To sum up, the current scheme is:
  97  *   - node partial slab: PG_Workingset && !frozen
  98  *   - cpu partial slab: !PG_Workingset && !frozen
  99  *   - cpu slab: !PG_Workingset && frozen
 100  *   - full slab: !PG_Workingset && !frozen
 101  *
 102  *   list_lock
 103  *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. While it is held, no slabs may be added to or
 *   removed from the lists, and the number of partial slabs cannot change.
 *   (Note that the total number of slabs is an atomic value that may be
 *   modified without taking the list lock).
 109  *
 110  *   The list_lock is a centralized lock and thus we avoid taking it as
 111  *   much as possible. As long as SLUB does not have to handle partial
 112  *   slabs, operations can continue without any centralized lock. F.e.
 113  *   allocating a long series of objects that fill up slabs does not require
 114  *   the list lock.
 115  *
 116  *   For debug caches, all allocations are forced to go through a list_lock
 117  *   protected region to serialize against concurrent validation.
 118  *
 119  *   cpu_slab->lock local lock
 120  *
 *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
 122  *   except the stat counters. This is a percpu structure manipulated only by
 123  *   the local cpu, so the lock protects against being preempted or interrupted
 124  *   by an irq. Fast path operations rely on lockless operations instead.
 125  *
 126  *   On PREEMPT_RT, the local lock neither disables interrupts nor preemption
 127  *   which means the lockless fastpath cannot be used as it might interfere with
 *   an in-progress slow path operation. In this case the local lock is always
 129  *   taken but it still utilizes the freelist for the common operations.
 130  *
 131  *   lockless fastpaths
 132  *
 133  *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
 134  *   are fully lockless when satisfied from the percpu slab (and when
 135  *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
 136  *   They also don't disable preemption or migration or irqs. They rely on
 137  *   the transaction id (tid) field to detect being preempted or moved to
 138  *   another cpu.
 139  *
 140  *   irq, preemption, migration considerations
 141  *
 142  *   Interrupts are disabled as part of list_lock or local_lock operations, or
 143  *   around the slab_lock operation, in order to make the slab allocator safe
 144  *   to use in the context of an irq.
 145  *
 146  *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
 147  *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
 148  *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
 149  *   doesn't have to be revalidated in each section protected by the local lock.
 150  *
 151  * SLUB assigns one slab for allocation to each processor.
 152  * Allocations only occur from these slabs called cpu slabs.
 153  *
 154  * Slabs with free elements are kept on a partial list and during regular
 155  * operations no list for full slabs is used. If an object in a full slab is
 156  * freed then the slab will show up again on the partial lists.
 157  * We track full slabs for debugging purposes though because otherwise we
 158  * cannot scan all objects.
 159  *
 160  * Slabs are freed when they become empty. Teardown and setup is
 161  * minimal so we rely on the page allocators per cpu caches for
 162  * fast frees and allocs.
 163  *
 164  * slab->frozen         The slab is frozen and exempt from list processing.
 165  *                      This means that the slab is dedicated to a purpose
 166  *                      such as satisfying allocations for a specific
 167  *                      processor. Objects may be freed in the slab while
 168  *                      it is frozen but slab_free will then skip the usual
 169  *                      list operations. It is up to the processor holding
 170  *                      the slab to integrate the slab into the slab lists
 171  *                      when the slab is no longer needed.
 172  *
 173  *                      One use of this flag is to mark slabs that are
 174  *                      used for allocations. Then such a slab becomes a cpu
 175  *                      slab. The cpu slab may be equipped with an additional
 176  *                      freelist that allows lockless access to
 177  *                      free objects in addition to the regular freelist
 178  *                      that requires the slab lock.
 179  *
 180  * SLAB_DEBUG_FLAGS     Slab requires special handling due to debug
 181  *                      options set. This moves slab handling out of
 182  *                      the fast path and disables lockless freelists.
 183  */
 184
/*
 * Helpers to obtain the current cpu's instance of a per-cpu variable and to
 * release it again.
 *
 * We could simply use migrate_disable()/enable() but as long as it's a
 * function call even on !PREEMPT_RT, use inline preempt_disable() there.
 *
 * USE_LOCKLESS_FAST_PATH() evaluates to true on !PREEMPT_RT and to false on
 * PREEMPT_RT.
 */
#ifndef CONFIG_PREEMPT_RT
#define slub_get_cpu_ptr(var)		get_cpu_ptr(var)
#define slub_put_cpu_ptr(var)		put_cpu_ptr(var)
#define USE_LOCKLESS_FAST_PATH()	(true)
#else
/* PREEMPT_RT: disable migration, then evaluate to this_cpu_ptr(var). */
#define slub_get_cpu_ptr(var)		\
({					\
	migrate_disable();		\
	this_cpu_ptr(var);		\
})
/* PREEMPT_RT: var is only evaluated and discarded; migration is re-enabled. */
#define slub_put_cpu_ptr(var)		\
do {					\
	(void)(var);			\
	migrate_enable();		\
} while (0)
#define USE_LOCKLESS_FAST_PATH()	(false)
#endif
 206
/*
 * Inlining annotation: expands to __always_inline unless CONFIG_SLUB_TINY
 * is set, in which case it expands to nothing.
 */
#ifndef CONFIG_SLUB_TINY
#define __fastpath_inline __always_inline
#else
#define __fastpath_inline
#endif
 212
/*
 * Static key slub_debug_enabled, only defined with CONFIG_SLUB_DEBUG.
 * It starts out true when CONFIG_SLUB_DEBUG_ON is set and false otherwise.
 */
#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif		/* CONFIG_SLUB_DEBUG */
 220
/*
 * Structure holding parameters for get_partial() call chain.
 *
 * NOTE(review): the per-field notes are inferred from the field names only;
 * the users of this structure are not visible here - confirm against them.
 */
struct partial_context {
	gfp_t flags;		/* presumably the GFP mask of the allocation */
	unsigned int orig_size;	/* presumably the originally requested size */
	void *object;		/* presumably an object passed back to the caller */
};
 227
 228 static inline bool kmem_cache_debug(struct kmem_cache *s)
 229 {
 230         return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
 231 }
 232
 233 static inline bool slub_debug_orig_size(struct kmem_cache *s)
 234 {
 235         return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
 236                         (s->flags & SLAB_KMALLOC));
 237 }
 238
 239 void *fixup_red_left(struct kmem_cache *s, void *p)
 240 {
 241         if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
 242                 p += s->red_left_pad;
 243
 244         return p;
 245 }
 246
 247 static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 248 {
 249 #ifdef CONFIG_SLUB_CPU_PARTIAL
 250         return !kmem_cache_debug(s);
 251 #else
 252         return false;
 253 #endif
 254 }
 255
 256 /*
 257  * Issues still to be resolved:
 258  *
 259  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 260  *
 261  * - Variable sizing of the per node arrays
 262  */
 263
/*
 * Enable to log cmpxchg failures: when SLUB_DEBUG_CMPXCHG is defined, the
 * freelist update helpers pr_info() each failed update attempt.
 */
#undef SLUB_DEBUG_CMPXCHG

#ifndef CONFIG_SLUB_TINY
/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10
#else
/* With CONFIG_SLUB_TINY both limits are zero. */
#define MIN_PARTIAL 0
#define MAX_PARTIAL 0
#endif
 284
/* Initial value of slub_debug when CONFIG_SLUB_DEBUG_ON is set. */
#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * These debug flags cannot use CMPXCHG because there might be consistency
 * issues when checking or reading debug information
 */
#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
				SLAB_TRACE)


/*
 * Debugging flags that require metadata to be stored in the slab.  These get
 * disabled when slab_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
 302
/*
 * Encoding of struct kmem_cache_order_objects: oo_make() places the page
 * order in the bits from OO_SHIFT upwards and adds the object count below;
 * oo_order() and oo_objects() extract them again via OO_SHIFT and OO_MASK.
 */
#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since slab.objects is u15 */

/* Internal SLUB flags */
/* Poison object */
#define __OBJECT_POISON		__SLAB_FLAG_BIT(_SLAB_OBJECT_POISON)
/*
 * Use cmpxchg_double. This is a real flag bit only when
 * system_has_freelist_aba is defined; otherwise it is __SLAB_FLAG_UNUSED.
 */

#ifdef system_has_freelist_aba
#define __CMPXCHG_DOUBLE	__SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE)
#else
#define __CMPXCHG_DOUBLE	__SLAB_FLAG_UNUSED
#endif
 317
/*
 * Tracking user of a slab.
 */
/* Size of the stack trace buffer that set_track_prepare() fills. */
#define TRACK_ADDRS_COUNT 16
struct track {
	unsigned long addr;	/* Called from address */
#ifdef CONFIG_STACKDEPOT
	depot_stack_handle_t handle;	/* Stack depot handle, CONFIG_STACKDEPOT only */
#endif
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};

/*
 * Selects which track record of an object is meant; get_track() uses the
 * value as an index relative to the first record.
 */
enum track_item { TRACK_ALLOC, TRACK_FREE };
 333
/*
 * sysfs hooks: forward declarations when SLAB_SUPPORTS_SYSFS is defined,
 * otherwise inline stubs that do nothing and return 0.
 */
#ifdef SLAB_SUPPORTS_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
#endif
 342
/*
 * debugfs hook: forward declaration when both CONFIG_DEBUG_FS and
 * CONFIG_SLUB_DEBUG are set, otherwise an empty inline stub.
 */
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
static void debugfs_slab_add(struct kmem_cache *);
#else
static inline void debugfs_slab_add(struct kmem_cache *s) { }
#endif
 348
/*
 * Event counters. Each value is an index into kmem_cache_cpu.stat[] and is
 * bumped through stat() / stat_add(); NR_SLUB_STAT_ITEMS is the array size
 * and must stay last.
 */
enum stat_item {
	ALLOC_FASTPATH,		/* Allocation from cpu slab */
	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
	FREE_FASTPATH,		/* Free to cpu slab */
	FREE_SLOWPATH,		/* Freeing not to cpu slab */
	FREE_FROZEN,		/* Freeing to frozen slab */
	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
	FREE_SLAB,		/* Slab freed to the page allocator */
	CPUSLAB_FLUSH,		/* Abandoning of the cpu slab */
	DEACTIVATE_FULL,	/* Cpu slab was full when deactivated */
	DEACTIVATE_EMPTY,	/* Cpu slab was empty when deactivated */
	DEACTIVATE_TO_HEAD,	/* Cpu slab was moved to the head of partials */
	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
	DEACTIVATE_BYPASS,	/* Implicit deactivation */
	ORDER_FALLBACK,		/* Number of times fallback was necessary */
	CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
	CMPXCHG_DOUBLE_FAIL,	/* Failures of slab freelist update */
	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
	NR_SLUB_STAT_ITEMS
};
 378
#ifndef CONFIG_SLUB_TINY
/*
 * Per-cpu state of a cache; not built with CONFIG_SLUB_TINY.
 *
 * When changing the layout, make sure freelist and tid are still compatible
 * with this_cpu_cmpxchg_double() alignment requirements.
 */
struct kmem_cache_cpu {
	union {
		/* freelist and tid share their storage with freelist_tid */
		struct {
			void **freelist;	/* Pointer to next available object */
			unsigned long tid;	/* Globally unique transaction id */
		};
		freelist_aba_t freelist_tid;
	};
	struct slab *slab;	/* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct slab *partial;	/* Partially allocated slabs */
#endif
	local_lock_t lock;	/* Protects the fields above */
#ifdef CONFIG_SLUB_STATS
	/* Event counters, indexed by enum stat_item */
	unsigned int stat[NR_SLUB_STAT_ITEMS];
#endif
};
#endif /* CONFIG_SLUB_TINY */
 402
/*
 * Increment event counter @si of cache @s on the current cpu.
 * Compiles to nothing unless CONFIG_SLUB_STATS is set.
 */
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
	/*
	 * The rmw is racy on a preemptible kernel but this is acceptable, so
	 * avoid this_cpu_add()'s irq-disable overhead.
	 */
	raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
 413
/*
 * Add @v to event counter @si of cache @s on the current cpu.
 * Compiles to nothing unless CONFIG_SLUB_STATS is set.
 */
static inline
void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
{
#ifdef CONFIG_SLUB_STATS
	/* Uses the raw_cpu_*() variant, like the increment in stat(). */
	raw_cpu_add(s->cpu_slab->stat[si], v);
#endif
}
 421
/*
 * The slab lists for all objects.
 */
struct kmem_cache_node {
	spinlock_t list_lock;		/* Protects the lists and nr_partial */
	unsigned long nr_partial;	/* Partial slab counter */
	struct list_head partial;	/* Partial list */
#ifdef CONFIG_SLUB_DEBUG
	atomic_long_t nr_slabs;		/* Atomic counter, present only with CONFIG_SLUB_DEBUG */
	atomic_long_t total_objects;	/* Atomic counter, present only with CONFIG_SLUB_DEBUG */
	struct list_head full;		/* Full list, present only with CONFIG_SLUB_DEBUG */
#endif
};
 435
 436 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 437 {
 438         return s->node[node];
 439 }
 440
/*
 * Iterator over all nodes. The body will be executed for each node that has
 * a kmem_cache_node structure allocated (which is true for all online nodes)
 *
 * @__s:    the cache whose nodes are visited
 * @__node: integer loop variable, runs from 0 to nr_node_ids - 1
 * @__n:    set to get_node(__s, __node); the body only runs when it is non-NULL
 *
 * The expansion ends in an unbraced "if", so an "else" placed directly after
 * the loop body would attach to that "if".
 */
#define for_each_kmem_cache_node(__s, __node, __n) \
	for (__node = 0; __node < nr_node_ids; __node++) \
		 if ((__n = get_node(__s, __node)))
 448
/*
 * Node mask recording for which NUMA nodes we have kmem_cache_nodes
 * allocated.
 * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
 * differ during memory hotplug/hotremove operations.
 * Protected by slab_mutex.
 */
static nodemask_t slab_nodes;
 456
#ifndef CONFIG_SLUB_TINY
/*
 * Workqueue used for flush_cpu_slab().
 * Not built with CONFIG_SLUB_TINY.
 */
static struct workqueue_struct *flushwq;
#endif
 463
 464 /********************************************************************
 465  *                      Core slab cache functions
 466  *******************************************************************/
 467
/*
 * freeptr_t represents a SLUB freelist pointer, which might be encoded
 * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
 * Convert with freelist_ptr_encode() / freelist_ptr_decode() rather than
 * using the raw value in ->v directly.
 */
typedef struct { unsigned long v; } freeptr_t;
 473
 474 /*
 475  * Returns freelist pointer (ptr). With hardening, this is obfuscated
 476  * with an XOR of the address where the pointer is held and a per-cache
 477  * random number.
 478  */
 479 static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s,
 480                                             void *ptr, unsigned long ptr_addr)
 481 {
 482         unsigned long encoded;
 483
 484 #ifdef CONFIG_SLAB_FREELIST_HARDENED
 485         encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr);
 486 #else
 487         encoded = (unsigned long)ptr;
 488 #endif
 489         return (freeptr_t){.v = encoded};
 490 }
 491
 492 static inline void *freelist_ptr_decode(const struct kmem_cache *s,
 493                                         freeptr_t ptr, unsigned long ptr_addr)
 494 {
 495         void *decoded;
 496
 497 #ifdef CONFIG_SLAB_FREELIST_HARDENED
 498         decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr));
 499 #else
 500         decoded = (void *)ptr.v;
 501 #endif
 502         return decoded;
 503 }
 504
 505 static inline void *get_freepointer(struct kmem_cache *s, void *object)
 506 {
 507         unsigned long ptr_addr;
 508         freeptr_t p;
 509
 510         object = kasan_reset_tag(object);
 511         ptr_addr = (unsigned long)object + s->offset;
 512         p = *(freeptr_t *)(ptr_addr);
 513         return freelist_ptr_decode(s, p, ptr_addr);
 514 }
 515
 516 #ifndef CONFIG_SLUB_TINY
 517 static void prefetch_freepointer(const struct kmem_cache *s, void *object)
 518 {
 519         prefetchw(object + s->offset);
 520 }
 521 #endif
 522
 523 /*
 524  * When running under KMSAN, get_freepointer_safe() may return an uninitialized
 525  * pointer value in the case the current thread loses the race for the next
 526  * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
 527  * slab_alloc_node() will fail, so the uninitialized value won't be used, but
 528  * KMSAN will still check all arguments of cmpxchg because of imperfect
 529  * handling of inline assembly.
 530  * To work around this problem, we apply __no_kmsan_checks to ensure that
 531  * get_freepointer_safe() returns initialized memory.
 532  */
 533 __no_kmsan_checks
 534 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
 535 {
 536         unsigned long freepointer_addr;
 537         freeptr_t p;
 538
 539         if (!debug_pagealloc_enabled_static())
 540                 return get_freepointer(s, object);
 541
 542         object = kasan_reset_tag(object);
 543         freepointer_addr = (unsigned long)object + s->offset;
 544         copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
 545         return freelist_ptr_decode(s, p, freepointer_addr);
 546 }
 547
 548 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 549 {
 550         unsigned long freeptr_addr = (unsigned long)object + s->offset;
 551
 552 #ifdef CONFIG_SLAB_FREELIST_HARDENED
 553         BUG_ON(object == fp); /* naive detection of double free or corruption */
 554 #endif
 555
 556         freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
 557         *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr);
 558 }
 559
 560 /*
 561  * See comment in calculate_sizes().
 562  */
 563 static inline bool freeptr_outside_object(struct kmem_cache *s)
 564 {
 565         return s->offset >= s->inuse;
 566 }
 567
 568 /*
 569  * Return offset of the end of info block which is inuse + free pointer if
 570  * not overlapping with object.
 571  */
 572 static inline unsigned int get_info_end(struct kmem_cache *s)
 573 {
 574         if (freeptr_outside_object(s))
 575                 return s->inuse + sizeof(void *);
 576         else
 577                 return s->inuse;
 578 }
 579
/*
 * Loop over all objects in a slab
 *
 * @__p:       cursor; starts at fixup_red_left(__s, __addr) and advances by
 *             (__s)->size per iteration
 * @__s:       the cache the slab belongs to
 * @__addr:    start address of the slab
 * @__objects: number of objects; iteration stops once __p reaches
 *             __addr + __objects * (__s)->size
 */
#define for_each_object(__p, __s, __addr, __objects) \
	for (__p = fixup_red_left(__s, __addr); \
		__p < (__addr) + (__objects) * (__s)->size; \
		__p += (__s)->size)
 585
 586 static inline unsigned int order_objects(unsigned int order, unsigned int size)
 587 {
 588         return ((unsigned int)PAGE_SIZE << order) / size;
 589 }
 590
 591 static inline struct kmem_cache_order_objects oo_make(unsigned int order,
 592                 unsigned int size)
 593 {
 594         struct kmem_cache_order_objects x = {
 595                 (order << OO_SHIFT) + order_objects(order, size)
 596         };
 597
 598         return x;
 599 }
 600
 601 static inline unsigned int oo_order(struct kmem_cache_order_objects x)
 602 {
 603         return x.x >> OO_SHIFT;
 604 }
 605
 606 static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
 607 {
 608         return x.x & OO_MASK;
 609 }
 610
 611 #ifdef CONFIG_SLUB_CPU_PARTIAL
 612 static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
 613 {
 614         unsigned int nr_slabs;
 615
 616         s->cpu_partial = nr_objects;
 617
 618         /*
 619          * We take the number of objects but actually limit the number of
 620          * slabs on the per cpu partial list, in order to limit excessive
 621          * growth of the list. For simplicity we assume that the slabs will
 622          * be half-full.
 623          */
 624         nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
 625         s->cpu_partial_slabs = nr_slabs;
 626 }
 627
 628 static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
 629 {
 630         return s->cpu_partial_slabs;
 631 }
 632 #else
 633 static inline void
 634 slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
 635 {
 636 }
 637
 638 static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
 639 {
 640         return 0;
 641 }
 642 #endif /* CONFIG_SLUB_CPU_PARTIAL */
 643
/*
 * Per slab locking using the pagelock
 *
 * slab_lock() takes the PG_locked bit spinlock in slab->__page_flags.
 */
static __always_inline void slab_lock(struct slab *slab)
{
	bit_spin_lock(PG_locked, &slab->__page_flags);
}
 651
/* Release the PG_locked bit spinlock in slab->__page_flags. */
static __always_inline void slab_unlock(struct slab *slab)
{
	bit_spin_unlock(PG_locked, &slab->__page_flags);
}
 656
 657 static inline bool
 658 __update_freelist_fast(struct slab *slab,
 659                       void *freelist_old, unsigned long counters_old,
 660                       void *freelist_new, unsigned long counters_new)
 661 {
 662 #ifdef system_has_freelist_aba
 663         freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
 664         freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
 665
 666         return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
 667 #else
 668         return false;
 669 #endif
 670 }
 671
 672 static inline bool
 673 __update_freelist_slow(struct slab *slab,
 674                       void *freelist_old, unsigned long counters_old,
 675                       void *freelist_new, unsigned long counters_new)
 676 {
 677         bool ret = false;
 678
 679         slab_lock(slab);
 680         if (slab->freelist == freelist_old &&
 681             slab->counters == counters_old) {
 682                 slab->freelist = freelist_new;
 683                 slab->counters = counters_new;
 684                 ret = true;
 685         }
 686         slab_unlock(slab);
 687
 688         return ret;
 689 }
 690
 691 /*
 692  * Interrupts must be disabled (for the fallback code to work right), typically
 693  * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
 694  * part of bit_spin_lock(), is sufficient because the policy is not to allow any
 695  * allocation/ free operation in hardirq context. Therefore nothing can
 696  * interrupt the operation.
 697  */
 698 static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
 699                 void *freelist_old, unsigned long counters_old,
 700                 void *freelist_new, unsigned long counters_new,
 701                 const char *n)
 702 {
 703         bool ret;
 704
 705         if (USE_LOCKLESS_FAST_PATH())
 706                 lockdep_assert_irqs_disabled();
 707
 708         if (s->flags & __CMPXCHG_DOUBLE) {
 709                 ret = __update_freelist_fast(slab, freelist_old, counters_old,
 710                                             freelist_new, counters_new);
 711         } else {
 712                 ret = __update_freelist_slow(slab, freelist_old, counters_old,
 713                                             freelist_new, counters_new);
 714         }
 715         if (likely(ret))
 716                 return true;
 717
 718         cpu_relax();
 719         stat(s, CMPXCHG_DOUBLE_FAIL);
 720
 721 #ifdef SLUB_DEBUG_CMPXCHG
 722         pr_info("%s %s: cmpxchg double redo ", n, s->name);
 723 #endif
 724
 725         return false;
 726 }
 727
 728 static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
 729                 void *freelist_old, unsigned long counters_old,
 730                 void *freelist_new, unsigned long counters_new,
 731                 const char *n)
 732 {
 733         bool ret;
 734
 735         if (s->flags & __CMPXCHG_DOUBLE) {
 736                 ret = __update_freelist_fast(slab, freelist_old, counters_old,
 737                                             freelist_new, counters_new);
 738         } else {
 739                 unsigned long flags;
 740
 741                 local_irq_save(flags);
 742                 ret = __update_freelist_slow(slab, freelist_old, counters_old,
 743                                             freelist_new, counters_new);
 744                 local_irq_restore(flags);
 745         }
 746         if (likely(ret))
 747                 return true;
 748
 749         cpu_relax();
 750         stat(s, CMPXCHG_DOUBLE_FAIL);
 751
 752 #ifdef SLUB_DEBUG_CMPXCHG
 753         pr_info("%s %s: cmpxchg double redo ", n, s->name);
 754 #endif
 755
 756         return false;
 757 }
 758
#ifdef CONFIG_SLUB_DEBUG
/*
 * Bitmap sized for MAX_OBJS_PER_PAGE bits, i.e. one bit per object of the
 * largest possible slab; __fill_map() sets the bits of the free objects.
 * NOTE(review): presumably object_map_lock serializes users of object_map;
 * those users are not visible here - confirm.
 */
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);
 762
 763 static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
 764                        struct slab *slab)
 765 {
 766         void *addr = slab_address(slab);
 767         void *p;
 768
 769         bitmap_zero(obj_map, slab->objects);
 770
 771         for (p = slab->freelist; p; p = get_freepointer(s, p))
 772                 set_bit(__obj_to_index(s, addr, p), obj_map);
 773 }
 774
 775 #if IS_ENABLED(CONFIG_KUNIT)
 776 static bool slab_add_kunit_errors(void)
 777 {
 778         struct kunit_resource *resource;
 779
 780         if (!kunit_get_current_test())
 781                 return false;
 782
 783         resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
 784         if (!resource)
 785                 return false;
 786
 787         (*(int *)resource->data)++;
 788         kunit_put_resource(resource);
 789         return true;
 790 }
 791
 792 static bool slab_in_kunit_test(void)
 793 {
 794         struct kunit_resource *resource;
 795
 796         if (!kunit_get_current_test())
 797                 return false;
 798
 799         resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
 800         if (!resource)
 801                 return false;
 802
 803         kunit_put_resource(resource);
 804         return true;
 805 }
 806 #else
 807 static inline bool slab_add_kunit_errors(void) { return false; }
 808 static inline bool slab_in_kunit_test(void) { return false; }
 809 #endif
 810
 811 static inline unsigned int size_from_object(struct kmem_cache *s)
 812 {
 813         if (s->flags & SLAB_RED_ZONE)
 814                 return s->size - s->red_left_pad;
 815
 816         return s->size;
 817 }
 818
 819 static inline void *restore_red_left(struct kmem_cache *s, void *p)
 820 {
 821         if (s->flags & SLAB_RED_ZONE)
 822                 p -= s->red_left_pad;
 823
 824         return p;
 825 }
 826
/*
 * Debug settings:
 *
 * slub_debug starts out as DEBUG_DEFAULT_FLAGS with CONFIG_SLUB_DEBUG_ON and
 * with no flags set otherwise.
 */
#if defined(CONFIG_SLUB_DEBUG_ON)
static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static slab_flags_t slub_debug;
#endif

/* NOTE(review): both are set outside this part of the file - see their users. */
static char *slub_debug_string;
static int disable_higher_order_debug;
 838
/*
 * slub is about to manipulate internal object metadata.  This memory lies
 * outside the range of the allocated object, so accessing it would normally
 * be reported by kasan as a bounds error.  metadata_access_enable() is used
 * to tell kasan that these accesses are OK.
 *
 * KMSAN checking is disabled for the current context as well. Pair every
 * call with metadata_access_disable().
 */
static inline void metadata_access_enable(void)
{
	kasan_disable_current();
	kmsan_disable_current();
}
 850
/*
 * Counterpart of metadata_access_enable(): re-enable KMSAN and then KASAN
 * for the current task, i.e. in the reverse order of the disable calls.
 */
static inline void metadata_access_disable(void)
{
	kmsan_enable_current();
	kasan_enable_current();
}
 856
 857 /*
 858  * Object debugging
 859  */
 860
 861 /* Verify that a pointer has an address that is valid within a slab page */
 862 static inline int check_valid_pointer(struct kmem_cache *s,
 863                                 struct slab *slab, void *object)
 864 {
 865         void *base;
 866
 867         if (!object)
 868                 return 1;
 869
 870         base = slab_address(slab);
 871         object = kasan_reset_tag(object);
 872         object = restore_red_left(s, object);
 873         if (object < base || object >= base + slab->objects * s->size ||
 874                 (object - base) % s->size) {
 875                 return 0;
 876         }
 877
 878         return 1;
 879 }
 880
 881 static void print_section(char *level, char *text, u8 *addr,
 882                           unsigned int length)
 883 {
 884         metadata_access_enable();
 885         print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
 886                         16, 1, kasan_reset_tag((void *)addr), length, 1);
 887         metadata_access_disable();
 888 }
 889
 890 static struct track *get_track(struct kmem_cache *s, void *object,
 891         enum track_item alloc)
 892 {
 893         struct track *p;
 894
 895         p = object + get_info_end(s);
 896
 897         return kasan_reset_tag(p + alloc);
 898 }
 899
 900 #ifdef CONFIG_STACKDEPOT
 901 static noinline depot_stack_handle_t set_track_prepare(void)
 902 {
 903         depot_stack_handle_t handle;
 904         unsigned long entries[TRACK_ADDRS_COUNT];
 905         unsigned int nr_entries;
 906
 907         nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
 908         handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
 909
 910         return handle;
 911 }
 912 #else
 913 static inline depot_stack_handle_t set_track_prepare(void)
 914 {
 915         return 0;
 916 }
 917 #endif
 918
 919 static void set_track_update(struct kmem_cache *s, void *object,
 920                              enum track_item alloc, unsigned long addr,
 921                              depot_stack_handle_t handle)
 922 {
 923         struct track *p = get_track(s, object, alloc);
 924
 925 #ifdef CONFIG_STACKDEPOT
 926         p->handle = handle;
 927 #endif
 928         p->addr = addr;
 929         p->cpu = smp_processor_id();
 930         p->pid = current->pid;
 931         p->when = jiffies;
 932 }
 933
 934 static __always_inline void set_track(struct kmem_cache *s, void *object,
 935                                       enum track_item alloc, unsigned long addr)
 936 {
 937         depot_stack_handle_t handle = set_track_prepare();
 938
 939         set_track_update(s, object, alloc, addr, handle);
 940 }
 941
 942 static void init_tracking(struct kmem_cache *s, void *object)
 943 {
 944         struct track *p;
 945
 946         if (!(s->flags & SLAB_STORE_USER))
 947                 return;
 948
 949         p = get_track(s, object, TRACK_ALLOC);
 950         memset(p, 0, 2*sizeof(struct track));
 951 }
 952
 953 static void print_track(const char *s, struct track *t, unsigned long pr_time)
 954 {
 955         depot_stack_handle_t handle __maybe_unused;
 956
 957         if (!t->addr)
 958                 return;
 959
 960         pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
 961                s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
 962 #ifdef CONFIG_STACKDEPOT
 963         handle = READ_ONCE(t->handle);
 964         if (handle)
 965                 stack_depot_print(handle);
 966         else
 967                 pr_err("object allocation/free stack trace missing\n");
 968 #endif
 969 }
 970
 971 void print_tracking(struct kmem_cache *s, void *object)
 972 {
 973         unsigned long pr_time = jiffies;
 974         if (!(s->flags & SLAB_STORE_USER))
 975                 return;
 976
 977         print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
 978         print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
 979 }
 980
/*
 * Print a one-line summary of @slab at error level: its address, the
 * object and in-use counters, the freelist head and the page flags.
 */
static void print_slab_info(const struct slab *slab)
{
	pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
	       slab, slab->objects, slab->inuse, slab->freelist,
	       &slab->__page_flags);
}
 987
 988 /*
 989  * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API
 990  * family will round up the real request size to these fixed ones, so
 991  * there could be an extra area than what is requested. Save the original
 992  * request size in the meta data area, for better debug and sanity check.
 993  */
 994 static inline void set_orig_size(struct kmem_cache *s,
 995                                 void *object, unsigned int orig_size)
 996 {
 997         void *p = kasan_reset_tag(object);
 998         unsigned int kasan_meta_size;
 999
1000         if (!slub_debug_orig_size(s))
1001                 return;
1002
1003         /*
1004          * KASAN can save its free meta data inside of the object at offset 0.
1005          * If this meta data size is larger than 'orig_size', it will overlap
1006          * the data redzone in [orig_size+1, object_size]. Thus, we adjust
1007          * 'orig_size' to be as at least as big as KASAN's meta data.
1008          */
1009         kasan_meta_size = kasan_metadata_size(s, true);
1010         if (kasan_meta_size > orig_size)
1011                 orig_size = kasan_meta_size;
1012
1013         p += get_info_end(s);
1014         p += sizeof(struct track) * 2;
1015
1016         *(unsigned int *)p = orig_size;
1017 }
1018
1019 static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
1020 {
1021         void *p = kasan_reset_tag(object);
1022
1023         if (!slub_debug_orig_size(s))
1024                 return s->object_size;
1025
1026         p += get_info_end(s);
1027         p += sizeof(struct track) * 2;
1028
1029         return *(unsigned int *)p;
1030 }
1031
/*
 * Record s->object_size as the original request size of @object. With the
 * stored size equal to the object size, check_object() has no "kmalloc
 * Redzone" range left to verify for this object. The const qualifier is
 * cast away because the metadata behind the object is written.
 */
void skip_orig_size_check(struct kmem_cache *s, const void *object)
{
	set_orig_size(s, (void *)object, s->object_size);
}
1036
1037 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
1038 {
1039         struct va_format vaf;
1040         va_list args;
1041
1042         va_start(args, fmt);
1043         vaf.fmt = fmt;
1044         vaf.va = &args;
1045         pr_err("=============================================================================\n");
1046         pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
1047         pr_err("-----------------------------------------------------------------------------\n\n");
1048         va_end(args);
1049 }
1050
/*
 * Log a "FIX <cache name>: <message>" line describing a repair that was
 * applied after a detected corruption. @fmt is a printf-style format.
 *
 * Nothing is printed when slab_add_kunit_errors() returns true, i.e. when
 * the error was accounted to a running KUnit test instead.
 */
__printf(2, 3)
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	if (slab_add_kunit_errors())
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_err("FIX %s: %pV\n", s->name, &vaf);
	va_end(args);
}
1066
/*
 * Dump everything known about object @p of @slab to the kernel log: its
 * tracking records, a one-line slab summary, the object's address, offset
 * within the slab and free pointer, hex dumps of the bytes around and
 * inside the object, any padding behind the metadata, and a stack dump.
 */
static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
{
	unsigned int off;	/* Offset of last byte */
	u8 *addr = slab_address(slab);

	print_tracking(s, p);

	print_slab_info(slab);

	pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
	       p, p - addr, get_freepointer(s, p));

	/*
	 * Show the left red zone if there is one; otherwise show the 16
	 * bytes preceding the object, provided the object lies more than
	 * 16 bytes past the slab start.
	 */
	if (s->flags & SLAB_RED_ZONE)
		print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
			      s->red_left_pad);
	else if (p > addr + 16)
		print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);

	/* The object dump is capped at PAGE_SIZE bytes. */
	print_section(KERN_ERR,         "Object   ", p,
		      min_t(unsigned int, s->object_size, PAGE_SIZE));
	/* Right red zone: the bytes between object_size and inuse. */
	if (s->flags & SLAB_RED_ZONE)
		print_section(KERN_ERR, "Redzone  ", p + s->object_size,
			s->inuse - s->object_size);

	/*
	 * Walk past the metadata (tracking records, original request size,
	 * KASAN metadata) to find where the trailing padding begins.
	 */
	off = get_info_end(s);

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	if (slub_debug_orig_size(s))
		off += sizeof(unsigned int);

	off += kasan_metadata_size(s, false);

	if (off != size_from_object(s))
		/* Beginning of the filler is the free pointer */
		print_section(KERN_ERR, "Padding  ", p + off,
			      size_from_object(s) - off);

	dump_stack();
}
1108
1109 static void object_err(struct kmem_cache *s, struct slab *slab,
1110                         u8 *object, char *reason)
1111 {
1112         if (slab_add_kunit_errors())
1113                 return;
1114
1115         slab_bug(s, "%s", reason);
1116         print_trailer(s, slab, object);
1117         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
1118 }
1119
1120 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
1121                                void **freelist, void *nextfree)
1122 {
1123         if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
1124             !check_valid_pointer(s, slab, nextfree) && freelist) {
1125                 object_err(s, slab, *freelist, "Freechain corrupt");
1126                 *freelist = NULL;
1127                 slab_fix(s, "Isolate corrupted freechain");
1128                 return true;
1129         }
1130
1131         return false;
1132 }
1133
/*
 * Report an error concerning a whole slab: print the BUG banner with the
 * printf-style message, the slab summary and a stack dump, then taint the
 * kernel with TAINT_BAD_PAGE. The whole report is suppressed when
 * slab_add_kunit_errors() returns true.
 *
 * The message is first formatted into a 100-byte stack buffer, so
 * anything beyond 99 characters is truncated by vsnprintf().
 */
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
			const char *fmt, ...)
{
	va_list args;
	char buf[100];

	if (slab_add_kunit_errors())
		return;

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	slab_bug(s, "%s", buf);
	print_slab_info(slab);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
1151
/*
 * (Re)initialise the debug patterns of @object.
 *
 * @val is the red zone fill byte; callers in this file pass
 * SLUB_RED_INACTIVE or SLUB_RED_ACTIVE. With SLAB_RED_ZONE the left red
 * zone and the area from the end of the poisoned range up to s->inuse are
 * filled with @val. With __OBJECT_POISON the object body is filled with
 * POISON_FREE and its last byte is set to POISON_END.
 */
static void init_object(struct kmem_cache *s, void *object, u8 val)
{
	u8 *p = kasan_reset_tag(object);
	/* Number of object bytes to poison; may shrink to the request size. */
	unsigned int poison_size = s->object_size;

	if (s->flags & SLAB_RED_ZONE) {
		/*
		 * Here and below, avoid overwriting the KMSAN shadow. Keeping
		 * the shadow makes it possible to distinguish uninit-value
		 * from use-after-free.
		 */
		memset_no_sanitize_memory(p - s->red_left_pad, val,
					  s->red_left_pad);

		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
			/*
			 * Redzone the extra allocated space by kmalloc than
			 * requested, and the poison size will be limited to
			 * the original request size accordingly.
			 */
			poison_size = get_orig_size(s, object);
		}
	}

	if (s->flags & __OBJECT_POISON) {
		memset_no_sanitize_memory(p, POISON_FREE, poison_size - 1);
		memset_no_sanitize_memory(p + poison_size - 1, POISON_END, 1);
	}

	/* Right red zone: everything from the poisoned range up to inuse. */
	if (s->flags & SLAB_RED_ZONE)
		memset_no_sanitize_memory(p + poison_size, val,
					  s->inuse - poison_size);
}
1185
/*
 * Repair a corrupted byte range by filling [@from, @to) with @data, after
 * logging the repair through slab_fix(). @message names the region in the
 * log line; the logged end address is the last byte written (@to - 1).
 */
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
						void *from, void *to)
{
	slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
	memset(from, data, to - from);
}
1192
/*
 * Attributes for the functions that scan padding and red zone bytes. With
 * CONFIG_KMSAN they are kept out of line and marked __no_kmsan_checks;
 * without it the macro expands to nothing.
 */
#ifdef CONFIG_KMSAN
#define pad_check_attributes noinline __no_kmsan_checks
#else
#define pad_check_attributes
#endif
1198
/*
 * Verify that the @bytes bytes starting at @start all equal @value.
 *
 * @what names the region for the report. @object is not referenced in the
 * body. On a mismatch the smallest range covering the first and the last
 * deviating byte is reported (unless the error was consumed by a KUnit
 * test) and then rewritten with @value.
 *
 * Returns 1 when the region is intact, 0 when a corruption was found and
 * repaired.
 */
static pad_check_attributes int
check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
		       u8 *object, char *what,
		       u8 *start, unsigned int value, unsigned int bytes)
{
	u8 *fault;
	u8 *end;
	u8 *addr = slab_address(slab);

	/* Find the first byte that differs from @value, if any. */
	metadata_access_enable();
	fault = memchr_inv(kasan_reset_tag(start), value, bytes);
	metadata_access_disable();
	if (!fault)
		return 1;

	/* Trim intact bytes off the tail so that end[-1] is the last bad byte. */
	end = start + bytes;
	while (end > fault && end[-1] == value)
		end--;

	if (slab_add_kunit_errors())
		goto skip_bug_print;

	slab_bug(s, "%s overwritten", what);
	pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
					fault, end - 1, fault - addr,
					fault[0], value);

skip_bug_print:
	/* The damaged range is restored even when the report was suppressed. */
	restore_bytes(s, what, value, fault, end);
	return 0;
}
1230
1231 /*
1232  * Object layout:
1233  *
1234  * object address
1235  *      Bytes of the object to be managed.
1236  *      If the freepointer may overlay the object then the free
1237  *      pointer is at the middle of the object.
1238  *
1239  *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
1240  *      0xa5 (POISON_END)
1241  *
1242  * object + s->object_size
1243  *      Padding to reach word boundary. This is also used for Redzoning.
1244  *      Padding is extended by another word if Redzoning is enabled and
1245  *      object_size == inuse.
1246  *
1247  *      We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with
1248  *      0xcc (SLUB_RED_ACTIVE) for objects in use.
1249  *
1250  * object + s->inuse
1251  *      Meta data starts here.
1252  *
1253  *      A. Free pointer (if we cannot overwrite object on free)
1254  *      B. Tracking data for SLAB_STORE_USER
1255  *      C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
1256  *      D. Padding to reach required alignment boundary or at minimum
1257  *              one word if debugging is on to be able to detect writes
1258  *              before the word boundary.
1259  *
1260  *      Padding is done using 0x5a (POISON_INUSE)
1261  *
1262  * object + s->size
1263  *      Nothing is used beyond s->size.
1264  *
1265  * If slabcaches are merged then the object_size and inuse boundaries are mostly
1266  * ignored. And therefore no slab options that rely on these boundaries
1267  * may be used with merged slabcaches.
1268  */
1269
1270 static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
1271 {
1272         unsigned long off = get_info_end(s);    /* The end of info */
1273
1274         if (s->flags & SLAB_STORE_USER) {
1275                 /* We also have user information there */
1276                 off += 2 * sizeof(struct track);
1277
1278                 if (s->flags & SLAB_KMALLOC)
1279                         off += sizeof(unsigned int);
1280         }
1281
1282         off += kasan_metadata_size(s, false);
1283
1284         if (size_from_object(s) == off)
1285                 return 1;
1286
1287         return check_bytes_and_report(s, slab, p, "Object padding",
1288                         p + off, POISON_INUSE, size_from_object(s) - off);
1289 }
1290
/*
 * Check the pad bytes at the end of a slab page.
 *
 * The slab size modulo s->size is the unused tail of the slab, which is
 * expected to hold POISON_INUSE. Only done when SLAB_POISON is set. A
 * corrupted tail is reported, dumped in full and then restored.
 */
static pad_check_attributes void
slab_pad_check(struct kmem_cache *s, struct slab *slab)
{
	u8 *start;
	u8 *fault;
	u8 *end;
	u8 *pad;
	int length;
	int remainder;

	if (!(s->flags & SLAB_POISON))
		return;

	start = slab_address(slab);
	length = slab_size(slab);
	end = start + length;
	remainder = length % s->size;
	/* Objects fill the slab exactly: there is no tail to check. */
	if (!remainder)
		return;

	pad = end - remainder;
	metadata_access_enable();
	fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
	metadata_access_disable();
	if (!fault)
		return;
	/* Trim intact bytes off the tail so that end[-1] is the last bad byte. */
	while (end > fault && end[-1] == POISON_INUSE)
		end--;

	slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
			fault, end - 1, fault - start);
	print_section(KERN_ERR, "Padding ", pad, remainder);

	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
}
1327
/*
 * Run all enabled integrity checks on @object of @slab.
 *
 * @val is the red zone byte the object is expected to carry; callers in
 * this file pass SLUB_RED_ACTIVE for objects in use and SLUB_RED_INACTIVE
 * for free ones. Depending on the cache flags this covers the red zones,
 * the kmalloc red zone behind the original request size, alignment
 * padding, the free-object poison pattern, the trailing pad bytes and the
 * free pointer.
 *
 * Every failing check is reported by its helper; damaged byte ranges are
 * restored and a corrupt free pointer is reset to NULL. Returns 1 when
 * all checks pass, 0 otherwise.
 */
static int check_object(struct kmem_cache *s, struct slab *slab,
					void *object, u8 val)
{
	u8 *p = object;
	u8 *endobject = object + s->object_size;
	unsigned int orig_size, kasan_meta_size;
	int ret = 1;

	if (s->flags & SLAB_RED_ZONE) {
		if (!check_bytes_and_report(s, slab, object, "Left Redzone",
			object - s->red_left_pad, val, s->red_left_pad))
			ret = 0;

		if (!check_bytes_and_report(s, slab, object, "Right Redzone",
			endobject, val, s->inuse - s->object_size))
			ret = 0;

		/*
		 * For an object in use, the bytes between the original
		 * request size and object_size must carry the red zone
		 * value as well.
		 */
		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
			orig_size = get_orig_size(s, object);

			if (s->object_size > orig_size  &&
				!check_bytes_and_report(s, slab, object,
					"kmalloc Redzone", p + orig_size,
					val, s->object_size - orig_size)) {
				ret = 0;
			}
		}
	} else {
		/* Without red zoning the gap up to inuse holds POISON_INUSE. */
		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
			if (!check_bytes_and_report(s, slab, p, "Alignment padding",
				endobject, POISON_INUSE,
				s->inuse - s->object_size))
				ret = 0;
		}
	}

	if (s->flags & SLAB_POISON) {
		/* The poison pattern is only checked on objects not in use. */
		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) {
			/*
			 * KASAN can save its free meta data inside of the
			 * object at offset 0. Thus, skip checking the part of
			 * the redzone that overlaps with the meta data.
			 */
			kasan_meta_size = kasan_metadata_size(s, true);
			if (kasan_meta_size < s->object_size - 1 &&
			    !check_bytes_and_report(s, slab, p, "Poison",
					p + kasan_meta_size, POISON_FREE,
					s->object_size - kasan_meta_size - 1))
				ret = 0;
			if (kasan_meta_size < s->object_size &&
			    !check_bytes_and_report(s, slab, p, "End Poison",
					p + s->object_size - 1, POISON_END, 1))
				ret = 0;
		}
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		if (!check_pad_bytes(s, slab, p))
			ret = 0;
	}

	/*
	 * Cannot check freepointer while object is allocated if
	 * object and freepointer overlap.
	 */
	if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) &&
	    !check_valid_pointer(s, slab, get_freepointer(s, p))) {
		object_err(s, slab, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		ret = 0;
	}

	/* Dump the object once for all failures, unless a KUnit test runs. */
	if (!ret && !slab_in_kunit_test()) {
		print_trailer(s, slab, object);
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
	}

	return ret;
}
1412
1413 static int check_slab(struct kmem_cache *s, struct slab *slab)
1414 {
1415         int maxobj;
1416
1417         if (!folio_test_slab(slab_folio(slab))) {
1418                 slab_err(s, slab, "Not a valid slab page");
1419                 return 0;
1420         }
1421
1422         maxobj = order_objects(slab_order(slab), s->size);
1423         if (slab->objects > maxobj) {
1424                 slab_err(s, slab, "objects %u > max %u",
1425                         slab->objects, maxobj);
1426                 return 0;
1427         }
1428         if (slab->inuse > slab->objects) {
1429                 slab_err(s, slab, "inuse %u > max %u",
1430                         slab->inuse, slab->objects);
1431                 return 0;
1432         }
1433         /* Slab_pad_check fixes things up after itself */
1434         slab_pad_check(s, slab);
1435         return 1;
1436 }
1437
1438 /*
1439  * Determine if a certain object in a slab is on the freelist. Must hold the
1440  * slab lock to guarantee that the chains are in a consistent state.
1441  */
1442 static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
1443 {
1444         int nr = 0;
1445         void *fp;
1446         void *object = NULL;
1447         int max_objects;
1448
1449         fp = slab->freelist;
1450         while (fp && nr <= slab->objects) {
1451                 if (fp == search)
1452                         return 1;
1453                 if (!check_valid_pointer(s, slab, fp)) {
1454                         if (object) {
1455                                 object_err(s, slab, object,
1456                                         "Freechain corrupt");
1457                                 set_freepointer(s, object, NULL);
1458                         } else {
1459                                 slab_err(s, slab, "Freepointer corrupt");
1460                                 slab->freelist = NULL;
1461                                 slab->inuse = slab->objects;
1462                                 slab_fix(s, "Freelist cleared");
1463                                 return 0;
1464                         }
1465                         break;
1466                 }
1467                 object = fp;
1468                 fp = get_freepointer(s, object);
1469                 nr++;
1470         }
1471
1472         max_objects = order_objects(slab_order(slab), s->size);
1473         if (max_objects > MAX_OBJS_PER_PAGE)
1474                 max_objects = MAX_OBJS_PER_PAGE;
1475
1476         if (slab->objects != max_objects) {
1477                 slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
1478                          slab->objects, max_objects);
1479                 slab->objects = max_objects;
1480                 slab_fix(s, "Number of objects adjusted");
1481         }
1482         if (slab->inuse != slab->objects - nr) {
1483                 slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
1484                          slab->inuse, slab->objects - nr);
1485                 slab->inuse = slab->objects - nr;
1486                 slab_fix(s, "Object count adjusted");
1487         }
1488         return search == NULL;
1489 }
1490
1491 static void trace(struct kmem_cache *s, struct slab *slab, void *object,
1492                                                                 int alloc)
1493 {
1494         if (s->flags & SLAB_TRACE) {
1495                 pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
1496                         s->name,
1497                         alloc ? "alloc" : "free",
1498                         object, slab->inuse,
1499                         slab->freelist);
1500
1501                 if (!alloc)
1502                         print_section(KERN_INFO, "Object ", (void *)object,
1503                                         s->object_size);
1504
1505                 dump_stack();
1506         }
1507 }
1508
1509 /*
1510  * Tracking of fully allocated slabs for debugging purposes.
1511  */
1512 static void add_full(struct kmem_cache *s,
1513         struct kmem_cache_node *n, struct slab *slab)
1514 {
1515         if (!(s->flags & SLAB_STORE_USER))
1516                 return;
1517
1518         lockdep_assert_held(&n->list_lock);
1519         list_add(&slab->slab_list, &n->full);
1520 }
1521
1522 static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
1523 {
1524         if (!(s->flags & SLAB_STORE_USER))
1525                 return;
1526
1527         lockdep_assert_held(&n->list_lock);
1528         list_del(&slab->slab_list);
1529 }
1530
/* Return the current value of the node's atomic slab counter, n->nr_slabs. */
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->nr_slabs);
}
1535
1536 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1537 {
1538         struct kmem_cache_node *n = get_node(s, node);
1539
1540         atomic_long_inc(&n->nr_slabs);
1541         atomic_long_add(objects, &n->total_objects);
1542 }
1543 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1544 {
1545         struct kmem_cache_node *n = get_node(s, node);
1546
1547         atomic_long_dec(&n->nr_slabs);
1548         atomic_long_sub(objects, &n->total_objects);
1549 }
1550
1551 /* Object debug checks for alloc/free paths */
1552 static void setup_object_debug(struct kmem_cache *s, void *object)
1553 {
1554         if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
1555                 return;
1556
1557         init_object(s, object, SLUB_RED_INACTIVE);
1558         init_tracking(s, object);
1559 }
1560
1561 static
1562 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
1563 {
1564         if (!kmem_cache_debug_flags(s, SLAB_POISON))
1565                 return;
1566
1567         metadata_access_enable();
1568         memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
1569         metadata_access_disable();
1570 }
1571
1572 static inline int alloc_consistency_checks(struct kmem_cache *s,
1573                                         struct slab *slab, void *object)
1574 {
1575         if (!check_slab(s, slab))
1576                 return 0;
1577
1578         if (!check_valid_pointer(s, slab, object)) {
1579                 object_err(s, slab, object, "Freelist Pointer check fails");
1580                 return 0;
1581         }
1582
1583         if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
1584                 return 0;
1585
1586         return 1;
1587 }
1588
1589 static noinline bool alloc_debug_processing(struct kmem_cache *s,
1590                         struct slab *slab, void *object, int orig_size)
1591 {
1592         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1593                 if (!alloc_consistency_checks(s, slab, object))
1594                         goto bad;
1595         }
1596
1597         /* Success. Perform special debug activities for allocs */
1598         trace(s, slab, object, 1);
1599         set_orig_size(s, object, orig_size);
1600         init_object(s, object, SLUB_RED_ACTIVE);
1601         return true;
1602
1603 bad:
1604         if (folio_test_slab(slab_folio(slab))) {
1605                 /*
1606                  * If this is a slab page then lets do the best we can
1607                  * to avoid issues in the future. Marking all objects
1608                  * as used avoids touching the remaining objects.
1609                  */
1610                 slab_fix(s, "Marking all objects used");
1611                 slab->inuse = slab->objects;
1612                 slab->freelist = NULL;
1613         }
1614         return false;
1615 }
1616
1617 static inline int free_consistency_checks(struct kmem_cache *s,
1618                 struct slab *slab, void *object, unsigned long addr)
1619 {
1620         if (!check_valid_pointer(s, slab, object)) {
1621                 slab_err(s, slab, "Invalid object pointer 0x%p", object);
1622                 return 0;
1623         }
1624
1625         if (on_freelist(s, slab, object)) {
1626                 object_err(s, slab, object, "Object already free");
1627                 return 0;
1628         }
1629
1630         if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
1631                 return 0;
1632
1633         if (unlikely(s != slab->slab_cache)) {
1634                 if (!folio_test_slab(slab_folio(slab))) {
1635                         slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
1636                                  object);
1637                 } else if (!slab->slab_cache) {
1638                         pr_err("SLUB <none>: no slab for object 0x%p.\n",
1639                                object);
1640                         dump_stack();
1641                 } else
1642                         object_err(s, slab, object,
1643                                         "page slab pointer corrupt.");
1644                 return 0;
1645         }
1646         return 1;
1647 }
1648
1649 /*
1650  * Parse a block of slab_debug options. Blocks are delimited by ';'
1651  *
1652  * @str:    start of block
1653  * @flags:  returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
1654  * @slabs:  return start of list of slabs, or NULL when there's no list
1655  * @init:   assume this is initial parsing and not per-kmem-create parsing
1656  *
1657  * returns the start of next block if there's any, or NULL
1658  */
1659 static char *
1660 parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
1661 {
1662         bool higher_order_disable = false;
1663
1664         /* Skip any completely empty blocks */
1665         while (*str && *str == ';')
1666                 str++;
1667
1668         if (*str == ',') {
1669                 /*
1670                  * No options but restriction on slabs. This means full
1671                  * debugging for slabs matching a pattern.
1672                  */
1673                 *flags = DEBUG_DEFAULT_FLAGS;
1674                 goto check_slabs;
1675         }
1676         *flags = 0;
1677
1678         /* Determine which debug features should be switched on */
1679         for (; *str && *str != ',' && *str != ';'; str++) {
1680                 switch (tolower(*str)) {
1681                 case '-':
1682                         *flags = 0;
1683                         break;
1684                 case 'f':
1685                         *flags |= SLAB_CONSISTENCY_CHECKS;
1686                         break;
1687                 case 'z':
1688                         *flags |= SLAB_RED_ZONE;
1689                         break;
1690                 case 'p':
1691                         *flags |= SLAB_POISON;
1692                         break;
1693                 case 'u':
1694                         *flags |= SLAB_STORE_USER;
1695                         break;
1696                 case 't':
1697                         *flags |= SLAB_TRACE;
1698                         break;
1699                 case 'a':
1700                         *flags |= SLAB_FAILSLAB;
1701                         break;
1702                 case 'o':
1703                         /*
1704                          * Avoid enabling debugging on caches if its minimum
1705                          * order would increase as a result.
1706                          */
1707                         higher_order_disable = true;
1708                         break;
1709                 default:
1710                         if (init)
1711                                 pr_err("slab_debug option '%c' unknown. skipped\n", *str);
1712                 }
1713         }
1714 check_slabs:
1715         if (*str == ',')
1716                 *slabs = ++str;
1717         else
1718                 *slabs = NULL;
1719
1720         /* Skip over the slab list */
1721         while (*str && *str != ';')
1722                 str++;
1723
1724         /* Skip any completely empty blocks */
1725         while (*str && *str == ';')
1726                 str++;
1727
1728         if (init && higher_order_disable)
1729                 disable_higher_order_debug = 1;
1730
1731         if (*str)
1732                 return str;
1733         else
1734                 return NULL;
1735 }
1736
1737 static int __init setup_slub_debug(char *str)
1738 {
1739         slab_flags_t flags;
1740         slab_flags_t global_flags;
1741         char *saved_str;
1742         char *slab_list;
1743         bool global_slub_debug_changed = false;
1744         bool slab_list_specified = false;
1745
1746         global_flags = DEBUG_DEFAULT_FLAGS;
1747         if (*str++ != '=' || !*str)
1748                 /*
1749                  * No options specified. Switch on full debugging.
1750                  */
1751                 goto out;
1752
1753         saved_str = str;
1754         while (str) {
1755                 str = parse_slub_debug_flags(str, &flags, &slab_list, true);
1756
1757                 if (!slab_list) {
1758                         global_flags = flags;
1759                         global_slub_debug_changed = true;
1760                 } else {
1761                         slab_list_specified = true;
1762                         if (flags & SLAB_STORE_USER)
1763                                 stack_depot_request_early_init();
1764                 }
1765         }
1766
1767         /*
1768          * For backwards compatibility, a single list of flags with list of
1769          * slabs means debugging is only changed for those slabs, so the global
1770          * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
1771          * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as
1772          * long as there is no option specifying flags without a slab list.
1773          */
1774         if (slab_list_specified) {
1775                 if (!global_slub_debug_changed)
1776                         global_flags = slub_debug;
1777                 slub_debug_string = saved_str;
1778         }
1779 out:
1780         slub_debug = global_flags;
1781         if (slub_debug & SLAB_STORE_USER)
1782                 stack_depot_request_early_init();
1783         if (slub_debug != 0 || slub_debug_string)
1784                 static_branch_enable(&slub_debug_enabled);
1785         else
1786                 static_branch_disable(&slub_debug_enabled);
1787         if ((static_branch_unlikely(&init_on_alloc) ||
1788              static_branch_unlikely(&init_on_free)) &&
1789             (slub_debug & SLAB_POISON))
1790                 pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
1791         return 1;
1792 }
1793
1794 __setup("slab_debug", setup_slub_debug);
1795 __setup_param("slub_debug", slub_debug, setup_slub_debug, 0);
1796
1797 /*
1798  * kmem_cache_flags - apply debugging options to the cache
1799  * @flags:              flags to set
1800  * @name:               name of the cache
1801  *
1802  * Debug option(s) are applied to @flags. In addition to the debug
1803  * option(s), if a slab name (or multiple) is specified i.e.
1804  * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ...
1805  * then only the select slabs will receive the debug option(s).
1806  */
1807 slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
1808 {
1809         char *iter;
1810         size_t len;
1811         char *next_block;
1812         slab_flags_t block_flags;
1813         slab_flags_t slub_debug_local = slub_debug;
1814
1815         if (flags & SLAB_NO_USER_FLAGS)
1816                 return flags;
1817
1818         /*
1819          * If the slab cache is for debugging (e.g. kmemleak) then
1820          * don't store user (stack trace) information by default,
1821          * but let the user enable it via the command line below.
1822          */
1823         if (flags & SLAB_NOLEAKTRACE)
1824                 slub_debug_local &= ~SLAB_STORE_USER;
1825
1826         len = strlen(name);
1827         next_block = slub_debug_string;
1828         /* Go through all blocks of debug options, see if any matches our slab's name */
1829         while (next_block) {
1830                 next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
1831                 if (!iter)
1832                         continue;
1833                 /* Found a block that has a slab list, search it */
1834                 while (*iter) {
1835                         char *end, *glob;
1836                         size_t cmplen;
1837
1838                         end = strchrnul(iter, ',');
1839                         if (next_block && next_block < end)
1840                                 end = next_block - 1;
1841
1842                         glob = strnchr(iter, end - iter, '*');
1843                         if (glob)
1844                                 cmplen = glob - iter;
1845                         else
1846                                 cmplen = max_t(size_t, len, (end - iter));
1847
1848                         if (!strncmp(name, iter, cmplen)) {
1849                                 flags |= block_flags;
1850                                 return flags;
1851                         }
1852
1853                         if (!*end || *end == ';')
1854                                 break;
1855                         iter = end + 1;
1856                 }
1857         }
1858
1859         return flags | slub_debug_local;
1860 }
1861 #else /* !CONFIG_SLUB_DEBUG */
1862 static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
1863 static inline
1864 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
1865
1866 static inline bool alloc_debug_processing(struct kmem_cache *s,
1867         struct slab *slab, void *object, int orig_size) { return true; }
1868
1869 static inline bool free_debug_processing(struct kmem_cache *s,
1870         struct slab *slab, void *head, void *tail, int *bulk_cnt,
1871         unsigned long addr, depot_stack_handle_t handle) { return true; }
1872
1873 static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
1874 static inline int check_object(struct kmem_cache *s, struct slab *slab,
1875                         void *object, u8 val) { return 1; }
1876 static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
1877 static inline void set_track(struct kmem_cache *s, void *object,
1878                              enum track_item alloc, unsigned long addr) {}
1879 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1880                                         struct slab *slab) {}
1881 static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1882                                         struct slab *slab) {}
1883 slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
1884 {
1885         return flags;
1886 }
1887 #define slub_debug 0
1888
1889 #define disable_higher_order_debug 0
1890
1891 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1892                                                         { return 0; }
1893 static inline void inc_slabs_node(struct kmem_cache *s, int node,
1894                                                         int objects) {}
1895 static inline void dec_slabs_node(struct kmem_cache *s, int node,
1896                                                         int objects) {}
1897
1898 #ifndef CONFIG_SLUB_TINY
1899 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
1900                                void **freelist, void *nextfree)
1901 {
1902         return false;
1903 }
1904 #endif
1905 #endif /* CONFIG_SLUB_DEBUG */
1906
1907 #ifdef CONFIG_SLAB_OBJ_EXT
1908
1909 #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
1910
1911 static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
1912 {
1913         struct slabobj_ext *slab_exts;
1914         struct slab *obj_exts_slab;
1915
1916         obj_exts_slab = virt_to_slab(obj_exts);
1917         slab_exts = slab_obj_exts(obj_exts_slab);
1918         if (slab_exts) {
1919                 unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
1920                                                  obj_exts_slab, obj_exts);
1921                 /* codetag should be NULL */
1922                 WARN_ON(slab_exts[offs].ref.ct);
1923                 set_codetag_empty(&slab_exts[offs].ref);
1924         }
1925 }
1926
1927 static inline void mark_failed_objexts_alloc(struct slab *slab)
1928 {
1929         slab->obj_exts = OBJEXTS_ALLOC_FAIL;
1930 }
1931
1932 static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
1933                         struct slabobj_ext *vec, unsigned int objects)
1934 {
1935         /*
1936          * If vector previously failed to allocate then we have live
1937          * objects with no tag reference. Mark all references in this
1938          * vector as empty to avoid warnings later on.
1939          */
1940         if (obj_exts & OBJEXTS_ALLOC_FAIL) {
1941                 unsigned int i;
1942
1943                 for (i = 0; i < objects; i++)
1944                         set_codetag_empty(&vec[i].ref);
1945         }
1946 }
1947
1948 #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
1949
/* No-op versions for !CONFIG_MEM_ALLOC_PROFILING_DEBUG. */
static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
{
}

static inline void mark_failed_objexts_alloc(struct slab *slab)
{
}

static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
					       struct slabobj_ext *vec,
					       unsigned int objects)
{
}
1954
1955 #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
1956
1957 /*
1958  * The allocated objcg pointers array is not accounted directly.
1959  * Moreover, it should not come from DMA buffer and is not readily
1960  * reclaimable. So those GFP bits should be masked off.
1961  */
1962 #define OBJCGS_CLEAR_MASK       (__GFP_DMA | __GFP_RECLAIMABLE | \
1963                                 __GFP_ACCOUNT | __GFP_NOFAIL)
1964
1965 int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
1966                         gfp_t gfp, bool new_slab)
1967 {
1968         unsigned int objects = objs_per_slab(s, slab);
1969         unsigned long new_exts;
1970         unsigned long old_exts;
1971         struct slabobj_ext *vec;
1972
1973         gfp &= ~OBJCGS_CLEAR_MASK;
1974         /* Prevent recursive extension vector allocation */
1975         gfp |= __GFP_NO_OBJ_EXT;
1976         vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
1977                            slab_nid(slab));
1978         if (!vec) {
1979                 /* Mark vectors which failed to allocate */
1980                 if (new_slab)
1981                         mark_failed_objexts_alloc(slab);
1982
1983                 return -ENOMEM;
1984         }
1985
1986         new_exts = (unsigned long)vec;
1987 #ifdef CONFIG_MEMCG
1988         new_exts |= MEMCG_DATA_OBJEXTS;
1989 #endif
1990         old_exts = READ_ONCE(slab->obj_exts);
1991         handle_failed_objexts_alloc(old_exts, vec, objects);
1992         if (new_slab) {
1993                 /*
1994                  * If the slab is brand new and nobody can yet access its
1995                  * obj_exts, no synchronization is required and obj_exts can
1996                  * be simply assigned.
1997                  */
1998                 slab->obj_exts = new_exts;
1999         } else if ((old_exts & ~OBJEXTS_FLAGS_MASK) ||
2000                    cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
2001                 /*
2002                  * If the slab is already in use, somebody can allocate and
2003                  * assign slabobj_exts in parallel. In this case the existing
2004                  * objcg vector should be reused.
2005                  */
2006                 mark_objexts_empty(vec);
2007                 kfree(vec);
2008                 return 0;
2009         }
2010
2011         kmemleak_not_leak(vec);
2012         return 0;
2013 }
2014
2015 static inline void free_slab_obj_exts(struct slab *slab)
2016 {
2017         struct slabobj_ext *obj_exts;
2018
2019         obj_exts = slab_obj_exts(slab);
2020         if (!obj_exts)
2021                 return;
2022
2023         /*
2024          * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
2025          * corresponding extension will be NULL. alloc_tag_sub() will throw a
2026          * warning if slab has extensions but the extension of an object is
2027          * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that
2028          * the extension for obj_exts is expected to be NULL.
2029          */
2030         mark_objexts_empty(obj_exts);
2031         kfree(obj_exts);
2032         slab->obj_exts = 0;
2033 }
2034
2035 static inline bool need_slab_obj_ext(void)
2036 {
2037         if (mem_alloc_profiling_enabled())
2038                 return true;
2039
2040         /*
2041          * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally
2042          * inside memcg_slab_post_alloc_hook. No other users for now.
2043          */
2044         return false;
2045 }
2046
2047 #else /* CONFIG_SLAB_OBJ_EXT */
2048
2049 static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
2050                                gfp_t gfp, bool new_slab)
2051 {
2052         return 0;
2053 }
2054
2055 static inline void free_slab_obj_exts(struct slab *slab)
2056 {
2057 }
2058
2059 static inline bool need_slab_obj_ext(void)
2060 {
2061         return false;
2062 }
2063
2064 #endif /* CONFIG_SLAB_OBJ_EXT */
2065
2066 #ifdef CONFIG_MEM_ALLOC_PROFILING
2067
2068 static inline struct slabobj_ext *
2069 prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
2070 {
2071         struct slab *slab;
2072
2073         if (!p)
2074                 return NULL;
2075
2076         if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
2077                 return NULL;
2078
2079         if (flags & __GFP_NO_OBJ_EXT)
2080                 return NULL;
2081
2082         slab = virt_to_slab(p);
2083         if (!slab_obj_exts(slab) &&
2084             WARN(alloc_slab_obj_exts(slab, s, flags, false),
2085                  "%s, %s: Failed to create slab extension vector!\n",
2086                  __func__, s->name))
2087                 return NULL;
2088
2089         return slab_obj_exts(slab) + obj_to_index(s, slab, p);
2090 }
2091
2092 static inline void
2093 alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
2094 {
2095         if (need_slab_obj_ext()) {
2096                 struct slabobj_ext *obj_exts;
2097
2098                 obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
2099                 /*
2100                  * Currently obj_exts is used only for allocation profiling.
2101                  * If other users appear then mem_alloc_profiling_enabled()
2102                  * check should be added before alloc_tag_add().
2103                  */
2104                 if (likely(obj_exts))
2105                         alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
2106         }
2107 }
2108
2109 static inline void
2110 alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2111                              int objects)
2112 {
2113         struct slabobj_ext *obj_exts;
2114         int i;
2115
2116         if (!mem_alloc_profiling_enabled())
2117                 return;
2118
2119         obj_exts = slab_obj_exts(slab);
2120         if (!obj_exts)
2121                 return;
2122
2123         for (i = 0; i < objects; i++) {
2124                 unsigned int off = obj_to_index(s, slab, p[i]);
2125
2126                 alloc_tag_sub(&obj_exts[off].ref, s->size);
2127         }
2128 }
2129
2130 #else /* CONFIG_MEM_ALLOC_PROFILING */
2131
2132 static inline void
2133 alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
2134 {
2135 }
2136
2137 static inline void
2138 alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2139                              int objects)
2140 {
2141 }
2142
2143 #endif /* CONFIG_MEM_ALLOC_PROFILING */
2144
2145
2146 #ifdef CONFIG_MEMCG
2147
2148 static void memcg_alloc_abort_single(struct kmem_cache *s, void *object);
2149
2150 static __fastpath_inline
2151 bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
2152                                 gfp_t flags, size_t size, void **p)
2153 {
2154         if (likely(!memcg_kmem_online()))
2155                 return true;
2156
2157         if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
2158                 return true;
2159
2160         if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p)))
2161                 return true;
2162
2163         if (likely(size == 1)) {
2164                 memcg_alloc_abort_single(s, *p);
2165                 *p = NULL;
2166         } else {
2167                 kmem_cache_free_bulk(s, size, p);
2168         }
2169
2170         return false;
2171 }
2172
2173 static __fastpath_inline
2174 void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2175                           int objects)
2176 {
2177         struct slabobj_ext *obj_exts;
2178
2179         if (!memcg_kmem_online())
2180                 return;
2181
2182         obj_exts = slab_obj_exts(slab);
2183         if (likely(!obj_exts))
2184                 return;
2185
2186         __memcg_slab_free_hook(s, slab, p, objects, obj_exts);
2187 }
2188 #else /* CONFIG_MEMCG */
2189 static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
2190                                               struct list_lru *lru,
2191                                               gfp_t flags, size_t size,
2192                                               void **p)
2193 {
2194         return true;
2195 }
2196
2197 static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
2198                                         void **p, int objects)
2199 {
2200 }
2201 #endif /* CONFIG_MEMCG */
2202
2203 /*
2204  * Hooks for other subsystems that check memory allocations. In a typical
2205  * production configuration these hooks all should produce no code at all.
2206  *
2207  * Returns true if freeing of the object can proceed, false if its reuse
2208  * was delayed by KASAN quarantine, or it was returned to KFENCE.
2209  */
2210 static __always_inline
2211 bool slab_free_hook(struct kmem_cache *s, void *x, bool init)
2212 {
2213         kmemleak_free_recursive(x, s->flags);
2214         kmsan_slab_free(s, x);
2215
2216         debug_check_no_locks_freed(x, s->object_size);
2217
2218         if (!(s->flags & SLAB_DEBUG_OBJECTS))
2219                 debug_check_no_obj_freed(x, s->object_size);
2220
2221         /* Use KCSAN to help debug racy use-after-free. */
2222         if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
2223                 __kcsan_check_access(x, s->object_size,
2224                                      KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
2225
2226         if (kfence_free(x))
2227                 return false;
2228
2229         /*
2230          * As memory initialization might be integrated into KASAN,
2231          * kasan_slab_free and initialization memset's must be
2232          * kept together to avoid discrepancies in behavior.
2233          *
2234          * The initialization memset's clear the object and the metadata,
2235          * but don't touch the SLAB redzone.
2236          *
2237          * The object's freepointer is also avoided if stored outside the
2238          * object.
2239          */
2240         if (unlikely(init)) {
2241                 int rsize;
2242                 unsigned int inuse;
2243
2244                 inuse = get_info_end(s);
2245                 if (!kasan_has_integrated_init())
2246                         memset(kasan_reset_tag(x), 0, s->object_size);
2247                 rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
2248                 memset((char *)kasan_reset_tag(x) + inuse, 0,
2249                        s->size - inuse - rsize);
2250         }
2251         /* KASAN might put x into memory quarantine, delaying its reuse. */
2252         return !kasan_slab_free(s, x, init);
2253 }
2254
2255 static __fastpath_inline
2256 bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
2257                              int *cnt)
2258 {
2259
2260         void *object;
2261         void *next = *head;
2262         void *old_tail = *tail;
2263         bool init;
2264
2265         if (is_kfence_address(next)) {
2266                 slab_free_hook(s, next, false);
2267                 return false;
2268         }
2269
2270         /* Head and tail of the reconstructed freelist */
2271         *head = NULL;
2272         *tail = NULL;
2273
2274         init = slab_want_init_on_free(s);
2275
2276         do {
2277                 object = next;
2278                 next = get_freepointer(s, object);
2279
2280                 /* If object's reuse doesn't have to be delayed */
2281                 if (likely(slab_free_hook(s, object, init))) {
2282                         /* Move object to the new freelist */
2283                         set_freepointer(s, object, *head);
2284                         *head = object;
2285                         if (!*tail)
2286                                 *tail = object;
2287                 } else {
2288                         /*
2289                          * Adjust the reconstructed freelist depth
2290                          * accordingly if object's reuse is delayed.
2291                          */
2292                         --(*cnt);
2293                 }
2294         } while (object != old_tail);
2295
2296         return *head != NULL;
2297 }
2298
2299 static void *setup_object(struct kmem_cache *s, void *object)
2300 {
2301         setup_object_debug(s, object);
2302         object = kasan_init_slab_obj(s, object);
2303         if (unlikely(s->ctor)) {
2304                 kasan_unpoison_new_object(s, object);
2305                 s->ctor(object);
2306                 kasan_poison_new_object(s, object);
2307         }
2308         return object;
2309 }
2310
2311 /*
2312  * Slab allocation and freeing
2313  */
2314 static inline struct slab *alloc_slab_page(gfp_t flags, int node,
2315                 struct kmem_cache_order_objects oo)
2316 {
2317         struct folio *folio;
2318         struct slab *slab;
2319         unsigned int order = oo_order(oo);
2320
2321         folio = (struct folio *)alloc_pages_node(node, flags, order);
2322         if (!folio)
2323                 return NULL;
2324
2325         slab = folio_slab(folio);
2326         __folio_set_slab(folio);
2327         /* Make the flag visible before any changes to folio->mapping */
2328         smp_wmb();
2329         if (folio_is_pfmemalloc(folio))
2330                 slab_set_pfmemalloc(slab);
2331
2332         return slab;
2333 }
2334
2335 #ifdef CONFIG_SLAB_FREELIST_RANDOM
2336 /* Pre-initialize the random sequence cache */
2337 static int init_cache_random_seq(struct kmem_cache *s)
2338 {
2339         unsigned int count = oo_objects(s->oo);
2340         int err;
2341
2342         /* Bailout if already initialised */
2343         if (s->random_seq)
2344                 return 0;
2345
2346         err = cache_random_seq_create(s, count, GFP_KERNEL);
2347         if (err) {
2348                 pr_err("SLUB: Unable to initialize free list for %s\n",
2349                         s->name);
2350                 return err;
2351         }
2352
2353         /* Transform to an offset on the set of pages */
2354         if (s->random_seq) {
2355                 unsigned int i;
2356
2357                 for (i = 0; i < count; i++)
2358                         s->random_seq[i] *= s->size;
2359         }
2360         return 0;
2361 }
2362
2363 /* Initialize each random sequence freelist per cache */
2364 static void __init init_freelist_randomization(void)
2365 {
2366         struct kmem_cache *s;
2367
2368         mutex_lock(&slab_mutex);
2369
2370         list_for_each_entry(s, &slab_caches, list)
2371                 init_cache_random_seq(s);
2372
2373         mutex_unlock(&slab_mutex);
2374 }
2375
2376 /* Get the next entry on the pre-computed freelist randomized */
2377 static void *next_freelist_entry(struct kmem_cache *s,
2378                                 unsigned long *pos, void *start,
2379                                 unsigned long page_limit,
2380                                 unsigned long freelist_count)
2381 {
2382         unsigned int idx;
2383
2384         /*
2385          * If the target page allocation failed, the number of objects on the
2386          * page might be smaller than the usual size defined by the cache.
2387          */
2388         do {
2389                 idx = s->random_seq[*pos];
2390                 *pos += 1;
2391                 if (*pos >= freelist_count)
2392                         *pos = 0;
2393         } while (unlikely(idx >= page_limit));
2394
2395         return (char *)start + idx;
2396 }
2397
2398 /* Shuffle the single linked freelist based on a random pre-computed sequence */
2399 static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
2400 {
2401         void *start;
2402         void *cur;
2403         void *next;
2404         unsigned long idx, pos, page_limit, freelist_count;
2405
2406         if (slab->objects < 2 || !s->random_seq)
2407                 return false;
2408
2409         freelist_count = oo_objects(s->oo);
2410         pos = get_random_u32_below(freelist_count);
2411
2412         page_limit = slab->objects * s->size;
2413         start = fixup_red_left(s, slab_address(slab));
2414
2415         /* First entry is used as the base of the freelist */
2416         cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count);
2417         cur = setup_object(s, cur);
2418         slab->freelist = cur;
2419
2420         for (idx = 1; idx < slab->objects; idx++) {
2421                 next = next_freelist_entry(s, &pos, start, page_limit,
2422                         freelist_count);
2423                 next = setup_object(s, next);
2424                 set_freepointer(s, cur, next);
2425                 cur = next;
2426         }
2427         set_freepointer(s, cur, NULL);
2428
2429         return true;
2430 }
2431 #else
2432 static inline int init_cache_random_seq(struct kmem_cache *s)
2433 {
2434         return 0;
2435 }
2436 static inline void init_freelist_randomization(void) { }
2437 static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
2438 {
2439         return false;
2440 }
2441 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
2442
2443 static __always_inline void account_slab(struct slab *slab, int order,
2444                                          struct kmem_cache *s, gfp_t gfp)
2445 {
2446         if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
2447                 alloc_slab_obj_exts(slab, s, gfp, true);
2448
2449         mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
2450                             PAGE_SIZE << order);
2451 }
2452
2453 static __always_inline void unaccount_slab(struct slab *slab, int order,
2454                                            struct kmem_cache *s)
2455 {
2456         if (memcg_kmem_online() || need_slab_obj_ext())
2457                 free_slab_obj_exts(slab);
2458
2459         mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
2460                             -(PAGE_SIZE << order));
2461 }
2462
2463 static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
2464 {
2465         struct slab *slab;
2466         struct kmem_cache_order_objects oo = s->oo;
2467         gfp_t alloc_gfp;
2468         void *start, *p, *next;
2469         int idx;
2470         bool shuffle;
2471
2472         flags &= gfp_allowed_mask;
2473
2474         flags |= s->allocflags;
2475
2476         /*
2477          * Let the initial higher-order allocation fail under memory pressure
2478          * so we fall-back to the minimum order allocation.
2479          */
2480         alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
2481         if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
2482                 alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
2483
2484         slab = alloc_slab_page(alloc_gfp, node, oo);
2485         if (unlikely(!slab)) {
2486                 oo = s->min;
2487                 alloc_gfp = flags;
2488                 /*
2489                  * Allocation may have failed due to fragmentation.
2490                  * Try a lower order alloc if possible
2491                  */
2492                 slab = alloc_slab_page(alloc_gfp, node, oo);
2493                 if (unlikely(!slab))
2494                         return NULL;
2495                 stat(s, ORDER_FALLBACK);
2496         }
2497
2498         slab->objects = oo_objects(oo);
2499         slab->inuse = 0;
2500         slab->frozen = 0;
2501
2502         account_slab(slab, oo_order(oo), s, flags);
2503
2504         slab->slab_cache = s;
2505
2506         kasan_poison_slab(slab);
2507
2508         start = slab_address(slab);
2509
2510         setup_slab_debug(s, slab, start);
2511
2512         shuffle = shuffle_freelist(s, slab);
2513
2514         if (!shuffle) {
2515                 start = fixup_red_left(s, start);
2516                 start = setup_object(s, start);
2517                 slab->freelist = start;
2518                 for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
2519                         next = p + s->size;
2520                         next = setup_object(s, next);
2521                         set_freepointer(s, p, next);
2522                         p = next;
2523                 }
2524                 set_freepointer(s, p, NULL);
2525         }
2526
2527         return slab;
2528 }
2529
2530 static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
2531 {
2532         if (unlikely(flags & GFP_SLAB_BUG_MASK))
2533                 flags = kmalloc_fix_flags(flags);
2534
2535         WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
2536
2537         return allocate_slab(s,
2538                 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
2539 }
2540
2541 static void __free_slab(struct kmem_cache *s, struct slab *slab)
2542 {
2543         struct folio *folio = slab_folio(slab);
2544         int order = folio_order(folio);
2545         int pages = 1 << order;
2546
2547         __slab_clear_pfmemalloc(slab);
2548         folio->mapping = NULL;
2549         /* Make the mapping reset visible before clearing the flag */
2550         smp_wmb();
2551         __folio_clear_slab(folio);
2552         mm_account_reclaimed_pages(pages);
2553         unaccount_slab(slab, order, s);
2554         __free_pages(&folio->page, order);
2555 }
2556
2557 static void rcu_free_slab(struct rcu_head *h)
2558 {
2559         struct slab *slab = container_of(h, struct slab, rcu_head);
2560
2561         __free_slab(slab->slab_cache, slab);
2562 }
2563
2564 static void free_slab(struct kmem_cache *s, struct slab *slab)
2565 {
2566         if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
2567                 void *p;
2568
2569                 slab_pad_check(s, slab);
2570                 for_each_object(p, s, slab_address(slab), slab->objects)
2571                         check_object(s, slab, p, SLUB_RED_INACTIVE);
2572         }
2573
2574         if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
2575                 call_rcu(&slab->rcu_head, rcu_free_slab);
2576         else
2577                 __free_slab(s, slab);
2578 }
2579
2580 static void discard_slab(struct kmem_cache *s, struct slab *slab)
2581 {
2582         dec_slabs_node(s, slab_nid(slab), slab->objects);
2583         free_slab(s, slab);
2584 }
2585
2586 /*
2587  * SLUB reuses PG_workingset bit to keep track of whether it's on
2588  * the per-node partial list.
2589  */
2590 static inline bool slab_test_node_partial(const struct slab *slab)
2591 {
2592         return folio_test_workingset(slab_folio(slab));
2593 }
2594
2595 static inline void slab_set_node_partial(struct slab *slab)
2596 {
2597         set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
2598 }
2599
2600 static inline void slab_clear_node_partial(struct slab *slab)
2601 {
2602         clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
2603 }
2604
2605 /*
2606  * Management of partially allocated slabs.
2607  */
2608 static inline void
2609 __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
2610 {
2611         n->nr_partial++;
2612         if (tail == DEACTIVATE_TO_TAIL)
2613                 list_add_tail(&slab->slab_list, &n->partial);
2614         else
2615                 list_add(&slab->slab_list, &n->partial);
2616         slab_set_node_partial(slab);
2617 }
2618
/*
 * Add @slab to @n's partial list at the position selected by @tail.
 * The caller must hold n->list_lock (asserted via lockdep).
 */
static inline void add_partial(struct kmem_cache_node *n,
                                struct slab *slab, int tail)
{
        lockdep_assert_held(&n->list_lock);
        __add_partial(n, slab, tail);
}
2625
/*
 * Unlink @slab from @n's partial list, clear its on-partial-list marker and
 * decrement the node's partial slab count.
 * The caller must hold n->list_lock (asserted via lockdep).
 */
static inline void remove_partial(struct kmem_cache_node *n,
                                        struct slab *slab)
{
        lockdep_assert_held(&n->list_lock);
        list_del(&slab->slab_list);
        slab_clear_node_partial(slab);
        n->nr_partial--;
}
2634
/*
 * Called only for kmem_cache_debug() caches instead of remove_partial(), with a
 * slab from the n->partial list. Remove only a single object from the slab, do
 * the alloc_debug_processing() checks and leave the slab on the list, or move
 * it to full list if it was the last free object.
 *
 * Must be called with n->list_lock held. Returns the allocated object, or
 * NULL if alloc_debug_processing() rejected it; in that case the slab is
 * taken off the partial list.
 */
static void *alloc_single_from_partial(struct kmem_cache *s,
                struct kmem_cache_node *n, struct slab *slab, int orig_size)
{
        void *object;

        lockdep_assert_held(&n->list_lock);

        /* Pop the first object off the slab's freelist. */
        object = slab->freelist;
        slab->freelist = get_freepointer(s, object);
        slab->inuse++;

        if (!alloc_debug_processing(s, slab, object, orig_size)) {
                /* Checks failed: stop offering this slab from the partial list. */
                remove_partial(n, slab);
                return NULL;
        }

        /* That was the last free object: the slab belongs on the full list now. */
        if (slab->inuse == slab->objects) {
                remove_partial(n, slab);
                add_full(s, n, slab);
        }

        return object;
}
2664
2665 /*
2666  * Called only for kmem_cache_debug() caches to allocate from a freshly
2667  * allocated slab. Allocate a single object instead of whole freelist
2668  * and put the slab to the partial (or full) list.
2669  */
2670 static void *alloc_single_from_new_slab(struct kmem_cache *s,
2671                                         struct slab *slab, int orig_size)
2672 {
2673         int nid = slab_nid(slab);
2674         struct kmem_cache_node *n = get_node(s, nid);
2675         unsigned long flags;
2676         void *object;
2677
2678
2679         object = slab->freelist;
2680         slab->freelist = get_freepointer(s, object);
2681         slab->inuse = 1;
2682
2683         if (!alloc_debug_processing(s, slab, object, orig_size))
2684                 /*
2685                  * It's not really expected that this would fail on a
2686                  * freshly allocated slab, but a concurrent memory
2687                  * corruption in theory could cause that.
2688                  */
2689                 return NULL;
2690
2691         spin_lock_irqsave(&n->list_lock, flags);
2692
2693         if (slab->inuse == slab->objects)
2694                 add_full(s, n, slab);
2695         else
2696                 add_partial(n, slab, DEACTIVATE_TO_HEAD);
2697
2698         inc_slabs_node(s, nid, slab->objects);
2699         spin_unlock_irqrestore(&n->list_lock, flags);
2700
2701         return object;
2702 }
2703
/*
 * Forward declarations for helpers used by get_partial_node() below.
 * Without CONFIG_SLUB_CPU_PARTIAL, put_cpu_partial() is an empty stub.
 */
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
#else
static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
                                   int drain) { }
#endif
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
2711
/*
 * Try to allocate a partial slab from a specific node.
 *
 * Walks @n's partial list under n->list_lock, skipping slabs rejected by
 * pfmemalloc_match() for pc->flags.
 *
 * For CONFIG_SLUB_TINY builds and debug caches, a single object is taken via
 * alloc_single_from_partial() and stored in pc->object; the walk stops at
 * the first slab that yields an object.
 *
 * Otherwise the first acceptable slab is removed from the list and becomes
 * the return value. Further acceptable slabs are removed and handed to
 * put_cpu_partial() until more than slub_get_cpu_partial(s) / 2 of them have
 * been moved (none at all when slub_get_cpu_partial(s) is 0).
 *
 * Returns the selected slab, or NULL if @n is NULL, looks empty, or holds
 * nothing suitable.
 */
static struct slab *get_partial_node(struct kmem_cache *s,
                                     struct kmem_cache_node *n,
                                     struct partial_context *pc)
{
        struct slab *slab, *slab2, *partial = NULL;
        unsigned long flags;
        unsigned int partial_slabs = 0;

        /*
         * Racy check. If we mistakenly see no partial slabs then we
         * just allocate an empty slab. If we mistakenly try to get a
         * partial slab and there is none available then get_partial()
         * will return NULL.
         */
        if (!n || !n->nr_partial)
                return NULL;

        spin_lock_irqsave(&n->list_lock, flags);
        /* _safe iteration: the current slab may be unlinked inside the loop. */
        list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
                if (!pfmemalloc_match(slab, pc->flags))
                        continue;

                if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
                        void *object = alloc_single_from_partial(s, n, slab,
                                                        pc->orig_size);
                        if (object) {
                                partial = slab;
                                pc->object = object;
                                break;
                        }
                        /* No object obtained from this slab; try the next one. */
                        continue;
                }

                remove_partial(n, slab);

                if (!partial) {
                        /* First hit: this slab is the one returned to the caller. */
                        partial = slab;
                        stat(s, ALLOC_FROM_PARTIAL);

                        if ((slub_get_cpu_partial(s) == 0)) {
                                break;
                        }
                } else {
                        /* Additional slabs are stashed via put_cpu_partial(). */
                        put_cpu_partial(s, slab, 0);
                        stat(s, CPU_PARTIAL_NODE);

                        if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
                                break;
                        }
                }
        }
        spin_unlock_irqrestore(&n->list_lock, flags);
        return partial;
}
2769
/*
 * Get a slab from somewhere. Search in increasing NUMA distances.
 *
 * Only does anything with CONFIG_NUMA; otherwise always returns NULL.
 * Walks the zonelist of the node chosen by mempolicy_slab_node() and tries
 * get_partial_node() on every node that the cpuset allows and that holds
 * more than s->min_partial partial slabs. Returns the first slab obtained,
 * or NULL.
 */
static struct slab *get_any_partial(struct kmem_cache *s,
                                    struct partial_context *pc)
{
#ifdef CONFIG_NUMA
        struct zonelist *zonelist;
        struct zoneref *z;
        struct zone *zone;
        enum zone_type highest_zoneidx = gfp_zone(pc->flags);
        struct slab *slab;
        unsigned int cpuset_mems_cookie;

        /*
         * The defrag ratio allows a configuration of the tradeoffs between
         * inter node defragmentation and node local allocations. A lower
         * defrag_ratio increases the tendency to do local allocations
         * instead of attempting to obtain partial slabs from other nodes.
         *
         * If the defrag_ratio is set to 0 then kmalloc() always
         * returns node local objects. If the ratio is higher then kmalloc()
         * may return off node objects because partial slabs are obtained
         * from other nodes and filled up.
         *
         * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
         * (which makes defrag_ratio = 1000) then every (well almost)
         * allocation will first attempt to defrag slab caches on other nodes.
         * This means scanning over all nodes to look for partial slabs which
         * may be expensive if we do it every time we are trying to find a slab
         * with available objects.
         */
        if (!s->remote_node_defrag_ratio ||
                        get_cycles() % 1024 > s->remote_node_defrag_ratio)
                return NULL;

        /* Repeat the scan if read_mems_allowed_retry() reports a change. */
        do {
                cpuset_mems_cookie = read_mems_allowed_begin();
                zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
                for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
                        struct kmem_cache_node *n;

                        n = get_node(s, zone_to_nid(zone));

                        if (n && cpuset_zone_allowed(zone, pc->flags) &&
                                        n->nr_partial > s->min_partial) {
                                slab = get_partial_node(s, n, pc);
                                if (slab) {
                                        /*
                                         * Don't check read_mems_allowed_retry()
                                         * here - if mems_allowed was updated in
                                         * parallel, that was a harmless race
                                         * between allocation and the cpuset
                                         * update
                                         */
                                        return slab;
                                }
                        }
                }
        } while (read_mems_allowed_retry(cpuset_mems_cookie));
#endif  /* CONFIG_NUMA */
        return NULL;
}
2833
2834 /*
2835  * Get a partial slab, lock it and return it.
2836  */
2837 static struct slab *get_partial(struct kmem_cache *s, int node,
2838                                 struct partial_context *pc)
2839 {
2840         struct slab *slab;
2841         int searchnode = node;
2842
2843         if (node == NUMA_NO_NODE)
2844                 searchnode = numa_mem_id();
2845
2846         slab = get_partial_node(s, get_node(s, searchnode), pc);
2847         if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
2848                 return slab;
2849
2850         return get_any_partial(s, pc);
2851 }
2852
2853 #ifndef CONFIG_SLUB_TINY
2854
#ifdef CONFIG_PREEMPTION
/*
 * Transaction ids must be globally unique so that a cmpxchg can tell them
 * apart. They start at the cpu number and advance in steps of
 * CONFIG_NR_CPUS rounded up to a power of two.
 */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
/*
 * Without preemption there is no need to distinguish cpus, so a step of
 * one is sufficient.
 */
#define TID_STEP 1
#endif /* CONFIG_PREEMPTION */

/* Return the transaction id that follows @tid. */
static inline unsigned long next_tid(unsigned long tid)
{
        tid += TID_STEP;

        return tid;
}
2874
2875 #ifdef SLUB_DEBUG_CMPXCHG
2876 static inline unsigned int tid_to_cpu(unsigned long tid)
2877 {
2878         return tid % TID_STEP;
2879 }
2880
2881 static inline unsigned long tid_to_event(unsigned long tid)
2882 {
2883         return tid / TID_STEP;
2884 }
2885 #endif
2886
/* Initial transaction id for @cpu: simply the cpu number. */
static inline unsigned int init_tid(int cpu)
{
        unsigned int first_tid = cpu;

        return first_tid;
}
2891
/*
 * Account a failed per-cpu cmpxchg for cache @s (CMPXCHG_DOUBLE_CPU_FAIL).
 *
 * When SLUB_DEBUG_CMPXCHG is defined, additionally log why the transaction
 * id @tid no longer matches the current per-cpu tid: the cpu part changed
 * (CONFIG_PREEMPTION only), the event part changed, or neither. @n is a
 * caller-supplied label that prefixes the message.
 */
static inline void note_cmpxchg_failure(const char *n,
                const struct kmem_cache *s, unsigned long tid)
{
#ifdef SLUB_DEBUG_CMPXCHG
        unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);

        pr_info("%s %s: cmpxchg redo ", n, s->name);

#ifdef CONFIG_PREEMPTION
        if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
                pr_warn("due to cpu change %d -> %d\n",
                        tid_to_cpu(tid), tid_to_cpu(actual_tid));
        else
#endif
        /* With CONFIG_PREEMPTION this 'if' is the body of the 'else' above. */
        if (tid_to_event(tid) != tid_to_event(actual_tid))
                pr_warn("due to cpu running other code. Event %ld->%ld\n",
                        tid_to_event(tid), tid_to_event(actual_tid));
        else
                pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
                        actual_tid, tid, next_tid(tid));
#endif
        stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}
2915
2916 static void init_kmem_cache_cpus(struct kmem_cache *s)
2917 {
2918         int cpu;
2919         struct kmem_cache_cpu *c;
2920
2921         for_each_possible_cpu(cpu) {
2922                 c = per_cpu_ptr(s->cpu_slab, cpu);
2923                 local_lock_init(&c->lock);
2924                 c->tid = init_tid(cpu);
2925         }
2926 }
2927
/*
 * Finishes removing the cpu slab. Merges the cpu's freelist with the slab's
 * freelist, unfreezes the slab and puts it on the proper list.
 * Assumes the slab has been already safely taken away from kmem_cache_cpu
 * by the caller.
 *
 * @freelist is the per-cpu freelist that belonged to @slab (may be NULL).
 */
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
                            void *freelist)
{
        struct kmem_cache_node *n = get_node(s, slab_nid(slab));
        int free_delta = 0;
        void *nextfree, *freelist_iter, *freelist_tail;
        int tail = DEACTIVATE_TO_HEAD;
        unsigned long flags = 0;
        struct slab new;
        struct slab old;

        /* A non-empty slab freelist is counted as remote frees; queue at the tail. */
        if (READ_ONCE(slab->freelist)) {
                stat(s, DEACTIVATE_REMOTE_FREES);
                tail = DEACTIVATE_TO_TAIL;
        }

        /*
         * Stage one: Count the objects on cpu's freelist as free_delta and
         * remember the last object in freelist_tail for later splicing.
         */
        freelist_tail = NULL;
        freelist_iter = freelist;
        while (freelist_iter) {
                nextfree = get_freepointer(s, freelist_iter);

                /*
                 * If 'nextfree' is invalid, it is possible that the object at
                 * 'freelist_iter' is already corrupted.  So isolate all objects
                 * starting at 'freelist_iter' by skipping them.
                 */
                if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
                        break;

                freelist_tail = freelist_iter;
                free_delta++;

                freelist_iter = nextfree;
        }

        /*
         * Stage two: Unfreeze the slab while splicing the per-cpu
         * freelist to the head of slab's freelist.
         */
        do {
                old.freelist = READ_ONCE(slab->freelist);
                old.counters = READ_ONCE(slab->counters);
                VM_BUG_ON(!old.frozen);

                /*
                 * Determine target state of the slab.
                 * NOTE(review): frozen/inuse are adjusted after copying
                 * counters, and only counters is handed to the update below,
                 * so they presumably share storage in struct slab - confirm
                 * against the struct definition.
                 */
                new.counters = old.counters;
                new.frozen = 0;
                if (freelist_tail) {
                        new.inuse -= free_delta;
                        set_freepointer(s, freelist_tail, old.freelist);
                        new.freelist = freelist;
                } else {
                        new.freelist = old.freelist;
                }
        } while (!slab_update_freelist(s, slab,
                old.freelist, old.counters,
                new.freelist, new.counters,
                "unfreezing slab"));

        /*
         * Stage three: Manipulate the slab list based on the updated state.
         */
        if (!new.inuse && n->nr_partial >= s->min_partial) {
                /* Empty slab and the node has enough partial slabs: free it. */
                stat(s, DEACTIVATE_EMPTY);
                discard_slab(s, slab);
                stat(s, FREE_SLAB);
        } else if (new.freelist) {
                spin_lock_irqsave(&n->list_lock, flags);
                add_partial(n, slab, tail);
                spin_unlock_irqrestore(&n->list_lock, flags);
                /* 'tail' doubles as the statistics item to bump. */
                stat(s, tail);
        } else {
                /* No free objects left: nothing to add to the partial list. */
                stat(s, DEACTIVATE_FULL);
        }
}
3013
3014 #ifdef CONFIG_SLUB_CPU_PARTIAL
/*
 * Move every slab on the ->next chain starting at @partial_slab to the
 * partial list of its node, or discard it.
 *
 * A slab with no objects in use is discarded when its node already holds at
 * least s->min_partial partial slabs. Discarding is deferred until all list
 * locks have been dropped.
 */
static void __put_partials(struct kmem_cache *s, struct slab *partial_slab)
{
        struct kmem_cache_node *n = NULL, *n2 = NULL;
        struct slab *slab, *slab_to_discard = NULL;
        unsigned long flags = 0;

        while (partial_slab) {
                slab = partial_slab;
                partial_slab = slab->next;

                /* Switch list_lock only when the slab's node differs from the previous one. */
                n2 = get_node(s, slab_nid(slab));
                if (n != n2) {
                        if (n)
                                spin_unlock_irqrestore(&n->list_lock, flags);

                        n = n2;
                        spin_lock_irqsave(&n->list_lock, flags);
                }

                if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) {
                        /* Chain empty slabs through ->next for freeing below. */
                        slab->next = slab_to_discard;
                        slab_to_discard = slab;
                } else {
                        add_partial(n, slab, DEACTIVATE_TO_TAIL);
                        stat(s, FREE_ADD_PARTIAL);
                }
        }

        if (n)
                spin_unlock_irqrestore(&n->list_lock, flags);

        /* All locks dropped: now actually free the collected empty slabs. */
        while (slab_to_discard) {
                slab = slab_to_discard;
                slab_to_discard = slab_to_discard->next;

                stat(s, DEACTIVATE_EMPTY);
                discard_slab(s, slab);
                stat(s, FREE_SLAB);
        }
}
3055
3056 /*
3057  * Put all the cpu partial slabs to the node partial list.
3058  */
3059 static void put_partials(struct kmem_cache *s)
3060 {
3061         struct slab *partial_slab;
3062         unsigned long flags;
3063
3064         local_lock_irqsave(&s->cpu_slab->lock, flags);
3065         partial_slab = this_cpu_read(s->cpu_slab->partial);
3066         this_cpu_write(s->cpu_slab->partial, NULL);
3067         local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3068
3069         if (partial_slab)
3070                 __put_partials(s, partial_slab);
3071 }
3072
3073 static void put_partials_cpu(struct kmem_cache *s,
3074                              struct kmem_cache_cpu *c)
3075 {
3076         struct slab *partial_slab;
3077
3078         partial_slab = slub_percpu_partial(c);
3079         c->partial = NULL;
3080
3081         if (partial_slab)
3082                 __put_partials(s, partial_slab);
3083 }
3084
/*
 * Put a slab into a partial slab slot if available.
 *
 * If we did not find a slot then simply move all the partials to the
 * per node partial list.
 *
 * @slab becomes the new head of the current cpu's partial chain. When @drain
 * is nonzero and the existing chain already holds at least
 * s->cpu_partial_slabs slabs, that chain is detached and handed to
 * __put_partials() once the local lock has been released.
 */
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
{
        struct slab *oldslab;
        struct slab *slab_to_put = NULL;
        unsigned long flags;
        int slabs = 0;

        local_lock_irqsave(&s->cpu_slab->lock, flags);

        oldslab = this_cpu_read(s->cpu_slab->partial);

        if (oldslab) {
                if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
                        /*
                         * Partial array is full. Move the existing set to the
                         * per node partial list. Postpone the actual unfreezing
                         * outside of the critical section.
                         */
                        slab_to_put = oldslab;
                        oldslab = NULL;
                } else {
                        slabs = oldslab->slabs;
                }
        }

        /* Count the slab being added; the head slab carries the chain length. */
        slabs++;

        slab->slabs = slabs;
        slab->next = oldslab;

        this_cpu_write(s->cpu_slab->partial, slab);

        local_unlock_irqrestore(&s->cpu_slab->lock, flags);

        if (slab_to_put) {
                __put_partials(s, slab_to_put);
                stat(s, CPU_PARTIAL_DRAIN);
        }
}
3130
3131 #else   /* CONFIG_SLUB_CPU_PARTIAL */
3132
/* Without CONFIG_SLUB_CPU_PARTIAL there are no cpu partial lists to drain. */
static inline void put_partials(struct kmem_cache *s) { }
static inline void put_partials_cpu(struct kmem_cache *s,
                                    struct kmem_cache_cpu *c) { }
3136
3137 #endif  /* CONFIG_SLUB_CPU_PARTIAL */
3138
3139 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
3140 {
3141         unsigned long flags;
3142         struct slab *slab;
3143         void *freelist;
3144
3145         local_lock_irqsave(&s->cpu_slab->lock, flags);
3146
3147         slab = c->slab;
3148         freelist = c->freelist;
3149
3150         c->slab = NULL;
3151         c->freelist = NULL;
3152         c->tid = next_tid(c->tid);
3153
3154         local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3155
3156         if (slab) {
3157                 deactivate_slab(s, slab, freelist);
3158                 stat(s, CPUSLAB_FLUSH);
3159         }
3160 }
3161
3162 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
3163 {
3164         struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3165         void *freelist = c->freelist;
3166         struct slab *slab = c->slab;
3167
3168         c->slab = NULL;
3169         c->freelist = NULL;
3170         c->tid = next_tid(c->tid);
3171
3172         if (slab) {
3173                 deactivate_slab(s, slab, freelist);
3174                 stat(s, CPUSLAB_FLUSH);
3175         }
3176
3177         put_partials_cpu(s, c);
3178 }
3179
/* Per-cpu work item used by flush_all_cpus_locked() to flush one cache. */
struct slub_flush_work {
        struct work_struct work;        /* runs flush_cpu_slab() on the target cpu */
        struct kmem_cache *s;           /* cache whose per-cpu state is flushed */
        bool skip;                      /* true if nothing was queued for this cpu */
};
3185
3186 /*
3187  * Flush cpu slab.
3188  *
3189  * Called from CPU work handler with migration disabled.
3190  */
3191 static void flush_cpu_slab(struct work_struct *w)
3192 {
3193         struct kmem_cache *s;
3194         struct kmem_cache_cpu *c;
3195         struct slub_flush_work *sfw;
3196
3197         sfw = container_of(w, struct slub_flush_work, work);
3198
3199         s = sfw->s;
3200         c = this_cpu_ptr(s->cpu_slab);
3201
3202         if (c->slab)
3203                 flush_slab(s, c);
3204
3205         put_partials(s);
3206 }
3207
3208 static bool has_cpu_slab(int cpu, struct kmem_cache *s)
3209 {
3210         struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3211
3212         return c->slab || slub_percpu_partial(c);
3213 }
3214
/* Held by flush_all_cpus_locked() while it uses the per-cpu work items below. */
static DEFINE_MUTEX(flush_lock);
/* One flush work item per cpu, (re)initialised on every flush. */
static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
3217
/*
 * Flush the per-cpu slabs of @s on all online cpus and wait for completion.
 *
 * The caller must hold the cpu hotplug lock (asserted via lockdep).
 * flush_lock is held for the duration because the per-cpu slub_flush work
 * items are shared.
 */
static void flush_all_cpus_locked(struct kmem_cache *s)
{
        struct slub_flush_work *sfw;
        unsigned int cpu;

        lockdep_assert_cpus_held();
        mutex_lock(&flush_lock);

        /* Pass one: queue work on every cpu that has something to flush. */
        for_each_online_cpu(cpu) {
                sfw = &per_cpu(slub_flush, cpu);
                if (!has_cpu_slab(cpu, s)) {
                        sfw->skip = true;
                        continue;
                }
                INIT_WORK(&sfw->work, flush_cpu_slab);
                sfw->skip = false;
                sfw->s = s;
                queue_work_on(cpu, flushwq, &sfw->work);
        }

        /* Pass two: wait for every work item queued above. */
        for_each_online_cpu(cpu) {
                sfw = &per_cpu(slub_flush, cpu);
                if (sfw->skip)
                        continue;
                flush_work(&sfw->work);
        }

        mutex_unlock(&flush_lock);
}
3247
/* Flush the per-cpu slabs of @s on all online cpus, taking the cpu hotplug read lock. */
static void flush_all(struct kmem_cache *s)
{
        cpus_read_lock();
        flush_all_cpus_locked(s);
        cpus_read_unlock();
}
3254
3255 /*
3256  * Use the cpu notifier to insure that the cpu slabs are flushed when
3257  * necessary.
3258  */
3259 static int slub_cpu_dead(unsigned int cpu)
3260 {
3261         struct kmem_cache *s;
3262
3263         mutex_lock(&slab_mutex);
3264         list_for_each_entry(s, &slab_caches, list)
3265                 __flush_cpu_slab(s, cpu);
3266         mutex_unlock(&slab_mutex);
3267         return 0;
3268 }
3269
3270 #else /* CONFIG_SLUB_TINY */
/* CONFIG_SLUB_TINY: the flush helpers are no-ops; slub_cpu_dead() reports success. */
static inline void flush_all_cpus_locked(struct kmem_cache *s) { }
static inline void flush_all(struct kmem_cache *s) { }
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
static inline int slub_cpu_dead(unsigned int cpu) { return 0; }
3275 #endif /* CONFIG_SLUB_TINY */
3276
/*
 * Check if the objects in a per cpu structure fit numa
 * locality expectations.
 *
 * Returns 1 when @node is NUMA_NO_NODE or equals the node of @slab, else 0.
 * Without CONFIG_NUMA every slab matches.
 */
static inline int node_match(struct slab *slab, int node)
{
#ifdef CONFIG_NUMA
        return node == NUMA_NO_NODE || slab_nid(slab) == node;
#else
        return 1;
#endif
}
3289
3290 #ifdef CONFIG_SLUB_DEBUG
3291 static int count_free(struct slab *slab)
3292 {
3293         return slab->objects - slab->inuse;
3294 }
3295
3296 static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
3297 {
3298         return atomic_long_read(&n->total_objects);
3299 }
3300
/*
 * Supports checking bulk free of a constructed freelist.
 *
 * Walks the objects from @head to @tail, at most *@bulk_cnt of them, running
 * the consistency checks and updating the free tracking when the matching
 * cache flags are set, and re-initialising each object as inactive.
 *
 * If the number of objects walked differs from *@bulk_cnt, an error is
 * reported and *@bulk_cnt is overwritten with the walked count.
 *
 * Returns true if @tail was reached with all checks passing, false otherwise.
 */
static inline bool free_debug_processing(struct kmem_cache *s,
        struct slab *slab, void *head, void *tail, int *bulk_cnt,
        unsigned long addr, depot_stack_handle_t handle)
{
        bool checks_ok = false;
        void *object = head;
        int cnt = 0;

        if (s->flags & SLAB_CONSISTENCY_CHECKS) {
                if (!check_slab(s, slab))
                        goto out;
        }

        /* Cannot free more objects than the slab has allocated. */
        if (slab->inuse < *bulk_cnt) {
                slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
                         slab->inuse, *bulk_cnt);
                goto out;
        }

next_object:

        /* The list is longer than announced: stop and report the mismatch. */
        if (++cnt > *bulk_cnt)
                goto out_cnt;

        if (s->flags & SLAB_CONSISTENCY_CHECKS) {
                if (!free_consistency_checks(s, slab, object, addr))
                        goto out;
        }

        if (s->flags & SLAB_STORE_USER)
                set_track_update(s, object, TRACK_FREE, addr, handle);
        trace(s, slab, object, 0);
        /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
        init_object(s, object, SLUB_RED_INACTIVE);

        /* Reached end of constructed freelist yet? */
        if (object != tail) {
                object = get_freepointer(s, object);
                goto next_object;
        }
        checks_ok = true;

out_cnt:
        if (cnt != *bulk_cnt) {
                slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
                         *bulk_cnt, cnt);
                *bulk_cnt = cnt;
        }

out:

        if (!checks_ok)
                slab_fix(s, "Object at 0x%p not freed", object);

        return checks_ok;
}
3358 #endif /* CONFIG_SLUB_DEBUG */
3359
#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
/*
 * Sum @get_count() over every slab on @n's partial list, holding
 * n->list_lock for the whole walk.
 */
static unsigned long count_partial(struct kmem_cache_node *n,
                                        int (*get_count)(struct slab *))
{
        unsigned long irq_flags;
        unsigned long total = 0;
        struct slab *cur;

        spin_lock_irqsave(&n->list_lock, irq_flags);
        list_for_each_entry(cur, &n->partial, slab_list) {
                total += get_count(cur);
        }
        spin_unlock_irqrestore(&n->list_lock, irq_flags);

        return total;
}
#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
3375
3376 #ifdef CONFIG_SLUB_DEBUG
3377 #define MAX_PARTIAL_TO_SCAN 10000
3378
3379 static unsigned long count_partial_free_approx(struct kmem_cache_node *n)
3380 {
3381         unsigned long flags;
3382         unsigned long x = 0;
3383         struct slab *slab;
3384
3385         spin_lock_irqsave(&n->list_lock, flags);
3386         if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) {
3387                 list_for_each_entry(slab, &n->partial, slab_list)
3388                         x += slab->objects - slab->inuse;
3389         } else {
3390                 /*
3391                  * For a long list, approximate the total count of objects in
3392                  * it to meet the limit on the number of slabs to scan.
3393                  * Scan from both the list's head and tail for better accuracy.
3394                  */
3395                 unsigned long scanned = 0;
3396
3397                 list_for_each_entry(slab, &n->partial, slab_list) {
3398                         x += slab->objects - slab->inuse;
3399                         if (++scanned == MAX_PARTIAL_TO_SCAN / 2)
3400                                 break;
3401                 }
3402                 list_for_each_entry_reverse(slab, &n->partial, slab_list) {
3403                         x += slab->objects - slab->inuse;
3404                         if (++scanned == MAX_PARTIAL_TO_SCAN)
3405                                 break;
3406                 }
3407                 x = mult_frac(x, n->nr_partial, scanned);
3408                 x = min(x, node_nr_objs(n));
3409         }
3410         spin_unlock_irqrestore(&n->list_lock, flags);
3411         return x;
3412 }
3413
3414 static noinline void
3415 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
3416 {
3417         static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
3418                                       DEFAULT_RATELIMIT_BURST);
3419         int node;
3420         struct kmem_cache_node *n;
3421
3422         if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
3423                 return;
3424
3425         pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
3426                 nid, gfpflags, &gfpflags);
3427         pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
3428                 s->name, s->object_size, s->size, oo_order(s->oo),
3429                 oo_order(s->min));
3430
3431         if (oo_order(s->min) > get_order(s->object_size))
3432                 pr_warn("  %s debugging increased min order, use slab_debug=O to disable.\n",
3433                         s->name);
3434
3435         for_each_kmem_cache_node(s, node, n) {
3436                 unsigned long nr_slabs;
3437                 unsigned long nr_objs;
3438                 unsigned long nr_free;
3439
3440                 nr_free  = count_partial_free_approx(n);
3441                 nr_slabs = node_nr_slabs(n);
3442                 nr_objs  = node_nr_objs(n);
3443
3444                 pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
3445                         node, nr_slabs, nr_objs, nr_free);
3446         }
3447 }
3448 #else /* CONFIG_SLUB_DEBUG */
3449 static inline void
3450 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
3451 #endif
3452
3453 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
3454 {
3455         if (unlikely(slab_test_pfmemalloc(slab)))
3456                 return gfp_pfmemalloc_allowed(gfpflags);
3457
3458         return true;
3459 }
3460
3461 #ifndef CONFIG_SLUB_TINY
3462 static inline bool
3463 __update_cpu_freelist_fast(struct kmem_cache *s,
3464                            void *freelist_old, void *freelist_new,
3465                            unsigned long tid)
3466 {
3467         freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
3468         freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
3469
3470         return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
3471                                              &old.full, new.full);
3472 }
3473
3474 /*
3475  * Check the slab->freelist and either transfer the freelist to the
3476  * per cpu freelist or deactivate the slab.
3477  *
3478  * The slab is still frozen if the return value is not NULL.
3479  *
3480  * If this function returns NULL then the slab has been unfrozen.
3481  */
3482 static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
3483 {
3484         struct slab new;
3485         unsigned long counters;
3486         void *freelist;
3487
3488         lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
3489
3490         do {
3491                 freelist = slab->freelist;
3492                 counters = slab->counters;
3493
3494                 new.counters = counters;
3495
3496                 new.inuse = slab->objects;
3497                 new.frozen = freelist != NULL;
3498
3499         } while (!__slab_update_freelist(s, slab,
3500                 freelist, counters,
3501                 NULL, new.counters,
3502                 "get_freelist"));
3503
3504         return freelist;
3505 }
3506
3507 /*
3508  * Freeze the partial slab and return the pointer to the freelist.
3509  */
3510 static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
3511 {
3512         struct slab new;
3513         unsigned long counters;
3514         void *freelist;
3515
3516         do {
3517                 freelist = slab->freelist;
3518                 counters = slab->counters;
3519
3520                 new.counters = counters;
3521                 VM_BUG_ON(new.frozen);
3522
3523                 new.inuse = slab->objects;
3524                 new.frozen = 1;
3525
3526         } while (!slab_update_freelist(s, slab,
3527                 freelist, counters,
3528                 NULL, new.counters,
3529                 "freeze_slab"));
3530
3531         return freelist;
3532 }
3533
/*
 * Slow path. The lockless freelist is empty or we need to perform
 * debugging duties.
 *
 * Processing is still very fast if new objects have been freed to the
 * regular freelist. In that case we simply take over the regular freelist
 * as the lockless freelist and zap the regular freelist.
 *
 * If that is not working then we fall back to the partial lists. We take the
 * first element of the freelist as the object to allocate now and move the
 * rest of the freelist to the lockless freelist.
 *
 * And if we were unable to get a new slab from the partial slab lists then
 * we need to allocate a new slab. This is the slowest path since it involves
 * a call to the page allocator and the setup of a new slab.
 *
 * Version of __slab_alloc to use when we know that preemption is
 * already disabled (which is the case for bulk allocation).
 *
 * @s:         cache to allocate from
 * @gfpflags:  allocation flags of the request
 * @node:      requested node, or NUMA_NO_NODE; reset to NUMA_NO_NODE when the
 *             requested node is not set in slab_nodes
 * @addr:      address recorded by set_track() for SLAB_STORE_USER caches
 * @c:         per cpu structure of @s; re-fetched after new_slab()
 * @orig_size: size handed on to get_partial() (through the partial context)
 *             and to alloc_single_from_new_slab()
 *
 * Returns the allocated object, or NULL when new_slab() failed and no retry
 * is left.
 */
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                          unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
        void *freelist;
        struct slab *slab;
        unsigned long flags;
        struct partial_context pc;
        /* cleared once the node-local attempt in new_objects has failed */
        bool try_thisnode = true;

        stat(s, ALLOC_SLOWPATH);

        /* (Re)start from whatever slab is currently installed on this cpu. */
reread_slab:

        slab = READ_ONCE(c->slab);
        if (!slab) {
                /*
                 * if the node is not online or has no normal memory, just
                 * ignore the node constraint
                 */
                if (unlikely(node != NUMA_NO_NODE &&
                             !node_isset(node, slab_nodes)))
                        node = NUMA_NO_NODE;
                goto new_slab;
        }

        if (unlikely(!node_match(slab, node))) {
                /*
                 * same as above but node_match() being false already
                 * implies node != NUMA_NO_NODE
                 */
                if (!node_isset(node, slab_nodes)) {
                        node = NUMA_NO_NODE;
                } else {
                        stat(s, ALLOC_NODE_MISMATCH);
                        goto deactivate_slab;
                }
        }

        /*
         * By rights, we should be searching for a slab page that was
         * PFMEMALLOC but right now, we are losing the pfmemalloc
         * information when the page leaves the per-cpu allocator
         */
        if (unlikely(!pfmemalloc_match(slab, gfpflags)))
                goto deactivate_slab;

        /* must check again c->slab in case we got preempted and it changed */
        local_lock_irqsave(&s->cpu_slab->lock, flags);
        if (unlikely(slab != c->slab)) {
                local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                goto reread_slab;
        }
        freelist = c->freelist;
        if (freelist)
                goto load_freelist;

        /* The per cpu freelist is empty: try to take over slab->freelist. */
        freelist = get_freelist(s, slab);

        if (!freelist) {
                /* Nothing left in this slab; get_freelist() has unfrozen it. */
                c->slab = NULL;
                c->tid = next_tid(c->tid);
                local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                stat(s, DEACTIVATE_BYPASS);
                goto new_slab;
        }

        stat(s, ALLOC_REFILL);

load_freelist:

        lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

        /*
         * freelist is pointing to the list of objects to be used.
         * slab is pointing to the slab from which the objects are obtained.
         * That slab must be frozen for per cpu allocations to work.
         */
        VM_BUG_ON(!c->slab->frozen);
        /* Return the first object; the rest becomes the per cpu freelist. */
        c->freelist = get_freepointer(s, freelist);
        c->tid = next_tid(c->tid);
        local_unlock_irqrestore(&s->cpu_slab->lock, flags);
        return freelist;

        /*
         * The cpu slab cannot be used for this request (node or pfmemalloc
         * mismatch): detach it from the cpu and pass it, together with the
         * remaining per cpu freelist, to deactivate_slab().
         */
deactivate_slab:

        local_lock_irqsave(&s->cpu_slab->lock, flags);
        if (slab != c->slab) {
                local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                goto reread_slab;
        }
        freelist = c->freelist;
        c->slab = NULL;
        c->freelist = NULL;
        c->tid = next_tid(c->tid);
        local_unlock_irqrestore(&s->cpu_slab->lock, flags);
        deactivate_slab(s, slab, freelist);

        /* There is no cpu slab now; look at the per cpu partial list first. */
new_slab:

#ifdef CONFIG_SLUB_CPU_PARTIAL
        while (slub_percpu_partial(c)) {
                local_lock_irqsave(&s->cpu_slab->lock, flags);
                if (unlikely(c->slab)) {
                        local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                        goto reread_slab;
                }
                if (unlikely(!slub_percpu_partial(c))) {
                        local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                        /* we were preempted and partial list got empty */
                        goto new_objects;
                }

                slab = slub_percpu_partial(c);
                slub_set_percpu_partial(c, slab);

                if (likely(node_match(slab, node) &&
                           pfmemalloc_match(slab, gfpflags))) {
                        c->slab = slab;
                        freelist = get_freelist(s, slab);
                        VM_BUG_ON(!freelist);
                        stat(s, CPU_PARTIAL_ALLOC);
                        goto load_freelist;
                }

                local_unlock_irqrestore(&s->cpu_slab->lock, flags);

                /* Mismatching slab: hand it to __put_partials() on its own. */
                slab->next = NULL;
                __put_partials(s, slab);
        }
#endif

new_objects:

        pc.flags = gfpflags;
        /*
         * When a preferred node is indicated but no __GFP_THISNODE
         *
         * 1) try to get a partial slab from target node only by having
         *    __GFP_THISNODE in pc.flags for get_partial()
         * 2) if 1) failed, try to allocate a new slab from target node with
         *    GFP_NOWAIT | __GFP_THISNODE opportunistically
         * 3) if 2) failed, retry with original gfpflags which will allow
         *    get_partial() try partial lists of other nodes before potentially
         *    allocating new page from other nodes
         */
        if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
                     && try_thisnode))
                pc.flags = GFP_NOWAIT | __GFP_THISNODE;

        pc.orig_size = orig_size;
        slab = get_partial(s, node, &pc);
        if (slab) {
                if (kmem_cache_debug(s)) {
                        freelist = pc.object;
                        /*
                         * For debug caches here we had to go through
                         * alloc_single_from_partial() so just store the
                         * tracking info and return the object.
                         */
                        if (s->flags & SLAB_STORE_USER)
                                set_track(s, freelist, TRACK_ALLOC, addr);

                        return freelist;
                }

                freelist = freeze_slab(s, slab);
                goto retry_load_slab;
        }

        /*
         * The cpu pointer is released around new_slab() and c is fetched
         * again afterwards.
         * NOTE(review): presumably because the task may migrate while
         * new_slab() runs - confirm against slub_get_cpu_ptr().
         */
        slub_put_cpu_ptr(s->cpu_slab);
        slab = new_slab(s, pc.flags, node);
        c = slub_get_cpu_ptr(s->cpu_slab);

        if (unlikely(!slab)) {
                /* Step 3 above: retry once with the caller's own flags. */
                if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
                    && try_thisnode) {
                        try_thisnode = false;
                        goto new_objects;
                }
                slab_out_of_memory(s, gfpflags, node);
                return NULL;
        }

        stat(s, ALLOC_SLAB);

        if (kmem_cache_debug(s)) {
                freelist = alloc_single_from_new_slab(s, slab, orig_size);

                if (unlikely(!freelist))
                        goto new_objects;

                if (s->flags & SLAB_STORE_USER)
                        set_track(s, freelist, TRACK_ALLOC, addr);

                return freelist;
        }

        /*
         * No other reference to the slab yet so we can
         * muck around with it freely without cmpxchg
         */
        freelist = slab->freelist;
        slab->freelist = NULL;
        slab->inuse = slab->objects;
        slab->frozen = 1;

        inc_slabs_node(s, slab_nid(slab), slab->objects);

        if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
                /*
                 * For !pfmemalloc_match() case we don't load freelist so that
                 * we don't make further mismatched allocations easier.
                 */
                deactivate_slab(s, slab, get_freepointer(s, freelist));
                return freelist;
        }

        /*
         * Install slab as the cpu slab. If another slab has been installed
         * in the meantime, flush that one first and try again.
         */
retry_load_slab:

        local_lock_irqsave(&s->cpu_slab->lock, flags);
        if (unlikely(c->slab)) {
                void *flush_freelist = c->freelist;
                struct slab *flush_slab = c->slab;

                c->slab = NULL;
                c->freelist = NULL;
                c->tid = next_tid(c->tid);

                local_unlock_irqrestore(&s->cpu_slab->lock, flags);

                deactivate_slab(s, flush_slab, flush_freelist);

                stat(s, CPUSLAB_FLUSH);

                goto retry_load_slab;
        }
        c->slab = slab;

        goto load_freelist;
}
3793
3794 /*
3795  * A wrapper for ___slab_alloc() for contexts where preemption is not yet
3796  * disabled. Compensates for possible cpu changes by refetching the per cpu area
3797  * pointer.
3798  */
3799 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
3800                           unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
3801 {
3802         void *p;
3803
3804 #ifdef CONFIG_PREEMPT_COUNT
3805         /*
3806          * We may have been preempted and rescheduled on a different
3807          * cpu before disabling preemption. Need to reload cpu area
3808          * pointer.
3809          */
3810         c = slub_get_cpu_ptr(s->cpu_slab);
3811 #endif
3812
3813         p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
3814 #ifdef CONFIG_PREEMPT_COUNT
3815         slub_put_cpu_ptr(s->cpu_slab);
3816 #endif
3817         return p;
3818 }
3819
/*
 * Allocate one object from @s, preferring the lockless per cpu freelist.
 *
 * The slow path (__slab_alloc()) is taken when USE_LOCKLESS_FAST_PATH() is
 * false, when the per cpu freelist or the cpu slab is missing, or when the
 * cpu slab does not match @node. Otherwise the first object is popped off
 * the per cpu freelist with a cmpxchg that is retried until it succeeds.
 *
 * Returns the object taken from the freelist, or the result of
 * __slab_alloc().
 */
static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
                gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
        struct kmem_cache_cpu *c;
        struct slab *slab;
        unsigned long tid;
        void *object;

redo:
        /*
         * Must read kmem_cache cpu data via this cpu ptr. Preemption is
         * enabled. We may switch back and forth between cpus while
         * reading from one cpu area. That does not matter as long
         * as we end up on the original cpu again when doing the cmpxchg.
         *
         * We must guarantee that tid and kmem_cache_cpu are retrieved on the
         * same cpu. We first read the kmem_cache_cpu pointer and use it to
         * read the tid. If we are preempted and switched to another cpu
         * between the two reads, it's OK as the two are still associated with
         * the same cpu and the cmpxchg later will validate the cpu.
         */
        c = raw_cpu_ptr(s->cpu_slab);
        tid = READ_ONCE(c->tid);

        /*
         * The irqless object alloc/free algorithm used here depends on the
         * order in which cpu_slab's data is fetched. tid must be fetched
         * before anything else on c to guarantee that an object and slab
         * associated with a previous tid won't be used with the current tid.
         * If we fetch tid first, the object and slab could be ones associated
         * with the next tid and our alloc/free request will fail. In this
         * case, we will retry. So, no problem.
         */
        barrier();

        /*
         * The transaction ids are globally unique per cpu and per operation on
         * a per cpu queue. Thus they guarantee that the cmpxchg_double
         * occurs on the right processor and that there was no operation on the
         * linked list in between.
         */

        object = c->freelist;
        slab = c->slab;

        if (!USE_LOCKLESS_FAST_PATH() ||
            unlikely(!object || !slab || !node_match(slab, node))) {
                object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
        } else {
                void *next_object = get_freepointer_safe(s, object);

                /*
                 * The cmpxchg will only match if there was no additional
                 * operation and if we are on the right processor.
                 *
                 * The cmpxchg does the following atomically (without lock
                 * semantics!)
                 * 1. Relocate first pointer to the current per cpu area.
                 * 2. Verify that tid and freelist have not been changed
                 * 3. If they were not changed replace tid and freelist
                 *
                 * Since this is without lock semantics the protection is only
                 * against code executing on this cpu *not* from access by
                 * other cpus.
                 */
                if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
                        note_cmpxchg_failure("slab_alloc", s, tid);
                        goto redo;
                }
                prefetch_freepointer(s, next_object);
                stat(s, ALLOC_FASTPATH);
        }

        return object;
}
3894 #else /* CONFIG_SLUB_TINY */
3895 static void *__slab_alloc_node(struct kmem_cache *s,
3896                 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
3897 {
3898         struct partial_context pc;
3899         struct slab *slab;
3900         void *object;
3901
3902         pc.flags = gfpflags;
3903         pc.orig_size = orig_size;
3904         slab = get_partial(s, node, &pc);
3905
3906         if (slab)
3907                 return pc.object;
3908
3909         slab = new_slab(s, gfpflags, node);
3910         if (unlikely(!slab)) {
3911                 slab_out_of_memory(s, gfpflags, node);
3912                 return NULL;
3913         }
3914
3915         object = alloc_single_from_new_slab(s, slab, orig_size);
3916
3917         return object;
3918 }
3919 #endif /* CONFIG_SLUB_TINY */
3920
3921 /*
3922  * If the object has been wiped upon free, make sure it's fully initialized by
3923  * zeroing out freelist pointer.
3924  */
3925 static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
3926                                                    void *obj)
3927 {
3928         if (unlikely(slab_want_init_on_free(s)) && obj &&
3929             !freeptr_outside_object(s))
3930                 memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
3931                         0, sizeof(void *));
3932 }
3933
3934 static __fastpath_inline
3935 struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
3936 {
3937         flags &= gfp_allowed_mask;
3938
3939         might_alloc(flags);
3940
3941         if (unlikely(should_failslab(s, flags)))
3942                 return NULL;
3943
3944         return s;
3945 }
3946
3947 static __fastpath_inline
3948 bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
3949                           gfp_t flags, size_t size, void **p, bool init,
3950                           unsigned int orig_size)
3951 {
3952         unsigned int zero_size = s->object_size;
3953         bool kasan_init = init;
3954         size_t i;
3955         gfp_t init_flags = flags & gfp_allowed_mask;
3956
3957         /*
3958          * For kmalloc object, the allocated memory size(object_size) is likely
3959          * larger than the requested size(orig_size). If redzone check is
3960          * enabled for the extra space, don't zero it, as it will be redzoned
3961          * soon. The redzone operation for this extra space could be seen as a
3962          * replacement of current poisoning under certain debug option, and
3963          * won't break other sanity checks.
3964          */
3965         if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) &&
3966             (s->flags & SLAB_KMALLOC))
3967                 zero_size = orig_size;
3968
3969         /*
3970          * When slab_debug is enabled, avoid memory initialization integrated
3971          * into KASAN and instead zero out the memory via the memset below with
3972          * the proper size. Otherwise, KASAN might overwrite SLUB redzones and
3973          * cause false-positive reports. This does not lead to a performance
3974          * penalty on production builds, as slab_debug is not intended to be
3975          * enabled there.
3976          */
3977         if (__slub_debug_enabled())
3978                 kasan_init = false;
3979
3980         /*
3981          * As memory initialization might be integrated into KASAN,
3982          * kasan_slab_alloc and initialization memset must be
3983          * kept together to avoid discrepancies in behavior.
3984          *
3985          * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
3986          */
3987         for (i = 0; i < size; i++) {
3988                 p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init);
3989                 if (p[i] && init && (!kasan_init ||
3990                                      !kasan_has_integrated_init()))
3991                         memset(p[i], 0, zero_size);
3992                 kmemleak_alloc_recursive(p[i], s->object_size, 1,
3993                                          s->flags, init_flags);
3994                 kmsan_slab_alloc(s, p[i], init_flags);
3995                 alloc_tagging_slab_alloc_hook(s, p[i], flags);
3996         }
3997
3998         return memcg_slab_post_alloc_hook(s, lru, flags, size, p);
3999 }
4000
4001 /*
4002  * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
4003  * have the fastpath folded into their functions. So no function call
4004  * overhead for requests that can be satisfied on the fastpath.
4005  *
4006  * The fastpath works by first checking if the lockless freelist can be used.
4007  * If not then __slab_alloc is called for slow processing.
4008  *
4009  * Otherwise we can simply pick the next object from the lockless free list.
4010  */
4011 static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
4012                 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
4013 {
4014         void *object;
4015         bool init = false;
4016
4017         s = slab_pre_alloc_hook(s, gfpflags);
4018         if (unlikely(!s))
4019                 return NULL;
4020
4021         object = kfence_alloc(s, orig_size, gfpflags);
4022         if (unlikely(object))
4023                 goto out;
4024
4025         object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
4026
4027         maybe_wipe_obj_freeptr(s, object);
4028         init = slab_want_init_on_alloc(gfpflags, s);
4029
4030 out:
4031         /*
4032          * When init equals 'true', like for kzalloc() family, only
4033          * @orig_size bytes might be zeroed instead of s->object_size
4034          * In case this fails due to memcg_slab_post_alloc_hook(),
4035          * object is set to NULL
4036          */
4037         slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size);
4038
4039         return object;
4040 }
4041
4042 void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags)
4043 {
4044         void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_,
4045                                     s->object_size);
4046
4047         trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
4048
4049         return ret;
4050 }
4051 EXPORT_SYMBOL(kmem_cache_alloc_noprof);
4052
4053 void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
4054                            gfp_t gfpflags)
4055 {
4056         void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_,
4057                                     s->object_size);
4058
4059         trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
4060
4061         return ret;
4062 }
4063 EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
4064
4065 /**
4066  * kmem_cache_alloc_node - Allocate an object on the specified node
4067  * @s: The cache to allocate from.
4068  * @gfpflags: See kmalloc().
4069  * @node: node number of the target node.
4070  *
4071  * Identical to kmem_cache_alloc but it will allocate memory on the given
4072  * node, which can improve the performance for cpu bound structures.
4073  *
4074  * Fallback to other node is possible if __GFP_THISNODE is not set.
4075  *
4076  * Return: pointer to the new object or %NULL in case of error
4077  */
4078 void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node)
4079 {
4080         void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
4081
4082         trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);
4083
4084         return ret;
4085 }
4086 EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
4087
4088 /*
4089  * To avoid unnecessary overhead, we pass through large allocation requests
4090  * directly to the page allocator. We use __GFP_COMP, because we will need to
4091  * know the allocation order to free the pages properly in kfree.
4092  */
4093 static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
4094 {
4095         struct folio *folio;
4096         void *ptr = NULL;
4097         unsigned int order = get_order(size);
4098
4099         if (unlikely(flags & GFP_SLAB_BUG_MASK))
4100                 flags = kmalloc_fix_flags(flags);
4101
4102         flags |= __GFP_COMP;
4103         folio = (struct folio *)alloc_pages_node_noprof(node, flags, order);
4104         if (folio) {
4105                 ptr = folio_address(folio);
4106                 lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
4107                                       PAGE_SIZE << order);
4108         }
4109
4110         ptr = kasan_kmalloc_large(ptr, size, flags);
4111         /* As ptr might get tagged, call kmemleak hook after KASAN. */
4112         kmemleak_alloc(ptr, size, 1, flags);
4113         kmsan_kmalloc_large(ptr, size, flags);
4114
4115         return ptr;
4116 }
4117
4118 void *__kmalloc_large_noprof(size_t size, gfp_t flags)
4119 {
4120         void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE);
4121
4122         trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
4123                       flags, NUMA_NO_NODE);
4124         return ret;
4125 }
4126 EXPORT_SYMBOL(__kmalloc_large_noprof);
4127
4128 void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
4129 {
4130         void *ret = ___kmalloc_large_node(size, flags, node);
4131
4132         trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
4133                       flags, node);
4134         return ret;
4135 }
4136 EXPORT_SYMBOL(__kmalloc_large_node_noprof);
4137
4138 static __always_inline
4139 void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
4140                         unsigned long caller)
4141 {
4142         struct kmem_cache *s;
4143         void *ret;
4144
4145         if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
4146                 ret = __kmalloc_large_node_noprof(size, flags, node);
4147                 trace_kmalloc(caller, ret, size,
4148                               PAGE_SIZE << get_order(size), flags, node);
4149                 return ret;
4150         }
4151
4152         if (unlikely(!size))
4153                 return ZERO_SIZE_PTR;
4154
4155         s = kmalloc_slab(size, b, flags, caller);
4156
4157         ret = slab_alloc_node(s, NULL, flags, node, caller, size);
4158         ret = kasan_kmalloc(s, ret, size, flags);
4159         trace_kmalloc(caller, ret, size, s->size, flags, node);
4160         return ret;
4161 }
4162 void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
4163 {
4164         return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_);
4165 }
4166 EXPORT_SYMBOL(__kmalloc_node_noprof);
4167
4168 void *__kmalloc_noprof(size_t size, gfp_t flags)
4169 {
4170         return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_);
4171 }
4172 EXPORT_SYMBOL(__kmalloc_noprof);
4173
4174 void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
4175                                          int node, unsigned long caller)
4176 {
4177         return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller);
4178
4179 }
4180 EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof);
4181
4182 void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size)
4183 {
4184         void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE,
4185                                             _RET_IP_, size);
4186
4187         trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
4188
4189         ret = kasan_kmalloc(s, ret, size, gfpflags);
4190         return ret;
4191 }
4192 EXPORT_SYMBOL(__kmalloc_cache_noprof);
4193
4194 void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags,
4195                                   int node, size_t size)
4196 {
4197         void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
4198
4199         trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
4200
4201         ret = kasan_kmalloc(s, ret, size, gfpflags);
4202         return ret;
4203 }
4204 EXPORT_SYMBOL(__kmalloc_cache_node_noprof);
4205
/*
 * Free a chain of objects (@head .. @tail, @bulk_cnt entries) back to @slab
 * while holding the node's list_lock, moving the slab between the full and
 * partial lists as required. __slab_free() takes this path for debug caches
 * and when CONFIG_SLUB_TINY is enabled.
 *
 * If the free leaves the slab without objects in use and the node already
 * holds at least s->min_partial partial slabs, the slab is handed to
 * free_slab() once the lock has been dropped.
 *
 * @addr is forwarded to free_debug_processing().
 */
static noinline void free_to_partial_list(
        struct kmem_cache *s, struct slab *slab,
        void *head, void *tail, int bulk_cnt,
        unsigned long addr)
{
        struct kmem_cache_node *n = get_node(s, slab_nid(slab));
        struct slab *slab_free = NULL;
        /* Passed by pointer below, so the debug checks may adjust it. */
        int cnt = bulk_cnt;
        unsigned long flags;
        depot_stack_handle_t handle = 0;

        /* Obtain the stack handle before list_lock is taken. */
        if (s->flags & SLAB_STORE_USER)
                handle = set_track_prepare();

        spin_lock_irqsave(&n->list_lock, flags);

        if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
                void *prior = slab->freelist;

                /* Perform the actual freeing while we still hold the locks */
                slab->inuse -= cnt;
                set_freepointer(s, tail, prior);
                slab->freelist = head;

                /*
                 * If the slab is empty, and node's partial list is full,
                 * it should be discarded anyway no matter it's on full or
                 * partial list.
                 */
                if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
                        slab_free = slab;

                if (!prior) {
                        /* was on full list */
                        remove_full(s, n, slab);
                        if (!slab_free) {
                                add_partial(n, slab, DEACTIVATE_TO_TAIL);
                                stat(s, FREE_ADD_PARTIAL);
                        }
                } else if (slab_free) {
                        /* Already on the partial list and about to be freed. */
                        remove_partial(n, slab);
                        stat(s, FREE_REMOVE_PARTIAL);
                }
        }

        if (slab_free) {
                /*
                 * Update the counters while still holding n->list_lock to
                 * prevent spurious validation warnings
                 */
                dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
        }

        spin_unlock_irqrestore(&n->list_lock, flags);

        /* The slab itself is released only after the lock is dropped. */
        if (slab_free) {
                stat(s, FREE_SLAB);
                free_slab(s, slab_free);
        }
}
4266
/*
 * Slow path handling. This may still be called frequently since objects
 * have a longer lifetime than the cpu slabs in most processing loads.
 *
 * So we still attempt to reduce cache line usage. Just take the slab
 * lock and free the item. If there is no additional partial slab
 * handling required then we can return immediately.
 *
 * @head/@tail delimit a chain of @cnt objects that all belong to @slab.
 * Debug caches and CONFIG_SLUB_TINY are diverted to free_to_partial_list().
 */
static void __slab_free(struct kmem_cache *s, struct slab *slab,
                        void *head, void *tail, int cnt,
                        unsigned long addr)

{
        void *prior;
        int was_frozen;
        /*
         * Scratch copy only: counters is assigned to it and frozen/inuse are
         * read back / updated through it before the new value is installed.
         */
        struct slab new;
        unsigned long counters;
        /* Non-NULL exactly while n->list_lock is held by this function. */
        struct kmem_cache_node *n = NULL;
        unsigned long flags;
        /* Only assigned (and only read) on paths where list_lock was taken. */
        bool on_node_partial;

        stat(s, FREE_SLOWPATH);

        if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
                free_to_partial_list(s, slab, head, tail, cnt, addr);
                return;
        }

        /*
         * Retry loop: compute the new freelist head and counters, then try to
         * install both with slab_update_freelist(). A failed attempt drops a
         * speculatively taken list_lock before recomputing.
         */
        do {
                if (unlikely(n)) {
                        spin_unlock_irqrestore(&n->list_lock, flags);
                        n = NULL;
                }
                prior = slab->freelist;
                counters = slab->counters;
                set_freepointer(s, tail, prior);
                new.counters = counters;
                was_frozen = new.frozen;
                new.inuse -= cnt;
                if ((!new.inuse || !prior) && !was_frozen) {
                        /* Needs to be taken off a list */
                        if (!kmem_cache_has_cpu_partial(s) || prior) {

                                n = get_node(s, slab_nid(slab));
                                /*
                                 * Speculatively acquire the list_lock.
                                 * If the cmpxchg does not succeed then we may
                                 * drop the list_lock without any processing.
                                 *
                                 * Otherwise the list_lock will synchronize with
                                 * other processors updating the list of slabs.
                                 */
                                spin_lock_irqsave(&n->list_lock, flags);

                                on_node_partial = slab_test_node_partial(slab);
                        }
                }

        } while (!slab_update_freelist(s, slab,
                prior, counters,
                head, new.counters,
                "__slab_free"));

        if (likely(!n)) {

                if (likely(was_frozen)) {
                        /*
                         * The list lock was not taken therefore no list
                         * activity can be necessary.
                         */
                        stat(s, FREE_FROZEN);
                } else if (kmem_cache_has_cpu_partial(s) && !prior) {
                        /*
                         * If we started with a full slab then put it onto the
                         * per cpu partial list.
                         */
                        put_cpu_partial(s, slab, 1);
                        stat(s, CPU_PARTIAL_FREE);
                }

                return;
        }

        /*
         * This slab was partially empty but not on the per-node partial list,
         * in which case we shouldn't manipulate its list, just return.
         */
        if (prior && !on_node_partial) {
                spin_unlock_irqrestore(&n->list_lock, flags);
                return;
        }

        /* Empty slab and the node already keeps enough partial slabs. */
        if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
                goto slab_empty;

        /*
         * Objects left in the slab. If it was not on the partial list before
         * then add it.
         */
        if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
                add_partial(n, slab, DEACTIVATE_TO_TAIL);
                stat(s, FREE_ADD_PARTIAL);
        }
        spin_unlock_irqrestore(&n->list_lock, flags);
        return;

slab_empty:
        if (prior) {
                /*
                 * Slab on the partial list.
                 */
                remove_partial(n, slab);
                stat(s, FREE_REMOVE_PARTIAL);
        }

        /* Unlock first; the slab is discarded outside the list_lock. */
        spin_unlock_irqrestore(&n->list_lock, flags);
        stat(s, FREE_SLAB);
        discard_slab(s, slab);
}
4386
#ifndef CONFIG_SLUB_TINY
/*
 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 * can perform fastpath freeing without additional function calls.
 *
 * The fastpath is only possible if we are freeing to the current cpu slab
 * of this processor. This is typically the case if we have just allocated
 * the item before.
 *
 * If fastpath is not possible then fall back to __slab_free where we deal
 * with all sorts of special processing.
 *
 * Bulk free of a freelist with several objects (all pointing to the
 * same slab) is possible by specifying head and tail ptr, plus objects
 * count (cnt). Bulk free is indicated by the tail pointer being set.
 */
static __always_inline void do_slab_free(struct kmem_cache *s,
                                struct slab *slab, void *head, void *tail,
                                int cnt, unsigned long addr)
{
        struct kmem_cache_cpu *c;
        unsigned long tid;
        void **freelist;

redo:
        /*
         * Determine the current cpu's per cpu slab.
         * The cpu may change afterward. However that does not matter since
         * data is retrieved via this pointer. If we are on the same cpu
         * during the cmpxchg then the free will succeed.
         */
        c = raw_cpu_ptr(s->cpu_slab);
        tid = READ_ONCE(c->tid);

        /* Same with comment on barrier() in __slab_alloc_node() */
        barrier();

        /* Not this cpu's active slab: hand the objects to the slow path. */
        if (unlikely(slab != c->slab)) {
                __slab_free(s, slab, head, tail, cnt, addr);
                return;
        }

        if (USE_LOCKLESS_FAST_PATH()) {
                freelist = READ_ONCE(c->freelist);

                /* Chain the freed objects in front of the current freelist. */
                set_freepointer(s, tail, freelist);

                /* Start over from redo if the freelist/tid update fails. */
                if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
                        note_cmpxchg_failure("slab_free", s, tid);
                        goto redo;
                }
        } else {
                /* Update the free list under the local lock */
                local_lock(&s->cpu_slab->lock);
                c = this_cpu_ptr(s->cpu_slab);
                /* Re-check under the lock; the cpu slab may have changed. */
                if (unlikely(slab != c->slab)) {
                        local_unlock(&s->cpu_slab->lock);
                        goto redo;
                }
                tid = c->tid;
                freelist = c->freelist;

                set_freepointer(s, tail, freelist);
                c->freelist = head;
                c->tid = next_tid(tid);

                local_unlock(&s->cpu_slab->lock);
        }
        stat_add(s, FREE_FASTPATH, cnt);
}
#else /* CONFIG_SLUB_TINY */
/*
 * CONFIG_SLUB_TINY variant: there is no per-cpu fastpath here, every free
 * is forwarded to __slab_free().
 */
static void do_slab_free(struct kmem_cache *s,
                                struct slab *slab, void *head, void *tail,
                                int cnt, unsigned long addr)
{
        __slab_free(s, slab, head, tail, cnt, addr);
}
#endif /* CONFIG_SLUB_TINY */
4465
4466 static __fastpath_inline
4467 void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
4468                unsigned long addr)
4469 {
4470         memcg_slab_free_hook(s, slab, &object, 1);
4471         alloc_tagging_slab_free_hook(s, slab, &object, 1);
4472
4473         if (likely(slab_free_hook(s, object, slab_want_init_on_free(s))))
4474                 do_slab_free(s, slab, object, object, 1, addr);
4475 }
4476
#ifdef CONFIG_MEMCG
/*
 * Give back one freshly allocated @object after memcg charging failed.
 * Kept out of line so this rare path is not inlined into the allocation path.
 */
static noinline
void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
{
        if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s))))
                return;

        do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_);
}
#endif
4486
4487 static __fastpath_inline
4488 void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
4489                     void *tail, void **p, int cnt, unsigned long addr)
4490 {
4491         memcg_slab_free_hook(s, slab, p, cnt);
4492         alloc_tagging_slab_free_hook(s, slab, p, cnt);
4493         /*
4494          * With KASAN enabled slab_free_freelist_hook modifies the freelist
4495          * to remove objects, whose reuse must be delayed.
4496          */
4497         if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt)))
4498                 do_slab_free(s, slab, head, tail, cnt, addr);
4499 }
4500
#ifdef CONFIG_KASAN_GENERIC
/*
 * Free object @x of @cache directly through do_slab_free(), without running
 * any of the free hooks. Only built for CONFIG_KASAN_GENERIC.
 */
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
        struct slab *slab = virt_to_slab(x);

        do_slab_free(cache, slab, x, x, 1, addr);
}
#endif
4507
4508 static inline struct kmem_cache *virt_to_cache(const void *obj)
4509 {
4510         struct slab *slab;
4511
4512         slab = virt_to_slab(obj);
4513         if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__))
4514                 return NULL;
4515         return slab->slab_cache;
4516 }
4517
4518 static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
4519 {
4520         struct kmem_cache *cachep;
4521
4522         if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
4523             !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
4524                 return s;
4525
4526         cachep = virt_to_cache(x);
4527         if (WARN(cachep && cachep != s,
4528                  "%s: Wrong slab cache. %s but object is from %s\n",
4529                  __func__, s->name, cachep->name))
4530                 print_tracking(cachep, x);
4531         return cachep;
4532 }
4533
4534 /**
4535  * kmem_cache_free - Deallocate an object
4536  * @s: The cache the allocation was from.
4537  * @x: The previously allocated object.
4538  *
4539  * Free an object which was previously allocated from this
4540  * cache.
4541  */
4542 void kmem_cache_free(struct kmem_cache *s, void *x)
4543 {
4544         s = cache_from_obj(s, x);
4545         if (!s)
4546                 return;
4547         trace_kmem_cache_free(_RET_IP_, x, s);
4548         slab_free(s, virt_to_slab(x), x, _RET_IP_);
4549 }
4550 EXPORT_SYMBOL(kmem_cache_free);
4551
4552 static void free_large_kmalloc(struct folio *folio, void *object)
4553 {
4554         unsigned int order = folio_order(folio);
4555
4556         if (WARN_ON_ONCE(order == 0))
4557                 pr_warn_once("object pointer: 0x%p\n", object);
4558
4559         kmemleak_free(object);
4560         kasan_kfree_large(object);
4561         kmsan_kfree_large(object);
4562
4563         lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
4564                               -(PAGE_SIZE << order));
4565         folio_put(folio);
4566 }
4567
4568 /**
4569  * kfree - free previously allocated memory
4570  * @object: pointer returned by kmalloc() or kmem_cache_alloc()
4571  *
4572  * If @object is NULL, no operation is performed.
4573  */
4574 void kfree(const void *object)
4575 {
4576         struct folio *folio;
4577         struct slab *slab;
4578         struct kmem_cache *s;
4579         void *x = (void *)object;
4580
4581         trace_kfree(_RET_IP_, object);
4582
4583         if (unlikely(ZERO_OR_NULL_PTR(object)))
4584                 return;
4585
4586         folio = virt_to_folio(object);
4587         if (unlikely(!folio_test_slab(folio))) {
4588                 free_large_kmalloc(folio, (void *)object);
4589                 return;
4590         }
4591
4592         slab = folio_slab(folio);
4593         s = slab->slab_cache;
4594         slab_free(s, slab, x, _RET_IP_);
4595 }
4596 EXPORT_SYMBOL(kfree);
4597
/*
 * One batch of objects from a single slab, linked into a private freelist
 * by build_detached_freelist() so they can be freed in one operation.
 */
struct detached_freelist {
        /* Slab all objects of the batch belong to; NULL if nothing to free. */
        struct slab *slab;
        /* Last object of the chain (the first object picked for the batch). */
        void *tail;
        /* Head of the chain (the most recently added object). */
        void *freelist;
        /* Number of objects in the chain. */
        int cnt;
        /* Cache the batch is freed to. */
        struct kmem_cache *s;
};
4605
/*
 * This function progressively scans the array with free objects (with
 * a limited look ahead) and extracts objects belonging to the same
 * slab.  It builds a detached freelist directly within the given
 * slab/objects.  This can happen without any need for
 * synchronization, because the objects are owned by running process.
 * The freelist is built up as a singly linked list in the objects.
 * The idea is, that this detached freelist can then be bulk
 * transferred to the real freelist(s), but only requiring a single
 * synchronization primitive.  Look ahead in the array is limited due
 * to performance reasons.
 *
 * @s may be NULL, in which case the cache is derived from the object.
 * @size is the number of entries of @p[] that are still to be processed;
 * the scan starts at the end of the array.
 *
 * Returns the number of entries of @p[] left for the next call. On return
 * df->slab is NULL if the picked object was not slab-backed and has already
 * been freed here; otherwise @df describes the batch to free.
 */
static inline
int build_detached_freelist(struct kmem_cache *s, size_t size,
                            void **p, struct detached_freelist *df)
{
        /* Stop scanning after this many objects from other slabs. */
        int lookahead = 3;
        void *object;
        struct folio *folio;
        size_t same;

        object = p[--size];
        folio = virt_to_folio(object);
        if (!s) {
                /* Handle kmalloc'ed objects */
                if (unlikely(!folio_test_slab(folio))) {
                        free_large_kmalloc(folio, object);
                        df->slab = NULL;
                        return size;
                }
                /* Derive kmem_cache from object */
                df->slab = folio_slab(folio);
                df->s = df->slab->slab_cache;
        } else {
                df->slab = folio_slab(folio);
                df->s = cache_from_obj(s, object); /* Support for memcg */
        }

        /* Start new detached freelist */
        df->tail = object;
        df->freelist = object;
        df->cnt = 1;

        /* A KFENCE object is returned as a batch of one, left unlinked. */
        if (is_kfence_address(object))
                return size;

        set_freepointer(df->s, object, NULL);

        /*
         * 'same' is the lowest index occupied by objects of this batch.
         * Matching objects found further down are swapped up next to it, so
         * the batch stays contiguous at the end of the remaining array.
         */
        same = size;
        while (size) {
                object = p[--size];
                /* df->slab is always set at this point */
                if (df->slab == virt_to_slab(object)) {
                        /* Opportunity build freelist */
                        set_freepointer(df->s, object, df->freelist);
                        df->freelist = object;
                        df->cnt++;
                        same--;
                        if (size != same)
                                swap(p[size], p[same]);
                        continue;
                }

                /* Limit look ahead search */
                if (!--lookahead)
                        break;
        }

        return same;
}
4676
4677 /*
4678  * Internal bulk free of objects that were not initialised by the post alloc
4679  * hooks and thus should not be processed by the free hooks
4680  */
4681 static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
4682 {
4683         if (!size)
4684                 return;
4685
4686         do {
4687                 struct detached_freelist df;
4688
4689                 size = build_detached_freelist(s, size, p, &df);
4690                 if (!df.slab)
4691                         continue;
4692
4693                 do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt,
4694                              _RET_IP_);
4695         } while (likely(size));
4696 }
4697
4698 /* Note that interrupts must be enabled when calling this function. */
4699 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
4700 {
4701         if (!size)
4702                 return;
4703
4704         do {
4705                 struct detached_freelist df;
4706
4707                 size = build_detached_freelist(s, size, p, &df);
4708                 if (!df.slab)
4709                         continue;
4710
4711                 slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size],
4712                                df.cnt, _RET_IP_);
4713         } while (likely(size));
4714 }
4715 EXPORT_SYMBOL(kmem_cache_free_bulk);
4716
#ifndef CONFIG_SLUB_TINY
/*
 * Fill @p[0..@size-1] with objects from cache @s, preferring the per cpu
 * freelist and falling back to ___slab_alloc() when it runs empty.
 *
 * Returns the number of objects stored (@size) on success. On failure the
 * objects obtained so far are released with __kmem_cache_free_bulk() and
 * 0 is returned.
 */
static inline
int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                            void **p)
{
        struct kmem_cache_cpu *c;
        unsigned long irqflags;
        int i;

        /*
         * Drain objects in the per cpu slab, while disabling local
         * IRQs, which protects against PREEMPT and interrupt
         * handlers invoking normal fastpath.
         */
        c = slub_get_cpu_ptr(s->cpu_slab);
        local_lock_irqsave(&s->cpu_slab->lock, irqflags);

        for (i = 0; i < size; i++) {
                /* Give KFENCE the first chance to supply the object. */
                void *object = kfence_alloc(s, s->object_size, flags);

                if (unlikely(object)) {
                        p[i] = object;
                        continue;
                }

                object = c->freelist;
                if (unlikely(!object)) {
                        /*
                         * We may have removed an object from c->freelist using
                         * the fastpath in the previous iteration; in that case,
                         * c->tid has not been bumped yet.
                         * Since ___slab_alloc() may reenable interrupts while
                         * allocating memory, we should bump c->tid now.
                         */
                        c->tid = next_tid(c->tid);

                        local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);

                        /*
                         * Invoking slow path likely have side-effect
                         * of re-populating per CPU c->freelist
                         */
                        p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
                                            _RET_IP_, c, s->object_size);
                        /* The local lock is not held on this error path. */
                        if (unlikely(!p[i]))
                                goto error;

                        c = this_cpu_ptr(s->cpu_slab);
                        maybe_wipe_obj_freeptr(s, p[i]);

                        local_lock_irqsave(&s->cpu_slab->lock, irqflags);

                        continue; /* goto for-loop */
                }
                /* Fastpath: pop the first object off the per cpu freelist. */
                c->freelist = get_freepointer(s, object);
                p[i] = object;
                maybe_wipe_obj_freeptr(s, p[i]);
                stat(s, ALLOC_FASTPATH);
        }
        c->tid = next_tid(c->tid);
        local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
        slub_put_cpu_ptr(s->cpu_slab);

        return i;

error:
        slub_put_cpu_ptr(s->cpu_slab);
        /* Give back the i objects that were already handed out. */
        __kmem_cache_free_bulk(s, i, p);
        return 0;

}
#else /* CONFIG_SLUB_TINY */
/*
 * CONFIG_SLUB_TINY variant: allocate the objects one at a time through
 * __slab_alloc_node(). Same return convention as the variant above:
 * @size on success, 0 after freeing the partial result on failure.
 */
static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
                                   size_t size, void **p)
{
        int i;

        for (i = 0; i < size; i++) {
                /* Give KFENCE the first chance to supply the object. */
                void *object = kfence_alloc(s, s->object_size, flags);

                if (unlikely(object)) {
                        p[i] = object;
                        continue;
                }

                p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE,
                                         _RET_IP_, s->object_size);
                if (unlikely(!p[i]))
                        goto error;

                maybe_wipe_obj_freeptr(s, p[i]);
        }

        return i;

error:
        /* Give back the i objects that were already handed out. */
        __kmem_cache_free_bulk(s, i, p);
        return 0;
}
#endif /* CONFIG_SLUB_TINY */
4817
4818 /* Note that interrupts must be enabled when calling this function. */
4819 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
4820                                  void **p)
4821 {
4822         int i;
4823
4824         if (!size)
4825                 return 0;
4826
4827         s = slab_pre_alloc_hook(s, flags);
4828         if (unlikely(!s))
4829                 return 0;
4830
4831         i = __kmem_cache_alloc_bulk(s, flags, size, p);
4832         if (unlikely(i == 0))
4833                 return 0;
4834
4835         /*
4836          * memcg and kmem_cache debug support and memory initialization.
4837          * Done outside of the IRQ disabled fastpath loop.
4838          */
4839         if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p,
4840                     slab_want_init_on_alloc(flags, s), s->object_size))) {
4841                 return 0;
4842         }
4843         return i;
4844 }
4845 EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
4846
4847
4848 /*
4849  * Object placement in a slab is made very easy because we always start at
4850  * offset 0. If we tune the size of the object to the alignment then we can
4851  * get the required alignment by putting one properly sized object after
4852  * another.
4853  *
4854  * Notice that the allocation order determines the sizes of the per cpu
4855  * caches. Each processor has always one slab available for allocations.
4856  * Increasing the allocation order reduces the number of times that slabs
4857  * must be moved on and off the partial lists and is therefore a factor in
4858  * locking overhead.
4859  */
4860
/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 */
/* Lower bound for the order chosen by calculate_order(). */
static unsigned int slub_min_order;
/*
 * Highest order calculate_order() searches before falling back to the
 * smallest order that fits a single object. Defaults to 1 with
 * CONFIG_SLUB_TINY and to PAGE_ALLOC_COSTLY_ORDER otherwise.
 */
static unsigned int slub_max_order =
        IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
/*
 * Desired minimum number of objects per slab. While this is 0,
 * calculate_order() derives a value from the number of CPUs instead.
 */
static unsigned int slub_min_objects;
4871
4872 /*
4873  * Calculate the order of allocation given an slab object size.
4874  *
4875  * The order of allocation has significant impact on performance and other
4876  * system components. Generally order 0 allocations should be preferred since
4877  * order 0 does not cause fragmentation in the page allocator. Larger objects
4878  * be problematic to put into order 0 slabs because there may be too much
4879  * unused space left. We go to a higher order if more than 1/16th of the slab
4880  * would be wasted.
4881  *
4882  * In order to reach satisfactory performance we must ensure that a minimum
4883  * number of objects is in one slab. Otherwise we may generate too much
4884  * activity on the partial lists which requires taking the list_lock. This is
4885  * less a concern for large slabs though which are rarely used.
4886  *
4887  * slab_max_order specifies the order where we begin to stop considering the
4888  * number of objects in a slab as critical. If we reach slab_max_order then
4889  * we try to keep the page order as low as possible. So we accept more waste
4890  * of space in favor of a small page order.
4891  *
4892  * Higher order allocations also allow the placement of more objects in a
4893  * slab and thereby reduce object handling overhead. If the user has
4894  * requested a higher minimum order then we start with that one instead of
4895  * the smallest order which will fit the object.
4896  */
4897 static inline unsigned int calc_slab_order(unsigned int size,
4898                 unsigned int min_order, unsigned int max_order,
4899                 unsigned int fract_leftover)
4900 {
4901         unsigned int order;
4902
4903         for (order = min_order; order <= max_order; order++) {
4904
4905                 unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
4906                 unsigned int rem;
4907
4908                 rem = slab_size % size;
4909
4910                 if (rem <= slab_size / fract_leftover)
4911                         break;
4912         }
4913
4914         return order;
4915 }
4916
/*
 * Choose the page order for slabs that hold objects of @size bytes.
 *
 * Returns the chosen order, or -ENOSYS when even a single object would
 * need an order above MAX_PAGE_ORDER.
 */
static inline int calculate_order(unsigned int size)
{
        unsigned int order;
        unsigned int min_objects;
        unsigned int max_objects;
        unsigned int min_order;

        min_objects = slub_min_objects;
        if (!min_objects) {
                /*
                 * Some architectures will only update present cpus when
                 * onlining them, so don't trust the number if it's just 1. But
                 * we also don't want to use nr_cpu_ids always, as on some other
                 * architectures, there can be many possible cpus, but never
                 * onlined. Here we compromise between trying to avoid too high
                 * order on systems that appear larger than they are, and too
                 * low order on systems that appear smaller than they are.
                 */
                unsigned int nr_cpus = num_present_cpus();
                if (nr_cpus <= 1)
                        nr_cpus = nr_cpu_ids;
                min_objects = 4 * (fls(nr_cpus) + 1);
        }
        /* min_objects can't be 0 because get_order(0) is undefined */
        max_objects = max(order_objects(slub_max_order, size), 1U);
        /* Never ask for more objects than fit at slub_max_order. */
        min_objects = min(min_objects, max_objects);

        min_order = max_t(unsigned int, slub_min_order,
                          get_order(min_objects * size));
        /* Cap the order so a slab never exceeds MAX_OBJS_PER_PAGE objects. */
        if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
                return get_order(size * MAX_OBJS_PER_PAGE) - 1;

        /*
         * Attempt to find best configuration for a slab. This works by first
         * attempting to generate a layout with the best possible configuration
         * and backing off gradually.
         *
         * We start with accepting at most 1/16 waste and try to find the
         * smallest order from min_objects-derived/slab_min_order up to
         * slab_max_order that will satisfy the constraint. Note that increasing
         * the order can only result in same or less fractional waste, not more.
         *
         * If that fails, we increase the acceptable fraction of waste and try
         * again. The last iteration with fraction of 1/2 would effectively
         * accept any waste and give us the order determined by min_objects, as
         * long as at least single object fits within slab_max_order.
         */
        for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
                order = calc_slab_order(size, min_order, slub_max_order,
                                        fraction);
                if (order <= slub_max_order)
                        return order;
        }

        /*
         * Doh this slab cannot be placed using slab_max_order.
         */
        order = get_order(size);
        if (order <= MAX_PAGE_ORDER)
                return order;
        return -ENOSYS;
}
4979
4980 static void
4981 init_kmem_cache_node(struct kmem_cache_node *n)
4982 {
4983         n->nr_partial = 0;
4984         spin_lock_init(&n->list_lock);
4985         INIT_LIST_HEAD(&n->partial);
4986 #ifdef CONFIG_SLUB_DEBUG
4987         atomic_long_set(&n->nr_slabs, 0);
4988         atomic_long_set(&n->total_objects, 0);
4989         INIT_LIST_HEAD(&n->full);
4990 #endif
4991 }
4992
4993 #ifndef CONFIG_SLUB_TINY
4994 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
4995 {
4996         BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
4997                         NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
4998                         sizeof(struct kmem_cache_cpu));
4999
5000         /*
5001          * Must align to double word boundary for the double cmpxchg
5002          * instructions to work; see __pcpu_double_call_return_bool().
5003          */
5004         s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
5005                                      2 * sizeof(void *));
5006
5007         if (!s->cpu_slab)
5008                 return 0;
5009
5010         init_kmem_cache_cpus(s);
5011
5012         return 1;
5013 }
5014 #else
5015 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
5016 {
5017         return 1;
5018 }
5019 #endif /* CONFIG_SLUB_TINY */
5020
/* The cache that struct kmem_cache_node objects are allocated from. */
static struct kmem_cache *kmem_cache_node;
5022
/*
 * No kmalloc_node yet so do it by hand. We know that this is the first
 * slab on the node for this slabcache. There are no concurrent accesses
 * possible.
 *
 * Note that this function only works on the kmem_cache_node
 * when allocating for the kmem_cache_node. This is used for bootstrapping
 * memory on a fresh node that has no slab structures yet.
 *
 * The function BUG()s if no slab can be allocated or the new slab has no
 * free object.
 */
static void early_kmem_cache_node_alloc(int node)
{
        struct slab *slab;
        struct kmem_cache_node *n;

        /* The node structure is carved out of a kmem_cache_node object. */
        BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));

        slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);

        BUG_ON(!slab);
        /* Memory from another node is tolerated, but reported. */
        if (slab_nid(slab) != node) {
                pr_err("SLUB: Unable to allocate memory from node %d\n", node);
                pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
        }

        /* Take the first free object of the new slab by hand. */
        n = slab->freelist;
        BUG_ON(!n);
#ifdef CONFIG_SLUB_DEBUG
        init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
#endif
        n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
        /* Account for the object just taken: advance freelist, one in use. */
        slab->freelist = get_freepointer(kmem_cache_node, n);
        slab->inuse = 1;
        kmem_cache_node->node[node] = n;
        init_kmem_cache_node(n);
        inc_slabs_node(kmem_cache_node, node, slab->objects);

        /*
         * No locks need to be taken here as it has just been
         * initialized and there is no concurrent access.
         */
        __add_partial(n, slab, DEACTIVATE_TO_HEAD);
}
5065
5066 static void free_kmem_cache_nodes(struct kmem_cache *s)
5067 {
5068         int node;
5069         struct kmem_cache_node *n;
5070
5071         for_each_kmem_cache_node(s, node, n) {
5072                 s->node[node] = NULL;
5073                 kmem_cache_free(kmem_cache_node, n);
5074         }
5075 }
5076
/*
 * Free the auxiliary allocations attached to @s: the random freelist
 * sequence, the per-cpu slab data (skipped when CONFIG_SLUB_TINY is set)
 * and the per-node structures. The kmem_cache structure itself is not
 * freed here.
 */
void __kmem_cache_release(struct kmem_cache *s)
{
        cache_random_seq_destroy(s);
#ifndef CONFIG_SLUB_TINY
        free_percpu(s->cpu_slab);
#endif
        free_kmem_cache_nodes(s);
}
5085
5086 static int init_kmem_cache_nodes(struct kmem_cache *s)
5087 {
5088         int node;
5089
5090         for_each_node_mask(node, slab_nodes) {
5091                 struct kmem_cache_node *n;
5092
5093                 if (slab_state == DOWN) {
5094                         early_kmem_cache_node_alloc(node);
5095                         continue;
5096                 }
5097                 n = kmem_cache_alloc_node(kmem_cache_node,
5098                                                 GFP_KERNEL, node);
5099
5100                 if (!n) {
5101                         free_kmem_cache_nodes(s);
5102                         return 0;
5103                 }
5104
5105                 init_kmem_cache_node(n);
5106                 s->node[node] = n;
5107         }
5108         return 1;
5109 }
5110
/*
 * Choose the per-cpu partial list budget for @s from its object size and
 * hand it to slub_set_cpu_partial(). Does nothing unless
 * CONFIG_SLUB_CPU_PARTIAL is enabled.
 */
static void set_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
        /*
         * cpu_partial determines the maximum number of objects kept in the
         * per cpu partial lists of a processor.
         *
         * Per cpu partial lists mainly contain slabs that just have one
         * object freed. If they are used for allocation then they can be
         * filled up again with minimal effort. The slab will never hit the
         * per node partial lists and therefore no locking will be required.
         *
         * For backwards compatibility reasons, this is determined as number
         * of objects, even though we now limit maximum number of pages, see
         * slub_set_cpu_partial()
         */
        unsigned int nr_objects = 0;

        if (kmem_cache_has_cpu_partial(s)) {
                /* Larger objects get a smaller budget. */
                if (s->size >= PAGE_SIZE)
                        nr_objects = 6;
                else if (s->size >= 1024)
                        nr_objects = 24;
                else if (s->size >= 256)
                        nr_objects = 52;
                else
                        nr_objects = 120;
        }

        slub_set_cpu_partial(s, nr_objects);
#endif
}
5143
/*
 * calculate_sizes() determines the order and the distribution of data within
 * a slab object.
 *
 * Starting from s->object_size it derives and stores s->inuse, s->offset,
 * s->red_left_pad (when red zoning), s->size, s->reciprocal_size,
 * s->allocflags, s->oo and s->min.
 *
 * Note that the local 'flags' is a snapshot of s->flags taken on entry;
 * changes made to s->flags further down (the __OBJECT_POISON toggle and
 * anything kasan_cache_create() may change through the pointer it is given)
 * are not reflected in it.
 *
 * Returns 1 on success, 0 if calculate_order() yields a negative order or
 * if the chosen order/size combination holds no objects.
 */
static int calculate_sizes(struct kmem_cache *s)
{
        slab_flags_t flags = s->flags;
        unsigned int size = s->object_size;
        unsigned int order;

        /*
         * Round up object size to the next word boundary. We can only
         * place the free pointer at word boundaries and this determines
         * the possible location of the free pointer.
         */
        size = ALIGN(size, sizeof(void *));

#ifdef CONFIG_SLUB_DEBUG
        /*
         * Determine if we can poison the object itself. If the user of
         * the slab may touch the object after free or before allocation
         * then we should never poison the object itself.
         */
        if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
                        !s->ctor)
                s->flags |= __OBJECT_POISON;
        else
                s->flags &= ~__OBJECT_POISON;


        /*
         * If we are Redzoning then check if there is some space between the
         * end of the object and the free pointer. If not then add an
         * additional word to have some bytes to store Redzone information.
         */
        if ((flags & SLAB_RED_ZONE) && size == s->object_size)
                size += sizeof(void *);
#endif

        /*
         * With that we have determined the number of bytes in actual use
         * by the object and redzoning.
         */
        s->inuse = size;

        if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor ||
            ((flags & SLAB_RED_ZONE) &&
             (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
                /*
                 * Relocate free pointer after the object if it is not
                 * permitted to overwrite the first word of the object on
                 * kmem_cache_free.
                 *
                 * This is the case if we do RCU, have a constructor or
                 * destructor, are poisoning the objects, or are
                 * redzoning an object smaller than sizeof(void *) or are
                 * redzoning an object with slub_debug_orig_size() enabled,
                 * in which case the right redzone may be extended.
                 *
                 * The assumption that s->offset >= s->inuse means free
                 * pointer is outside of the object is used in the
                 * freeptr_outside_object() function. If that is no
                 * longer true, the function needs to be modified.
                 */
                s->offset = size;
                size += sizeof(void *);
        } else {
                /*
                 * Store freelist pointer near middle of object to keep
                 * it away from the edges of the object to avoid small
                 * sized over/underflows from neighboring allocations.
                 */
                s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
        }

#ifdef CONFIG_SLUB_DEBUG
        if (flags & SLAB_STORE_USER) {
                /*
                 * Need to store information about allocs and frees after
                 * the object.
                 */
                size += 2 * sizeof(struct track);

                /* Save the original kmalloc request size */
                if (flags & SLAB_KMALLOC)
                        size += sizeof(unsigned int);
        }
#endif

        /* KASAN is handed both the running size and the cache flags. */
        kasan_cache_create(s, &size, &s->flags);
#ifdef CONFIG_SLUB_DEBUG
        if (flags & SLAB_RED_ZONE) {
                /*
                 * Add some empty padding so that we can catch
                 * overwrites from earlier objects rather than let
                 * tracking information or the free pointer be
                 * corrupted if a user writes before the start
                 * of the object.
                 */
                size += sizeof(void *);

                /* The left pad is one word, rounded up to the alignment. */
                s->red_left_pad = sizeof(void *);
                s->red_left_pad = ALIGN(s->red_left_pad, s->align);
                size += s->red_left_pad;
        }
#endif

        /*
         * SLUB stores one object immediately after another beginning from
         * offset 0. In order to align the objects we have to simply size
         * each object to conform to the alignment.
         */
        size = ALIGN(size, s->align);
        s->size = size;
        s->reciprocal_size = reciprocal_value(size);
        order = calculate_order(size);

        /* 'order' is unsigned; a negative result is detected via the cast. */
        if ((int)order < 0)
                return 0;

        /* Translate cache flags into page allocator flags. */
        s->allocflags = __GFP_COMP;

        if (s->flags & SLAB_CACHE_DMA)
                s->allocflags |= GFP_DMA;

        if (s->flags & SLAB_CACHE_DMA32)
                s->allocflags |= GFP_DMA32;

        if (s->flags & SLAB_RECLAIM_ACCOUNT)
                s->allocflags |= __GFP_RECLAIMABLE;

        /*
         * Determine the number of objects per slab
         */
        s->oo = oo_make(order, size);
        s->min = oo_make(get_order(size), size);

        return !!oo_objects(s->oo);
}
5283
/*
 * Initialize the SLUB-specific parts of @s: effective flags, object layout,
 * partial list limits, the optional random freelist sequence, the per-node
 * structures and the per-cpu data.
 *
 * Returns 0 on success. On any failure whatever has been set up so far is
 * torn down through __kmem_cache_release() and -EINVAL is returned,
 * regardless of which step failed.
 */
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
        s->flags = kmem_cache_flags(flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
        s->random = get_random_long();
#endif

        if (!calculate_sizes(s))
                goto error;
        if (disable_higher_order_debug) {
                /*
                 * Disable debugging flags that store metadata if the min slab
                 * order increased.
                 */
                if (get_order(s->size) > get_order(s->object_size)) {
                        s->flags &= ~DEBUG_METADATA_FLAGS;
                        s->offset = 0;
                        /* Redo the layout with the reduced flag set. */
                        if (!calculate_sizes(s))
                                goto error;
                }
        }

#ifdef system_has_freelist_aba
        if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
                /* Enable fast mode */
                s->flags |= __CMPXCHG_DOUBLE;
        }
#endif

        /*
         * The larger the object size is, the more slabs we want on the partial
         * list to avoid pounding the page allocator excessively.
         */
        /* Clamp ilog2(size) / 2 into the range [MIN_PARTIAL, MAX_PARTIAL]. */
        s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
        s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);

        set_cpu_partial(s);

#ifdef CONFIG_NUMA
        s->remote_node_defrag_ratio = 1000;
#endif

        /* Initialize the pre-computed randomized freelist if slab is up */
        if (slab_state >= UP) {
                if (init_cache_random_seq(s))
                        goto error;
        }

        if (!init_kmem_cache_nodes(s))
                goto error;

        /* Non-zero means success here; zero falls through to the error path. */
        if (alloc_kmem_cache_cpus(s))
                return 0;

error:
        __kmem_cache_release(s);
        return -EINVAL;
}
5342
/*
 * Report @slab through slab_err() using @text as the format (with the cache
 * name as its argument), then print the address, offset and tracking data
 * of every object whose bit is not set in the map filled by __fill_map().
 * Compiles to an empty function without CONFIG_SLUB_DEBUG.
 */
static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
                              const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
        void *base = slab_address(slab);
        void *obj;

        slab_err(s, slab, text, s->name);

        /* object_map is shared, so it is only used under its lock. */
        spin_lock(&object_map_lock);
        __fill_map(object_map, s, slab);

        for_each_object(obj, s, base, slab->objects) {
                if (test_bit(__obj_to_index(s, base, obj), object_map))
                        continue;

                pr_err("Object 0x%p @offset=%tu\n", obj, obj - base);
                print_tracking(s, obj);
        }
        spin_unlock(&object_map_lock);
#endif
}
5365
5366 /*
5367  * Attempt to free all partial slabs on a node.
5368  * This is called from __kmem_cache_shutdown(). We must take list_lock
5369  * because sysfs file might still access partial list after the shutdowning.
5370  */
5371 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
5372 {
5373         LIST_HEAD(discard);
5374         struct slab *slab, *h;
5375
5376         BUG_ON(irqs_disabled());
5377         spin_lock_irq(&n->list_lock);
5378         list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
5379                 if (!slab->inuse) {
5380                         remove_partial(n, slab);
5381                         list_add(&slab->slab_list, &discard);
5382                 } else {
5383                         list_slab_objects(s, slab,
5384                           "Objects remaining in %s on __kmem_cache_shutdown()");
5385                 }
5386         }
5387         spin_unlock_irq(&n->list_lock);
5388
5389         list_for_each_entry_safe(slab, h, &discard, slab_list)
5390                 discard_slab(s, slab);
5391 }
5392
5393 bool __kmem_cache_empty(struct kmem_cache *s)
5394 {
5395         int node;
5396         struct kmem_cache_node *n;
5397
5398         for_each_kmem_cache_node(s, node, n)
5399                 if (n->nr_partial || node_nr_slabs(n))
5400                         return false;
5401         return true;
5402 }
5403
5404 /*
5405  * Release all resources used by a slab cache.
5406  */
5407 int __kmem_cache_shutdown(struct kmem_cache *s)
5408 {
5409         int node;
5410         struct kmem_cache_node *n;
5411
5412         flush_all_cpus_locked(s);
5413         /* Attempt to free all objects */
5414         for_each_kmem_cache_node(s, node, n) {
5415                 free_partial(s, n);
5416                 if (n->nr_partial || node_nr_slabs(n))
5417                         return 1;
5418         }
5419         return 0;
5420 }
5421
#ifdef CONFIG_PRINTK
/*
 * Fill @kpp with what can be determined about @object, which lives in @slab.
 *
 * Always recorded: the pointer as passed in, the slab, its cache, the start
 * of the containing object slot (kp_objp) and the offset of the pointer
 * relative to the object start used for indexing (kp_data_offset).
 *
 * Only when the cache has SLAB_STORE_USER set and CONFIG_SLUB_DEBUG is
 * enabled: the recorded allocation caller (kp_ret) and, with
 * CONFIG_STACKDEPOT, up to KS_ADDRS_COUNT entries of the allocation and
 * free stacks.
 */
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
        void *base;
        int __maybe_unused i;
        unsigned int objnr;
        void *objp;
        void *objp0;
        struct kmem_cache *s = slab->slab_cache;
        struct track __maybe_unused *trackp;

        kpp->kp_ptr = object;
        kpp->kp_slab = slab;
        kpp->kp_slab_cache = s;
        base = slab_address(slab);
        /* objp0: the pointer with its KASAN tag reset. */
        objp0 = kasan_reset_tag(object);
#ifdef CONFIG_SLUB_DEBUG
        objp = restore_red_left(s, objp0);
#else
        objp = objp0;
#endif
        objnr = obj_to_index(s, slab, objp);
        kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
        /* Recompute the slot start from the index rather than the pointer. */
        objp = base + s->size * objnr;
        kpp->kp_objp = objp;
        /*
         * Warn once if the slot start is outside the slab or not on a slot
         * boundary; in that case, or when no tracking data is stored, stop.
         */
        if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
                         || (objp - base) % s->size) ||
            !(s->flags & SLAB_STORE_USER))
                return;
#ifdef CONFIG_SLUB_DEBUG
        objp = fixup_red_left(s, objp);
        trackp = get_track(s, objp, TRACK_ALLOC);
        kpp->kp_ret = (void *)trackp->addr;
#ifdef CONFIG_STACKDEPOT
        {
                depot_stack_handle_t handle;
                unsigned long *entries;
                unsigned int nr_entries;

                /* Allocation stack, if a handle was recorded. */
                handle = READ_ONCE(trackp->handle);
                if (handle) {
                        nr_entries = stack_depot_fetch(handle, &entries);
                        for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
                                kpp->kp_stack[i] = (void *)entries[i];
                }

                /* Free stack, if a handle was recorded. */
                trackp = get_track(s, objp, TRACK_FREE);
                handle = READ_ONCE(trackp->handle);
                if (handle) {
                        nr_entries = stack_depot_fetch(handle, &entries);
                        for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
                                kpp->kp_free_stack[i] = (void *)entries[i];
                }
        }
#endif
#endif
}
#endif
5480
5481 /********************************************************************
5482  *              Kmalloc subsystem
5483  *******************************************************************/
5484
5485 static int __init setup_slub_min_order(char *str)
5486 {
5487         get_option(&str, (int *)&slub_min_order);
5488
5489         if (slub_min_order > slub_max_order)
5490                 slub_max_order = slub_min_order;
5491
5492         return 1;
5493 }
5494
5495 __setup("slab_min_order=", setup_slub_min_order);
5496 __setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0);
5497
5498
/*
 * Parse the "slab_max_order=" boot parameter (also accepted as
 * "slub_max_order=") into slub_max_order, capped at MAX_PAGE_ORDER.
 * If the minimum order then exceeds the maximum, the minimum is lowered to
 * match. Always returns 1.
 */
static int __init setup_slub_max_order(char *str)
{
        get_option(&str, (int *)&slub_max_order);
        slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER);

        if (slub_min_order > slub_max_order)
                slub_min_order = slub_max_order;

        return 1;
}

__setup("slab_max_order=", setup_slub_max_order);
__setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0);
5512
/*
 * Parse the "slab_min_objects=" boot parameter (also accepted as
 * "slub_min_objects=") into slub_min_objects. The value is stored as
 * parsed, without range checking. Always returns 1.
 */
static int __init setup_slub_min_objects(char *str)
{
        get_option(&str, (int *)&slub_min_objects);

        return 1;
}

__setup("slab_min_objects=", setup_slub_min_objects);
__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0);
5522
#ifdef CONFIG_HARDENED_USERCOPY
/*
 * Rejects incorrectly sized objects and objects that are to be copied
 * to/from userspace but do not fall entirely within the containing slab
 * cache's usercopy region.
 *
 * @ptr:     start of the kernel-side range being copied
 * @n:       number of bytes being copied
 * @slab:    slab that contains @ptr
 * @to_user: direction flag, passed through to usercopy_abort()
 *
 * Has no return value: it returns normally if the check passes and calls
 * usercopy_abort() (with the cache name where known) otherwise.
 */
void __check_heap_object(const void *ptr, unsigned long n,
                         const struct slab *slab, bool to_user)
{
        struct kmem_cache *s;
        unsigned int offset;
        /* Sampled before the KASAN tag is reset below. */
        bool is_kfence = is_kfence_address(ptr);

        ptr = kasan_reset_tag(ptr);

        /* Find object and usable object size. */
        s = slab->slab_cache;

        /* Reject impossible pointers. */
        if (ptr < slab_address(slab))
                usercopy_abort("SLUB object not in SLUB page?!", NULL,
                               to_user, 0, n);

        /* Find offset within object. */
        if (is_kfence)
                offset = ptr - kfence_object_start(ptr);
        else
                offset = (ptr - slab_address(slab)) % s->size;

        /* Adjust for redzone and reject if within the redzone. */
        if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
                if (offset < s->red_left_pad)
                        usercopy_abort("SLUB object in left red zone",
                                       s->name, to_user, offset, n);
                offset -= s->red_left_pad;
        }

        /* Allow address range falling entirely within usercopy region. */
        if (offset >= s->useroffset &&
            offset - s->useroffset <= s->usersize &&
            n <= s->useroffset - offset + s->usersize)
                return;

        usercopy_abort("SLUB object", s->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
5572
/* Number of promote buckets; slabs with more free objects are left in place. */
#define SHRINK_PROMOTE_MAX 32

/*
 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
 * up most to the head of the partial lists. New allocations will then
 * fill those up and thus they can be removed from the partial lists.
 *
 * The slabs with the least items are placed last. This results in them
 * being allocated from last increasing the chance that the last objects
 * are freed in them.
 *
 * Returns 1 if any node still has slabs accounted to it after the pass,
 * 0 otherwise.
 */
static int __kmem_cache_do_shrink(struct kmem_cache *s)
{
        int node;
        int i;
        struct kmem_cache_node *n;
        struct slab *slab;
        struct slab *t;
        struct list_head discard;
        struct list_head promote[SHRINK_PROMOTE_MAX];
        unsigned long flags;
        int ret = 0;

        for_each_kmem_cache_node(s, node, n) {
                /* The scratch lists are reset for every node. */
                INIT_LIST_HEAD(&discard);
                for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
                        INIT_LIST_HEAD(promote + i);

                spin_lock_irqsave(&n->list_lock, flags);

                /*
                 * Build lists of slabs to discard or promote.
                 *
                 * Note that concurrent frees may occur while we hold the
                 * list_lock. slab->inuse here is the upper limit.
                 */
                list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
                        int free = slab->objects - slab->inuse;

                        /* Do not reread slab->inuse */
                        barrier();

                        /* We do not keep full slabs on the list */
                        BUG_ON(free <= 0);

                        if (free == slab->objects) {
                                /* Completely empty: unaccount and discard. */
                                list_move(&slab->slab_list, &discard);
                                slab_clear_node_partial(slab);
                                n->nr_partial--;
                                dec_slabs_node(s, node, slab->objects);
                        } else if (free <= SHRINK_PROMOTE_MAX)
                                /* Bucket index is the free count minus one. */
                                list_move(&slab->slab_list, promote + free - 1);
                }

                /*
                 * Promote the slabs filled up most to the head of the
                 * partial list.
                 */
                for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
                        list_splice(promote + i, &n->partial);

                spin_unlock_irqrestore(&n->list_lock, flags);

                /* Release empty slabs */
                list_for_each_entry_safe(slab, t, &discard, slab_list)
                        free_slab(s, slab);

                if (node_nr_slabs(n))
                        ret = 1;
        }

        return ret;
}
5646
/*
 * Flush the cache with flush_all() and then shrink its partial lists.
 * Returns the result of __kmem_cache_do_shrink(): 1 if slabs remain,
 * 0 otherwise.
 */
int __kmem_cache_shrink(struct kmem_cache *s)
{
        flush_all(s);
        return __kmem_cache_do_shrink(s);
}
5652
5653 static int slab_mem_going_offline_callback(void *arg)
5654 {
5655         struct kmem_cache *s;
5656
5657         mutex_lock(&slab_mutex);
5658         list_for_each_entry(s, &slab_caches, list) {
5659                 flush_all_cpus_locked(s);
5660                 __kmem_cache_do_shrink(s);
5661         }
5662         mutex_unlock(&slab_mutex);
5663
5664         return 0;
5665 }
5666
5667 static void slab_mem_offline_callback(void *arg)
5668 {
5669         struct memory_notify *marg = arg;
5670         int offline_node;
5671
5672         offline_node = marg->status_change_nid_normal;
5673
5674         /*
5675          * If the node still has available memory. we need kmem_cache_node
5676          * for it yet.
5677          */
5678         if (offline_node < 0)
5679                 return;
5680
5681         mutex_lock(&slab_mutex);
5682         node_clear(offline_node, slab_nodes);
5683         /*
5684          * We no longer free kmem_cache_node structures here, as it would be
5685          * racy with all get_node() users, and infeasible to protect them with
5686          * slab_mutex.
5687          */
5688         mutex_unlock(&slab_mutex);
5689 }
5690
5691 static int slab_mem_going_online_callback(void *arg)
5692 {
5693         struct kmem_cache_node *n;
5694         struct kmem_cache *s;
5695         struct memory_notify *marg = arg;
5696         int nid = marg->status_change_nid_normal;
5697         int ret = 0;
5698
5699         /*
5700          * If the node's memory is already available, then kmem_cache_node is
5701          * already created. Nothing to do.
5702          */
5703         if (nid < 0)
5704                 return 0;
5705
5706         /*
5707          * We are bringing a node online. No memory is available yet. We must
5708          * allocate a kmem_cache_node structure in order to bring the node
5709          * online.
5710          */
5711         mutex_lock(&slab_mutex);
5712         list_for_each_entry(s, &slab_caches, list) {
5713                 /*
5714                  * The structure may already exist if the node was previously
5715                  * onlined and offlined.
5716                  */
5717                 if (get_node(s, nid))
5718                         continue;
5719                 /*
5720                  * XXX: kmem_cache_alloc_node will fallback to other nodes
5721                  *      since memory is not yet available from the node that
5722                  *      is brought up.
5723                  */
5724                 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
5725                 if (!n) {
5726                         ret = -ENOMEM;
5727                         goto out;
5728                 }
5729                 init_kmem_cache_node(n);
5730                 s->node[nid] = n;
5731         }
5732         /*
5733          * Any cache created after this point will also have kmem_cache_node
5734          * initialized for the new node.
5735          */
5736         node_set(nid, slab_nodes);
5737 out:
5738         mutex_unlock(&slab_mutex);
5739         return ret;
5740 }
5741
5742 static int slab_memory_callback(struct notifier_block *self,
5743                                 unsigned long action, void *arg)
5744 {
5745         int ret = 0;
5746
5747         switch (action) {
5748         case MEM_GOING_ONLINE:
5749                 ret = slab_mem_going_online_callback(arg);
5750                 break;
5751         case MEM_GOING_OFFLINE:
5752                 ret = slab_mem_going_offline_callback(arg);
5753                 break;
5754         case MEM_OFFLINE:
5755         case MEM_CANCEL_ONLINE:
5756                 slab_mem_offline_callback(arg);
5757                 break;
5758         case MEM_ONLINE:
5759         case MEM_CANCEL_OFFLINE:
5760                 break;
5761         }
5762         if (ret)
5763                 ret = notifier_from_errno(ret);
5764         else
5765                 ret = NOTIFY_OK;
5766         return ret;
5767 }
5768
5769 /********************************************************************
5770  *                      Basic setup of slabs
5771  *******************************************************************/
5772
/*
 * Used for early kmem_cache structures that were allocated using
 * the page allocator. Allocate them properly then fix up the pointers
 * that may be pointing to the wrong kmem_cache structure.
 *
 * @static_cache: the early structure to be replaced.
 *
 * Copies kmem_cache->object_size bytes of @static_cache into a newly
 * allocated kmem_cache, points the slab_cache field of every slab on the
 * per-node partial lists (and, with CONFIG_SLUB_DEBUG, the full lists) at
 * the copy, and adds the copy to slab_caches. Returns the copy.
 *
 * NOTE(review): the kmem_cache_zalloc() result is passed to memcpy()
 * without a NULL check.
 */

static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
        int node;
        struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
        struct kmem_cache_node *n;

        memcpy(s, static_cache, kmem_cache->object_size);

        /*
         * This runs very early, and only the boot processor is supposed to be
         * up.  Even if it weren't true, IRQs are not up so we couldn't fire
         * IPIs around.
         */
        __flush_cpu_slab(s, smp_processor_id());
        for_each_kmem_cache_node(s, node, n) {
                struct slab *p;

                /* Existing slabs still reference the static structure. */
                list_for_each_entry(p, &n->partial, slab_list)
                        p->slab_cache = s;

#ifdef CONFIG_SLUB_DEBUG
                list_for_each_entry(p, &n->full, slab_list)
                        p->slab_cache = s;
#endif
        }
        list_add(&s->list, &slab_caches);
        return s;
}
5807
/*
 * Boot-time initialization of the allocator.
 *
 * Builds the "kmem_cache_node" and "kmem_cache" caches from two static
 * __initdata structures, replaces those with properly allocated copies via
 * bootstrap(), then creates the kmalloc caches, sets up freelist
 * randomization, registers the CPU hotplug state and prints a summary line.
 */
void __init kmem_cache_init(void)
{
        static __initdata struct kmem_cache boot_kmem_cache,
                boot_kmem_cache_node;
        int node;

        /* With a guard page minimum order set, the maximum order is 0. */
        if (debug_guardpage_minorder())
                slub_max_order = 0;

        /* Print slub debugging pointers without hashing */
        if (__slub_debug_enabled())
                no_hash_pointers_enable(NULL);

        /* Until bootstrap() runs, the globals point at the static copies. */
        kmem_cache_node = &boot_kmem_cache_node;
        kmem_cache = &boot_kmem_cache;

        /*
         * Initialize the nodemask for which we will allocate per node
         * structures. Here we don't need taking slab_mutex yet.
         */
        for_each_node_state(node, N_NORMAL_MEMORY)
                node_set(node, slab_nodes);

        create_boot_cache(kmem_cache_node, "kmem_cache_node",
                        sizeof(struct kmem_cache_node),
                        SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);

        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);

        /* Able to allocate the per node structures */
        slab_state = PARTIAL;

        /* Sized to hold the node[] pointer array for nr_node_ids nodes. */
        create_boot_cache(kmem_cache, "kmem_cache",
                        offsetof(struct kmem_cache, node) +
                                nr_node_ids * sizeof(struct kmem_cache_node *),
                        SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);

        /* Swap the static structures for allocated ones. */
        kmem_cache = bootstrap(&boot_kmem_cache);
        kmem_cache_node = bootstrap(&boot_kmem_cache_node);

        /* Now we can use the kmem_cache to allocate kmalloc slabs */
        setup_kmalloc_cache_index_table();
        create_kmalloc_caches();

        /* Setup random freelists for each cache */
        init_freelist_randomization();

        cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
                                  slub_cpu_dead);

        pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
                cache_line_size(),
                slub_min_order, slub_max_order, slub_min_objects,
                nr_cpu_ids, nr_node_ids);
}
5863
/*
 * Late initialization: unless CONFIG_SLUB_TINY is set, create the
 * "slub_flushwq" workqueue (WQ_MEM_RECLAIM) and warn if that fails.
 */
void __init kmem_cache_init_late(void)
{
#ifndef CONFIG_SLUB_TINY
        flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
        WARN_ON(!flushwq);
#endif
}
5871
5872 struct kmem_cache *
5873 __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
5874                    slab_flags_t flags, void (*ctor)(void *))
5875 {
5876         struct kmem_cache *s;
5877
5878         s = find_mergeable(size, align, flags, name, ctor);
5879         if (s) {
5880                 if (sysfs_slab_alias(s, name))
5881                         return NULL;
5882
5883                 s->refcount++;
5884
5885                 /*
5886                  * Adjust the object sizes so that we clear
5887                  * the complete object on kzalloc.
5888                  */
5889                 s->object_size = max(s->object_size, size);
5890                 s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
5891         }
5892
5893         return s;
5894 }
5895
5896 int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
5897 {
5898         int err;
5899
5900         err = kmem_cache_open(s, flags);
5901         if (err)
5902                 return err;
5903
5904         /* Mutex is not taken during early boot */
5905         if (slab_state <= UP)
5906                 return 0;
5907
5908         err = sysfs_slab_add(s);
5909         if (err) {
5910                 __kmem_cache_release(s);
5911                 return err;
5912         }
5913
5914         if (s->flags & SLAB_STORE_USER)
5915                 debugfs_slab_add(s);
5916
5917         return 0;
5918 }
5919
#ifdef SLAB_SUPPORTS_SYSFS
/* Accessor returning the slab's inuse counter. */
static int count_inuse(struct slab *slab)
{
        return slab->inuse;
}

/* Accessor returning the slab's objects counter. */
static int count_total(struct slab *slab)
{
        return slab->objects;
}
#endif
5931
5932 #ifdef CONFIG_SLUB_DEBUG
5933 static void validate_slab(struct kmem_cache *s, struct slab *slab,
5934                           unsigned long *obj_map)
5935 {
5936         void *p;
5937         void *addr = slab_address(slab);
5938
5939         if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
5940                 return;
5941
5942         /* Now we know that a valid freelist exists */
5943         __fill_map(obj_map, s, slab);
5944         for_each_object(p, s, addr, slab->objects) {
5945                 u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
5946                          SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
5947
5948                 if (!check_object(s, slab, p, val))
5949                         break;
5950         }
5951 }
5952
5953 static int validate_slab_node(struct kmem_cache *s,
5954                 struct kmem_cache_node *n, unsigned long *obj_map)
5955 {
5956         unsigned long count = 0;
5957         struct slab *slab;
5958         unsigned long flags;
5959
5960         spin_lock_irqsave(&n->list_lock, flags);
5961
5962         list_for_each_entry(slab, &n->partial, slab_list) {
5963                 validate_slab(s, slab, obj_map);
5964                 count++;
5965         }
5966         if (count != n->nr_partial) {
5967                 pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
5968                        s->name, count, n->nr_partial);
5969                 slab_add_kunit_errors();
5970         }
5971
5972         if (!(s->flags & SLAB_STORE_USER))
5973                 goto out;
5974
5975         list_for_each_entry(slab, &n->full, slab_list) {
5976                 validate_slab(s, slab, obj_map);
5977                 count++;
5978         }
5979         if (count != node_nr_slabs(n)) {
5980                 pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
5981                        s->name, count, node_nr_slabs(n));
5982                 slab_add_kunit_errors();
5983         }
5984
5985 out:
5986         spin_unlock_irqrestore(&n->list_lock, flags);
5987         return count;
5988 }
5989
5990 long validate_slab_cache(struct kmem_cache *s)
5991 {
5992         int node;
5993         unsigned long count = 0;
5994         struct kmem_cache_node *n;
5995         unsigned long *obj_map;
5996
5997         obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
5998         if (!obj_map)
5999                 return -ENOMEM;
6000
6001         flush_all(s);
6002         for_each_kmem_cache_node(s, node, n)
6003                 count += validate_slab_node(s, n, obj_map);
6004
6005         bitmap_free(obj_map);
6006
6007         return count;
6008 }
6009 EXPORT_SYMBOL(validate_slab_cache);
6010
6011 #ifdef CONFIG_DEBUG_FS
6012 /*
6013  * Generate lists of code addresses where slabcache objects are allocated
6014  * and freed.
6015  */
6016
/*
 * One aggregated entry of a loc_track table: all tracked events that
 * share the same caller address, stack handle and waste value (see
 * add_location()).
 */
struct location {
        depot_stack_handle_t handle;    /* stack handle copied from the track record */
        unsigned long count;            /* number of events merged into this entry */
        unsigned long addr;             /* track->addr shared by those events */
        unsigned long waste;            /* s->object_size minus the original size */
        long long sum_time;             /* sum of event ages (jiffies - track->when) */
        long min_time;                  /* smallest age seen */
        long max_time;                  /* largest age seen */
        long min_pid;                   /* smallest track->pid seen */
        long max_pid;                   /* largest track->pid seen */
        DECLARE_BITMAP(cpus, NR_CPUS);  /* CPUs recorded in track->cpu */
        nodemask_t nodes;               /* nodes of the pages holding the track records */
};
6030
/* Growable table of struct location entries, kept sorted by add_location(). */
struct loc_track {
        unsigned long max;      /* capacity of loc[], in entries */
        unsigned long count;    /* entries of loc[] currently in use */
        struct location *loc;   /* page-allocated array, see alloc_loc_track() */
        loff_t idx;             /* NOTE(review): not used in this part of the file; presumably an iteration position */
};
6037
/* NOTE(review): presumably the root dentry of the slab debugfs tree; it is assigned outside this part of the file. */
static struct dentry *slab_debugfs_root;
6039
6040 static void free_loc_track(struct loc_track *t)
6041 {
6042         if (t->max)
6043                 free_pages((unsigned long)t->loc,
6044                         get_order(sizeof(struct location) * t->max));
6045 }
6046
6047 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
6048 {
6049         struct location *l;
6050         int order;
6051
6052         order = get_order(sizeof(struct location) * max);
6053
6054         l = (void *)__get_free_pages(flags, order);
6055         if (!l)
6056                 return 0;
6057
6058         if (t->count) {
6059                 memcpy(l, t->loc, sizeof(struct location) * t->count);
6060                 free_loc_track(t);
6061         }
6062         t->max = max;
6063         t->loc = l;
6064         return 1;
6065 }
6066
/*
 * Account one tracked event in the location table @t.
 *
 * Entries of t->loc are kept ordered by (addr, handle, waste). A binary
 * search looks for an entry that matches @track's address, its stack
 * handle and the waste derived from @orig_size. On a match the entry's
 * statistics are updated; otherwise a new entry is inserted at the
 * position where the search ended, growing the table with GFP_ATOMIC
 * when it is full.
 *
 * Returns 1 on success, 0 if the table had to grow and that allocation
 * failed.
 */
static int add_location(struct loc_track *t, struct kmem_cache *s,
                                const struct track *track,
                                unsigned int orig_size)
{
        long start, end, pos;
        struct location *l;
        unsigned long caddr, chandle, cwaste;
        /* Age of the event relative to now, in jiffies */
        unsigned long age = jiffies - track->when;
        depot_stack_handle_t handle = 0;
        unsigned int waste = s->object_size - orig_size;

#ifdef CONFIG_STACKDEPOT
        handle = READ_ONCE(track->handle);
#endif
        /* Search the open interval (start, end); both bounds are exclusive */
        start = -1;
        end = t->count;

        for ( ; ; ) {
                pos = start + (end - start + 1) / 2;

                /*
                 * There is nothing at "end". If we end up there
                 * we need to add something to before end.
                 */
                if (pos == end)
                        break;

                l = &t->loc[pos];
                caddr = l->addr;
                chandle = l->handle;
                cwaste = l->waste;
                if ((track->addr == caddr) && (handle == chandle) &&
                        (waste == cwaste)) {

                        /* Exact match: fold this event into the entry */
                        l->count++;
                        /* Time, pid and cpu stats are only updated when a timestamp exists */
                        if (track->when) {
                                l->sum_time += age;
                                if (age < l->min_time)
                                        l->min_time = age;
                                if (age > l->max_time)
                                        l->max_time = age;

                                if (track->pid < l->min_pid)
                                        l->min_pid = track->pid;
                                if (track->pid > l->max_pid)
                                        l->max_pid = track->pid;

                                cpumask_set_cpu(track->cpu,
                                                to_cpumask(l->cpus));
                        }
                        /* Node of the page that holds the track record itself */
                        node_set(page_to_nid(virt_to_page(track)), l->nodes);
                        return 1;
                }

                /* Narrow the interval using the (addr, handle, waste) ordering */
                if (track->addr < caddr)
                        end = pos;
                else if (track->addr == caddr && handle < chandle)
                        end = pos;
                else if (track->addr == caddr && handle == chandle &&
                                waste < cwaste)
                        end = pos;
                else
                        start = pos;
        }

        /*
         * Not found. Insert new tracking element.
         */
        if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
                return 0;

        /* t->loc may have been reallocated above, so recompute the slot */
        l = t->loc + pos;
        if (pos < t->count)
                memmove(l + 1, l,
                        (t->count - pos) * sizeof(struct location));
        t->count++;
        l->count = 1;
        l->addr = track->addr;
        l->sum_time = age;
        l->min_time = age;
        l->max_time = age;
        l->min_pid = track->pid;
        l->max_pid = track->pid;
        l->handle = handle;
        l->waste = waste;
        cpumask_clear(to_cpumask(l->cpus));
        cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
        nodes_clear(l->nodes);
        node_set(page_to_nid(virt_to_page(track)), l->nodes);
        return 1;
}
6158
6159 static void process_slab(struct loc_track *t, struct kmem_cache *s,
6160                 struct slab *slab, enum track_item alloc,
6161                 unsigned long *obj_map)
6162 {
6163         void *addr = slab_address(slab);
6164         bool is_alloc = (alloc == TRACK_ALLOC);
6165         void *p;
6166
6167         __fill_map(obj_map, s, slab);
6168
6169         for_each_object(p, s, addr, slab->objects)
6170                 if (!test_bit(__obj_to_index(s, addr, p), obj_map))
6171                         add_location(t, s, get_track(s, p, alloc),
6172                                      is_alloc ? get_orig_size(s, p) :
6173                                                 s->object_size);
6174 }
6175 #endif  /* CONFIG_DEBUG_FS   */
6176 #endif  /* CONFIG_SLUB_DEBUG */
6177
6178 #ifdef SLAB_SUPPORTS_SYSFS
/*
 * Bit numbers selecting what show_slab_objects() counts. The SO_* masks
 * below are the corresponding single-bit flags and may be OR-ed.
 */
enum slab_stat_type {
        SL_ALL,                 /* All slabs */
        SL_PARTIAL,             /* Only partially allocated slabs */
        SL_CPU,                 /* Only slabs used for cpu caches */
        SL_OBJECTS,             /* Determine allocated objects not slabs */
        SL_TOTAL                /* Determine object capacity not slabs */
};

#define SO_ALL          (1 << SL_ALL)
#define SO_PARTIAL      (1 << SL_PARTIAL)
#define SO_CPU          (1 << SL_CPU)
#define SO_OBJECTS      (1 << SL_OBJECTS)
#define SO_TOTAL        (1 << SL_TOTAL)
6192
/*
 * Format a slab or object count of cache @s into the sysfs buffer @buf.
 *
 * @flags is a mask of SO_* bits: SO_CPU adds the per-cpu slabs; SO_ALL
 * (only with CONFIG_SLUB_DEBUG) or else SO_PARTIAL adds per-node counts.
 * SO_TOTAL / SO_OBJECTS switch the unit from slabs to object capacity /
 * objects in use.
 *
 * The output is the total, followed under CONFIG_NUMA by
 * " N<node>=<count>" for every node with a non-zero count, then a
 * newline.
 *
 * Returns the number of bytes emitted, or -ENOMEM if the per-node
 * scratch array cannot be allocated.
 */
static ssize_t show_slab_objects(struct kmem_cache *s,
                                 char *buf, unsigned long flags)
{
        unsigned long total = 0;
        int node;
        int x;
        unsigned long *nodes;
        int len = 0;

        /* Zeroed per-node accumulator, indexed by node id */
        nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
        if (!nodes)
                return -ENOMEM;

        if (flags & SO_CPU) {
                int cpu;

                for_each_possible_cpu(cpu) {
                        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
                                                               cpu);
                        int node;
                        struct slab *slab;

                        /*
                         * A CPU without an active slab is skipped
                         * entirely, including its per-cpu partial list
                         * below.
                         */
                        slab = READ_ONCE(c->slab);
                        if (!slab)
                                continue;

                        node = slab_nid(slab);
                        if (flags & SO_TOTAL)
                                x = slab->objects;
                        else if (flags & SO_OBJECTS)
                                x = slab->inuse;
                        else
                                x = 1;

                        total += x;
                        nodes[node] += x;

#ifdef CONFIG_SLUB_CPU_PARTIAL
                        slab = slub_percpu_partial_read_once(c);
                        if (slab) {
                                node = slab_nid(slab);
                                /*
                                 * Object units are not supported for the
                                 * per-cpu partial list: those cases only
                                 * warn, and x keeps the value computed
                                 * for c->slab above.
                                 */
                                if (flags & SO_TOTAL)
                                        WARN_ON_ONCE(1);
                                else if (flags & SO_OBJECTS)
                                        WARN_ON_ONCE(1);
                                else
                                        x = data_race(slab->slabs);
                                total += x;
                                nodes[node] += x;
                        }
#endif
                }
        }

        /*
         * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
         * already held which will conflict with an existing lock order:
         *
         * mem_hotplug_lock->slab_mutex->kernfs_mutex
         *
         * We don't really need mem_hotplug_lock (to hold off
         * slab_mem_going_offline_callback) here because slab's memory hot
         * unplug code doesn't destroy the kmem_cache->node[] data.
         */

#ifdef CONFIG_SLUB_DEBUG
        if (flags & SO_ALL) {
                struct kmem_cache_node *n;

                for_each_kmem_cache_node(s, node, n) {

                        if (flags & SO_TOTAL)
                                x = node_nr_objs(n);
                        else if (flags & SO_OBJECTS)
                                x = node_nr_objs(n) - count_partial(n, count_free);
                        else
                                x = node_nr_slabs(n);
                        total += x;
                        nodes[node] += x;
                }

        } else
#endif
        /* Only reached when SO_ALL was not handled above */
        if (flags & SO_PARTIAL) {
                struct kmem_cache_node *n;

                for_each_kmem_cache_node(s, node, n) {
                        if (flags & SO_TOTAL)
                                x = count_partial(n, count_total);
                        else if (flags & SO_OBJECTS)
                                x = count_partial(n, count_inuse);
                        else
                                x = n->nr_partial;
                        total += x;
                        nodes[node] += x;
                }
        }

        len += sysfs_emit_at(buf, len, "%lu", total);
#ifdef CONFIG_NUMA
        for (node = 0; node < nr_node_ids; node++) {
                if (nodes[node])
                        len += sysfs_emit_at(buf, len, " N%d=%lu",
                                             node, nodes[node]);
        }
#endif
        len += sysfs_emit_at(buf, len, "\n");
        kfree(nodes);

        return len;
}
6304
/* Map an embedded struct attribute / kobject back to its container. */
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)

/*
 * A sysfs attribute of a slab cache. The callbacks receive the
 * kmem_cache directly; slab_attr_show()/slab_attr_store() return -EIO
 * when the corresponding callback is NULL.
 */
struct slab_attribute {
        struct attribute attr;
        ssize_t (*show)(struct kmem_cache *s, char *buf);
        ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
};

/* Define <_name>_attr as a read-only attribute with mode 0400. */
#define SLAB_ATTR_RO(_name) \
        static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)

/* Define <_name>_attr as a read-write attribute with mode 0600. */
#define SLAB_ATTR(_name) \
        static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
6319
6320 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
6321 {
6322         return sysfs_emit(buf, "%u\n", s->size);
6323 }
6324 SLAB_ATTR_RO(slab_size);
6325
6326 static ssize_t align_show(struct kmem_cache *s, char *buf)
6327 {
6328         return sysfs_emit(buf, "%u\n", s->align);
6329 }
6330 SLAB_ATTR_RO(align);
6331
6332 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
6333 {
6334         return sysfs_emit(buf, "%u\n", s->object_size);
6335 }
6336 SLAB_ATTR_RO(object_size);
6337
6338 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
6339 {
6340         return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
6341 }
6342 SLAB_ATTR_RO(objs_per_slab);
6343
6344 static ssize_t order_show(struct kmem_cache *s, char *buf)
6345 {
6346         return sysfs_emit(buf, "%u\n", oo_order(s->oo));
6347 }
6348 SLAB_ATTR_RO(order);
6349
6350 static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
6351 {
6352         return sysfs_emit(buf, "%lu\n", s->min_partial);
6353 }
6354
6355 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
6356                                  size_t length)
6357 {
6358         unsigned long min;
6359         int err;
6360
6361         err = kstrtoul(buf, 10, &min);
6362         if (err)
6363                 return err;
6364
6365         s->min_partial = min;
6366         return length;
6367 }
6368 SLAB_ATTR(min_partial);
6369
6370 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
6371 {
6372         unsigned int nr_partial = 0;
6373 #ifdef CONFIG_SLUB_CPU_PARTIAL
6374         nr_partial = s->cpu_partial;
6375 #endif
6376
6377         return sysfs_emit(buf, "%u\n", nr_partial);
6378 }
6379
6380 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
6381                                  size_t length)
6382 {
6383         unsigned int objects;
6384         int err;
6385
6386         err = kstrtouint(buf, 10, &objects);
6387         if (err)
6388                 return err;
6389         if (objects && !kmem_cache_has_cpu_partial(s))
6390                 return -EINVAL;
6391
6392         slub_set_cpu_partial(s, objects);
6393         flush_all(s);
6394         return length;
6395 }
6396 SLAB_ATTR(cpu_partial);
6397
6398 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
6399 {
6400         if (!s->ctor)
6401                 return 0;
6402         return sysfs_emit(buf, "%pS\n", s->ctor);
6403 }
6404 SLAB_ATTR_RO(ctor);
6405
6406 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
6407 {
6408         return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
6409 }
6410 SLAB_ATTR_RO(aliases);
6411
6412 static ssize_t partial_show(struct kmem_cache *s, char *buf)
6413 {
6414         return show_slab_objects(s, buf, SO_PARTIAL);
6415 }
6416 SLAB_ATTR_RO(partial);
6417
6418 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
6419 {
6420         return show_slab_objects(s, buf, SO_CPU);
6421 }
6422 SLAB_ATTR_RO(cpu_slabs);
6423
6424 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
6425 {
6426         return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
6427 }
6428 SLAB_ATTR_RO(objects_partial);
6429
6430 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
6431 {
6432         int objects = 0;
6433         int slabs = 0;
6434         int cpu __maybe_unused;
6435         int len = 0;
6436
6437 #ifdef CONFIG_SLUB_CPU_PARTIAL
6438         for_each_online_cpu(cpu) {
6439                 struct slab *slab;
6440
6441                 slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
6442
6443                 if (slab)
6444                         slabs += data_race(slab->slabs);
6445         }
6446 #endif
6447
6448         /* Approximate half-full slabs, see slub_set_cpu_partial() */
6449         objects = (slabs * oo_objects(s->oo)) / 2;
6450         len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
6451
6452 #ifdef CONFIG_SLUB_CPU_PARTIAL
6453         for_each_online_cpu(cpu) {
6454                 struct slab *slab;
6455
6456                 slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
6457                 if (slab) {
6458                         slabs = data_race(slab->slabs);
6459                         objects = (slabs * oo_objects(s->oo)) / 2;
6460                         len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
6461                                              cpu, objects, slabs);
6462                 }
6463         }
6464 #endif
6465         len += sysfs_emit_at(buf, len, "\n");
6466
6467         return len;
6468 }
6469 SLAB_ATTR_RO(slabs_cpu_partial);
6470
6471 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
6472 {
6473         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
6474 }
6475 SLAB_ATTR_RO(reclaim_account);
6476
6477 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
6478 {
6479         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
6480 }
6481 SLAB_ATTR_RO(hwcache_align);
6482
6483 #ifdef CONFIG_ZONE_DMA
6484 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
6485 {
6486         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
6487 }
6488 SLAB_ATTR_RO(cache_dma);
6489 #endif
6490
6491 #ifdef CONFIG_HARDENED_USERCOPY
6492 static ssize_t usersize_show(struct kmem_cache *s, char *buf)
6493 {
6494         return sysfs_emit(buf, "%u\n", s->usersize);
6495 }
6496 SLAB_ATTR_RO(usersize);
6497 #endif
6498
6499 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
6500 {
6501         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
6502 }
6503 SLAB_ATTR_RO(destroy_by_rcu);
6504
6505 #ifdef CONFIG_SLUB_DEBUG
6506 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
6507 {
6508         return show_slab_objects(s, buf, SO_ALL);
6509 }
6510 SLAB_ATTR_RO(slabs);
6511
6512 static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
6513 {
6514         return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
6515 }
6516 SLAB_ATTR_RO(total_objects);
6517
6518 static ssize_t objects_show(struct kmem_cache *s, char *buf)
6519 {
6520         return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
6521 }
6522 SLAB_ATTR_RO(objects);
6523
6524 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
6525 {
6526         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
6527 }
6528 SLAB_ATTR_RO(sanity_checks);
6529
6530 static ssize_t trace_show(struct kmem_cache *s, char *buf)
6531 {
6532         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
6533 }
6534 SLAB_ATTR_RO(trace);
6535
6536 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
6537 {
6538         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
6539 }
6540
6541 SLAB_ATTR_RO(red_zone);
6542
6543 static ssize_t poison_show(struct kmem_cache *s, char *buf)
6544 {
6545         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
6546 }
6547
6548 SLAB_ATTR_RO(poison);
6549
6550 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
6551 {
6552         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
6553 }
6554
6555 SLAB_ATTR_RO(store_user);
6556
6557 static ssize_t validate_show(struct kmem_cache *s, char *buf)
6558 {
6559         return 0;
6560 }
6561
6562 static ssize_t validate_store(struct kmem_cache *s,
6563                         const char *buf, size_t length)
6564 {
6565         int ret = -EINVAL;
6566
6567         if (buf[0] == '1' && kmem_cache_debug(s)) {
6568                 ret = validate_slab_cache(s);
6569                 if (ret >= 0)
6570                         ret = length;
6571         }
6572         return ret;
6573 }
6574 SLAB_ATTR(validate);
6575
6576 #endif /* CONFIG_SLUB_DEBUG */
6577
6578 #ifdef CONFIG_FAILSLAB
6579 static ssize_t failslab_show(struct kmem_cache *s, char *buf)
6580 {
6581         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
6582 }
6583
6584 static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
6585                                 size_t length)
6586 {
6587         if (s->refcount > 1)
6588                 return -EINVAL;
6589
6590         if (buf[0] == '1')
6591                 WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
6592         else
6593                 WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);
6594
6595         return length;
6596 }
6597 SLAB_ATTR(failslab);
6598 #endif
6599
6600 static ssize_t shrink_show(struct kmem_cache *s, char *buf)
6601 {
6602         return 0;
6603 }
6604
6605 static ssize_t shrink_store(struct kmem_cache *s,
6606                         const char *buf, size_t length)
6607 {
6608         if (buf[0] == '1')
6609                 kmem_cache_shrink(s);
6610         else
6611                 return -EINVAL;
6612         return length;
6613 }
6614 SLAB_ATTR(shrink);
6615
6616 #ifdef CONFIG_NUMA
6617 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
6618 {
6619         return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
6620 }
6621
6622 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
6623                                 const char *buf, size_t length)
6624 {
6625         unsigned int ratio;
6626         int err;
6627
6628         err = kstrtouint(buf, 10, &ratio);
6629         if (err)
6630                 return err;
6631         if (ratio > 100)
6632                 return -ERANGE;
6633
6634         s->remote_node_defrag_ratio = ratio * 10;
6635
6636         return length;
6637 }
6638 SLAB_ATTR(remote_node_defrag_ratio);
6639 #endif
6640
6641 #ifdef CONFIG_SLUB_STATS
6642 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
6643 {
6644         unsigned long sum  = 0;
6645         int cpu;
6646         int len = 0;
6647         int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
6648
6649         if (!data)
6650                 return -ENOMEM;
6651
6652         for_each_online_cpu(cpu) {
6653                 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
6654
6655                 data[cpu] = x;
6656                 sum += x;
6657         }
6658
6659         len += sysfs_emit_at(buf, len, "%lu", sum);
6660
6661 #ifdef CONFIG_SMP
6662         for_each_online_cpu(cpu) {
6663                 if (data[cpu])
6664                         len += sysfs_emit_at(buf, len, " C%d=%u",
6665                                              cpu, data[cpu]);
6666         }
6667 #endif
6668         kfree(data);
6669         len += sysfs_emit_at(buf, len, "\n");
6670
6671         return len;
6672 }
6673
6674 static void clear_stat(struct kmem_cache *s, enum stat_item si)
6675 {
6676         int cpu;
6677
6678         for_each_online_cpu(cpu)
6679                 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
6680 }
6681
/*
 * Generate the read-write sysfs attribute <text> for statistic <si>:
 * reading reports the counter through show_stat(), writing a string
 * that starts with '0' clears it through clear_stat(), and any other
 * write is rejected with -EINVAL.
 */
#define STAT_ATTR(si, text)                                     \
static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
{                                                               \
        return show_stat(s, buf, si);                           \
}                                                               \
static ssize_t text##_store(struct kmem_cache *s,               \
                                const char *buf, size_t length) \
{                                                               \
        if (buf[0] != '0')                                      \
                return -EINVAL;                                 \
        clear_stat(s, si);                                      \
        return length;                                          \
}                                                               \
SLAB_ATTR(text);                                                \

/* One attribute per statistics counter exposed in sysfs */
STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
STAT_ATTR(ALLOC_REFILL, alloc_refill);
STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
STAT_ATTR(FREE_SLAB, free_slab);
STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
#endif  /* CONFIG_SLUB_STATS */
6723 #endif  /* CONFIG_SLUB_STATS */
6724
6725 #ifdef CONFIG_KFENCE
6726 static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
6727 {
6728         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
6729 }
6730
6731 static ssize_t skip_kfence_store(struct kmem_cache *s,
6732                         const char *buf, size_t length)
6733 {
6734         int ret = length;
6735
6736         if (buf[0] == '0')
6737                 s->flags &= ~SLAB_SKIP_KFENCE;
6738         else if (buf[0] == '1')
6739                 s->flags |= SLAB_SKIP_KFENCE;
6740         else
6741                 ret = -EINVAL;
6742
6743         return ret;
6744 }
6745 SLAB_ATTR(skip_kfence);
6746 #endif
6747
/*
 * NULL-terminated list of the attributes in slab_attr_group. Entries
 * guarded by #ifdef exist only when the matching config option (and
 * therefore the attribute definition) is enabled.
 */
static struct attribute *slab_attrs[] = {
        &slab_size_attr.attr,
        &object_size_attr.attr,
        &objs_per_slab_attr.attr,
        &order_attr.attr,
        &min_partial_attr.attr,
        &cpu_partial_attr.attr,
        &objects_partial_attr.attr,
        &partial_attr.attr,
        &cpu_slabs_attr.attr,
        &ctor_attr.attr,
        &aliases_attr.attr,
        &align_attr.attr,
        &hwcache_align_attr.attr,
        &reclaim_account_attr.attr,
        &destroy_by_rcu_attr.attr,
        &shrink_attr.attr,
        &slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
        &total_objects_attr.attr,
        &objects_attr.attr,
        &slabs_attr.attr,
        &sanity_checks_attr.attr,
        &trace_attr.attr,
        &red_zone_attr.attr,
        &poison_attr.attr,
        &store_user_attr.attr,
        &validate_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
        &cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
        &remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
        &alloc_fastpath_attr.attr,
        &alloc_slowpath_attr.attr,
        &free_fastpath_attr.attr,
        &free_slowpath_attr.attr,
        &free_frozen_attr.attr,
        &free_add_partial_attr.attr,
        &free_remove_partial_attr.attr,
        &alloc_from_partial_attr.attr,
        &alloc_slab_attr.attr,
        &alloc_refill_attr.attr,
        &alloc_node_mismatch_attr.attr,
        &free_slab_attr.attr,
        &cpuslab_flush_attr.attr,
        &deactivate_full_attr.attr,
        &deactivate_empty_attr.attr,
        &deactivate_to_head_attr.attr,
        &deactivate_to_tail_attr.attr,
        &deactivate_remote_frees_attr.attr,
        &deactivate_bypass_attr.attr,
        &order_fallback_attr.attr,
        &cmpxchg_double_fail_attr.attr,
        &cmpxchg_double_cpu_fail_attr.attr,
        &cpu_partial_alloc_attr.attr,
        &cpu_partial_free_attr.attr,
        &cpu_partial_node_attr.attr,
        &cpu_partial_drain_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
        &failslab_attr.attr,
#endif
#ifdef CONFIG_HARDENED_USERCOPY
        &usersize_attr.attr,
#endif
#ifdef CONFIG_KFENCE
        &skip_kfence_attr.attr,
#endif

        NULL
};
6823
/* Unnamed attribute group that carries every entry of slab_attrs[]. */
static const struct attribute_group slab_attr_group = {
        .attrs = slab_attrs,
};
6827
6828 static ssize_t slab_attr_show(struct kobject *kobj,
6829                                 struct attribute *attr,
6830                                 char *buf)
6831 {
6832         struct slab_attribute *attribute;
6833         struct kmem_cache *s;
6834
6835         attribute = to_slab_attr(attr);
6836         s = to_slab(kobj);
6837
6838         if (!attribute->show)
6839                 return -EIO;
6840
6841         return attribute->show(s, buf);
6842 }
6843
6844 static ssize_t slab_attr_store(struct kobject *kobj,
6845                                 struct attribute *attr,
6846                                 const char *buf, size_t len)
6847 {
6848         struct slab_attribute *attribute;
6849         struct kmem_cache *s;
6850
6851         attribute = to_slab_attr(attr);
6852         s = to_slab(kobj);
6853
6854         if (!attribute->store)
6855                 return -EIO;
6856
6857         return attribute->store(s, buf, len);
6858 }
6859
/*
 * kobject release callback (wired up through slab_ktype): hands the
 * kmem_cache that embeds @k to slab_kmem_cache_release().
 */
static void kmem_cache_release(struct kobject *k)
{
        struct kmem_cache *cache = to_slab(k);

        slab_kmem_cache_release(cache);
}
6864
6865 static const struct sysfs_ops slab_sysfs_ops = {
6866         .show = slab_attr_show,
6867         .store = slab_attr_store,
6868 };
6869
6870 static const struct kobj_type slab_ktype = {
6871         .sysfs_ops = &slab_sysfs_ops,
6872         .release = kmem_cache_release,
6873 };
6874
/* The "slab" kset; assigned in slab_sysfs_init(). */
static struct kset *slab_kset;

/*
 * Return the kset that the kobject of cache @s is placed in.
 * @s is not consulted: every cache uses the single global slab_kset.
 */
static inline struct kset *cache_kset(struct kmem_cache *s)
{
        return slab_kset;
}
6881
6882 #define ID_STR_LENGTH 32
6883
6884 /* Create a unique string id for a slab cache:
6885  *
6886  * Format       :[flags-]size
6887  */
6888 static char *create_unique_id(struct kmem_cache *s)
6889 {
6890         char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
6891         char *p = name;
6892
6893         if (!name)
6894                 return ERR_PTR(-ENOMEM);
6895
6896         *p++ = ':';
6897         /*
6898          * First flags affecting slabcache operations. We will only
6899          * get here for aliasable slabs so we do not need to support
6900          * too many flags. The flags here must cover all flags that
6901          * are matched during merging to guarantee that the id is
6902          * unique.
6903          */
6904         if (s->flags & SLAB_CACHE_DMA)
6905                 *p++ = 'd';
6906         if (s->flags & SLAB_CACHE_DMA32)
6907                 *p++ = 'D';
6908         if (s->flags & SLAB_RECLAIM_ACCOUNT)
6909                 *p++ = 'a';
6910         if (s->flags & SLAB_CONSISTENCY_CHECKS)
6911                 *p++ = 'F';
6912         if (s->flags & SLAB_ACCOUNT)
6913                 *p++ = 'A';
6914         if (p != name + 1)
6915                 *p++ = '-';
6916         p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);
6917
6918         if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
6919                 kfree(name);
6920                 return ERR_PTR(-EINVAL);
6921         }
6922         kmsan_unpoison_memory(name, p - name);
6923         return name;
6924 }
6925
6926 static int sysfs_slab_add(struct kmem_cache *s)
6927 {
6928         int err;
6929         const char *name;
6930         struct kset *kset = cache_kset(s);
6931         int unmergeable = slab_unmergeable(s);
6932
6933         if (!unmergeable && disable_higher_order_debug &&
6934                         (slub_debug & DEBUG_METADATA_FLAGS))
6935                 unmergeable = 1;
6936
6937         if (unmergeable) {
6938                 /*
6939                  * Slabcache can never be merged so we can use the name proper.
6940                  * This is typically the case for debug situations. In that
6941                  * case we can catch duplicate names easily.
6942                  */
6943                 sysfs_remove_link(&slab_kset->kobj, s->name);
6944                 name = s->name;
6945         } else {
6946                 /*
6947                  * Create a unique name for the slab as a target
6948                  * for the symlinks.
6949                  */
6950                 name = create_unique_id(s);
6951                 if (IS_ERR(name))
6952                         return PTR_ERR(name);
6953         }
6954
6955         s->kobj.kset = kset;
6956         err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
6957         if (err)
6958                 goto out;
6959
6960         err = sysfs_create_group(&s->kobj, &slab_attr_group);
6961         if (err)
6962                 goto out_del_kobj;
6963
6964         if (!unmergeable) {
6965                 /* Setup first alias */
6966                 sysfs_slab_alias(s, s->name);
6967         }
6968 out:
6969         if (!unmergeable)
6970                 kfree(name);
6971         return err;
6972 out_del_kobj:
6973         kobject_del(&s->kobj);
6974         goto out;
6975 }
6976
6977 void sysfs_slab_unlink(struct kmem_cache *s)
6978 {
6979         kobject_del(&s->kobj);
6980 }
6981
6982 void sysfs_slab_release(struct kmem_cache *s)
6983 {
6984         kobject_put(&s->kobj);
6985 }
6986
/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 *
 * Entries are pushed by sysfs_slab_alias() and consumed (and freed) by
 * slab_sysfs_init().
 */
struct saved_alias {
        struct kmem_cache *s;           /* cache the alias points to */
        const char *name;               /* alias name; pointer is stored, not copied */
        struct saved_alias *next;       /* next buffered alias, NULL at the end */
};

/* Head of the singly linked list of buffered aliases. */
static struct saved_alias *alias_list;
6998
6999 static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
7000 {
7001         struct saved_alias *al;
7002
7003         if (slab_state == FULL) {
7004                 /*
7005                  * If we have a leftover link then remove it.
7006                  */
7007                 sysfs_remove_link(&slab_kset->kobj, name);
7008                 return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
7009         }
7010
7011         al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
7012         if (!al)
7013                 return -ENOMEM;
7014
7015         al->s = s;
7016         al->name = name;
7017         al->next = alias_list;
7018         alias_list = al;
7019         kmsan_unpoison_memory(al, sizeof(*al));
7020         return 0;
7021 }
7022
7023 static int __init slab_sysfs_init(void)
7024 {
7025         struct kmem_cache *s;
7026         int err;
7027
7028         mutex_lock(&slab_mutex);
7029
7030         slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
7031         if (!slab_kset) {
7032                 mutex_unlock(&slab_mutex);
7033                 pr_err("Cannot register slab subsystem.\n");
7034                 return -ENOMEM;
7035         }
7036
7037         slab_state = FULL;
7038
7039         list_for_each_entry(s, &slab_caches, list) {
7040                 err = sysfs_slab_add(s);
7041                 if (err)
7042                         pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
7043                                s->name);
7044         }
7045
7046         while (alias_list) {
7047                 struct saved_alias *al = alias_list;
7048
7049                 alias_list = alias_list->next;
7050                 err = sysfs_slab_alias(al->s, al->name);
7051                 if (err)
7052                         pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
7053                                al->name);
7054                 kfree(al);
7055         }
7056
7057         mutex_unlock(&slab_mutex);
7058         return 0;
7059 }
7060 late_initcall(slab_sysfs_init);
7061 #endif /* SLAB_SUPPORTS_SYSFS */
7062
7063 #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
7064 static int slab_debugfs_show(struct seq_file *seq, void *v)
7065 {
7066         struct loc_track *t = seq->private;
7067         struct location *l;
7068         unsigned long idx;
7069
7070         idx = (unsigned long) t->idx;
7071         if (idx < t->count) {
7072                 l = &t->loc[idx];
7073
7074                 seq_printf(seq, "%7ld ", l->count);
7075
7076                 if (l->addr)
7077                         seq_printf(seq, "%pS", (void *)l->addr);
7078                 else
7079                         seq_puts(seq, "<not-available>");
7080
7081                 if (l->waste)
7082                         seq_printf(seq, " waste=%lu/%lu",
7083                                 l->count * l->waste, l->waste);
7084
7085                 if (l->sum_time != l->min_time) {
7086                         seq_printf(seq, " age=%ld/%llu/%ld",
7087                                 l->min_time, div_u64(l->sum_time, l->count),
7088                                 l->max_time);
7089                 } else
7090                         seq_printf(seq, " age=%ld", l->min_time);
7091
7092                 if (l->min_pid != l->max_pid)
7093                         seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
7094                 else
7095                         seq_printf(seq, " pid=%ld",
7096                                 l->min_pid);
7097
7098                 if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
7099                         seq_printf(seq, " cpus=%*pbl",
7100                                  cpumask_pr_args(to_cpumask(l->cpus)));
7101
7102                 if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
7103                         seq_printf(seq, " nodes=%*pbl",
7104                                  nodemask_pr_args(&l->nodes));
7105
7106 #ifdef CONFIG_STACKDEPOT
7107                 {
7108                         depot_stack_handle_t handle;
7109                         unsigned long *entries;
7110                         unsigned int nr_entries, j;
7111
7112                         handle = READ_ONCE(l->handle);
7113                         if (handle) {
7114                                 nr_entries = stack_depot_fetch(handle, &entries);
7115                                 seq_puts(seq, "\n");
7116                                 for (j = 0; j < nr_entries; j++)
7117                                         seq_printf(seq, "        %pS\n", (void *)entries[j]);
7118                         }
7119                 }
7120 #endif
7121                 seq_puts(seq, "\n");
7122         }
7123
7124         if (!idx && !t->count)
7125                 seq_puts(seq, "No data\n");
7126
7127         return 0;
7128 }
7129
/* seq_file ->stop(): intentionally empty, the iterator holds nothing to release. */
static void slab_debugfs_stop(struct seq_file *seq, void *v)
{
}
7133
7134 static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
7135 {
7136         struct loc_track *t = seq->private;
7137
7138         t->idx = ++(*ppos);
7139         if (*ppos <= t->count)
7140                 return ppos;
7141
7142         return NULL;
7143 }
7144
7145 static int cmp_loc_by_count(const void *a, const void *b, const void *data)
7146 {
7147         struct location *loc1 = (struct location *)a;
7148         struct location *loc2 = (struct location *)b;
7149
7150         if (loc1->count > loc2->count)
7151                 return -1;
7152         else
7153                 return 1;
7154 }
7155
7156 static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
7157 {
7158         struct loc_track *t = seq->private;
7159
7160         t->idx = *ppos;
7161         return ppos;
7162 }
7163
7164 static const struct seq_operations slab_debugfs_sops = {
7165         .start  = slab_debugfs_start,
7166         .next   = slab_debugfs_next,
7167         .stop   = slab_debugfs_stop,
7168         .show   = slab_debugfs_show,
7169 };
7170
7171 static int slab_debug_trace_open(struct inode *inode, struct file *filep)
7172 {
7173
7174         struct kmem_cache_node *n;
7175         enum track_item alloc;
7176         int node;
7177         struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
7178                                                 sizeof(struct loc_track));
7179         struct kmem_cache *s = file_inode(filep)->i_private;
7180         unsigned long *obj_map;
7181
7182         if (!t)
7183                 return -ENOMEM;
7184
7185         obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
7186         if (!obj_map) {
7187                 seq_release_private(inode, filep);
7188                 return -ENOMEM;
7189         }
7190
7191         if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
7192                 alloc = TRACK_ALLOC;
7193         else
7194                 alloc = TRACK_FREE;
7195
7196         if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
7197                 bitmap_free(obj_map);
7198                 seq_release_private(inode, filep);
7199                 return -ENOMEM;
7200         }
7201
7202         for_each_kmem_cache_node(s, node, n) {
7203                 unsigned long flags;
7204                 struct slab *slab;
7205
7206                 if (!node_nr_slabs(n))
7207                         continue;
7208
7209                 spin_lock_irqsave(&n->list_lock, flags);
7210                 list_for_each_entry(slab, &n->partial, slab_list)
7211                         process_slab(t, s, slab, alloc, obj_map);
7212                 list_for_each_entry(slab, &n->full, slab_list)
7213                         process_slab(t, s, slab, alloc, obj_map);
7214                 spin_unlock_irqrestore(&n->list_lock, flags);
7215         }
7216
7217         /* Sort locations by count */
7218         sort_r(t->loc, t->count, sizeof(struct location),
7219                 cmp_loc_by_count, NULL, NULL);
7220
7221         bitmap_free(obj_map);
7222         return 0;
7223 }
7224
7225 static int slab_debug_trace_release(struct inode *inode, struct file *file)
7226 {
7227         struct seq_file *seq = file->private_data;
7228         struct loc_track *t = seq->private;
7229
7230         free_loc_track(t);
7231         return seq_release_private(inode, file);
7232 }
7233
7234 static const struct file_operations slab_debugfs_fops = {
7235         .open    = slab_debug_trace_open,
7236         .read    = seq_read,
7237         .llseek  = seq_lseek,
7238         .release = slab_debug_trace_release,
7239 };
7240
7241 static void debugfs_slab_add(struct kmem_cache *s)
7242 {
7243         struct dentry *slab_cache_dir;
7244
7245         if (unlikely(!slab_debugfs_root))
7246                 return;
7247
7248         slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
7249
7250         debugfs_create_file("alloc_traces", 0400,
7251                 slab_cache_dir, s, &slab_debugfs_fops);
7252
7253         debugfs_create_file("free_traces", 0400,
7254                 slab_cache_dir, s, &slab_debugfs_fops);
7255 }
7256
7257 void debugfs_slab_release(struct kmem_cache *s)
7258 {
7259         debugfs_lookup_and_remove(s->name, slab_debugfs_root);
7260 }
7261
7262 static int __init slab_debugfs_init(void)
7263 {
7264         struct kmem_cache *s;
7265
7266         slab_debugfs_root = debugfs_create_dir("slab", NULL);
7267
7268         list_for_each_entry(s, &slab_caches, list)
7269                 if (s->flags & SLAB_STORE_USER)
7270                         debugfs_slab_add(s);
7271
7272         return 0;
7273
7274 }
7275 __initcall(slab_debugfs_init);
7276 #endif
7277 /*
7278  * The /proc/slabinfo ABI
7279  */
7280 #ifdef CONFIG_SLUB_DEBUG
7281 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
7282 {
7283         unsigned long nr_slabs = 0;
7284         unsigned long nr_objs = 0;
7285         unsigned long nr_free = 0;
7286         int node;
7287         struct kmem_cache_node *n;
7288
7289         for_each_kmem_cache_node(s, node, n) {
7290                 nr_slabs += node_nr_slabs(n);
7291                 nr_objs += node_nr_objs(n);
7292                 nr_free += count_partial_free_approx(n);
7293         }
7294
7295         sinfo->active_objs = nr_objs - nr_free;
7296         sinfo->num_objs = nr_objs;
7297         sinfo->active_slabs = nr_slabs;
7298         sinfo->num_slabs = nr_slabs;
7299         sinfo->objects_per_slab = oo_objects(s->oo);
7300         sinfo->cache_order = oo_order(s->oo);
7301 }
7302 #endif /* CONFIG_SLUB_DEBUG */