diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2008-11-12 15:11:18 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2008-11-12 15:11:18 +1100 |
commit | 208aa3cf91e54e34473796e018e071edab86d70d (patch) | |
tree | b680a618b4102342163be9b767b7f2ff5268f8c7 | |
parent | 59b465a30f17460eb3809a6845bd7c290bd894df (diff) | |
parent | 00035957b2863e8d4bbb3c2b5f632c2a4fb5a0a1 (diff) |
Merge commit 'slab/for-next'
36 files changed, 1955 insertions, 192 deletions
diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace new file mode 100644 index 000000000000..5e6a92a02d85 --- /dev/null +++ b/Documentation/ABI/testing/debugfs-kmemtrace @@ -0,0 +1,71 @@ +What: /sys/kernel/debug/kmemtrace/ +Date: July 2008 +Contact: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> +Description: + +In kmemtrace-enabled kernels, the following files are created: + +/sys/kernel/debug/kmemtrace/ + cpu<n> (0400) Per-CPU tracing data, see below. (binary) + total_overruns (0400) Total number of bytes which were dropped from + cpu<n> files because of full buffer condition, + non-binary. (text) + abi_version (0400) Kernel's kmemtrace ABI version. (text) + +Each per-CPU file should be read according to the relay interface. That is, +the reader should set affinity to that specific CPU and, as currently done by +the userspace application (though there are other methods), use poll() with +an infinite timeout before every read(). Otherwise, erroneous data may be +read. The binary data has the following _core_ format: + + Event ID (1 byte) Unsigned integer, one of: + 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC) + 1 - represents a freeing of previously allocated memory + (KMEMTRACE_EVENT_FREE) + Type ID (1 byte) Unsigned integer, one of: + 0 - this is a kmalloc() / kfree() + 1 - this is a kmem_cache_alloc() / kmem_cache_free() + 2 - this is a __get_free_pages() et al. + Event size (2 bytes) Unsigned integer representing the + size of this event. Used to extend + kmemtrace. Discard the bytes you + don't know about. + Sequence number (4 bytes) Signed integer used to reorder data + logged on SMP machines. Wraparound + must be taken into account, although + it is unlikely. + Caller address (8 bytes) Return address to the caller. + Pointer to mem (8 bytes) Pointer to target memory area. Can be + NULL, but not all such calls might be + recorded. + +In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow: + + Requested bytes (8 bytes) Total number of requested bytes, + unsigned, must not be zero. + Allocated bytes (8 bytes) Total number of actually allocated + bytes, unsigned, must not be lower + than requested bytes. + Requested flags (4 bytes) GFP flags supplied by the caller. + Target CPU (4 bytes) Signed integer, valid for event id 1. + If equal to -1, target CPU is the same + as origin CPU, but the reverse might + not be true. + +The data is made available in the same endianness the machine has. + +Other event ids and type ids may be defined and added. Other fields may be +added by increasing event size, but see below for details. +Every modification to the ABI, including new id definitions, are followed +by bumping the ABI version by one. + +Adding new data to the packet (features) is done at the end of the mandatory +data: + Feature size (2 byte) + Feature ID (1 byte) + Feature data (Feature size - 3 bytes) + + +Users: + kmemtrace-user - git://repo.or.cz/kmemtrace-user.git + diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1c98d8371476..b92cec9805fb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -49,6 +49,7 @@ parameter is applicable: ISAPNP ISA PnP code is enabled. ISDN Appropriate ISDN support is enabled. JOY Appropriate joystick support is enabled. + KMEMTRACE kmemtrace is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. LOOP Loopback device support is enabled. @@ -1030,6 +1031,15 @@ and is between 256 and 4096 characters. It is defined in the file use the HighMem zone if it exists, and the Normal zone if it does not. + kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } + Controls whether kmemtrace is enabled + at boot-time. + + kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of + subbufs kmemtrace's relay channel has. Set this + higher than default (KMEMTRACE_N_SUBBUFS in code) if + you experience buffer overruns. + movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter is similar to kernelcore except it specifies the amount of memory used for migratable allocations. diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index d79eeda7a699..5e7329a1abcc 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -38,6 +38,7 @@ Currently, these files are in /proc/sys/vm: - numa_zonelist_order - nr_hugepages - nr_overcommit_hugepages +- slab_defrag_limit ============================================================== @@ -347,3 +348,14 @@ Change the maximum size of the hugepage pool. The maximum is nr_hugepages + nr_overcommit_hugepages. See Documentation/vm/hugetlbpage.txt + +============================================================== + +slab_defrag_limit + +Determines the frequency of calls from reclaim into slab defragmentation. +Slab defrag reclaims objects from sparsely populates slab pages. +The default is 1000. Increase if slab defragmentation occurs +too frequently. Decrease if more slab defragmentation passes +are needed. The slabinfo tool can report on the frequency of the callbacks. + diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/vm/kmemtrace.txt new file mode 100644 index 000000000000..a956d9b7f943 --- /dev/null +++ b/Documentation/vm/kmemtrace.txt @@ -0,0 +1,126 @@ + kmemtrace - Kernel Memory Tracer + + by Eduard - Gabriel Munteanu + <eduard.munteanu@linux360.ro> + +I. Introduction +=============== + +kmemtrace helps kernel developers figure out two things: +1) how different allocators (SLAB, SLUB etc.) perform +2) how kernel code allocates memory and how much + +To do this, we trace every allocation and export information to the userspace +through the relay interface. We export things such as the number of requested +bytes, the number of bytes actually allocated (i.e. including internal +fragmentation), whether this is a slab allocation or a plain kmalloc() and so +on. + +The actual analysis is performed by a userspace tool (see section III for +details on where to get it from). It logs the data exported by the kernel, +processes it and (as of writing this) can provide the following information: +- the total amount of memory allocated and fragmentation per call-site +- the amount of memory allocated and fragmentation per allocation +- total memory allocated and fragmentation in the collected dataset +- number of cross-CPU allocation and frees (makes sense in NUMA environments) + +Moreover, it can potentially find inconsistent and erroneous behavior in +kernel code, such as using slab free functions on kmalloc'ed memory or +allocating less memory than requested (but not truly failed allocations). + +kmemtrace also makes provisions for tracing on some arch and analysing the +data on another. + +II. Design and goals +==================== + +kmemtrace was designed to handle rather large amounts of data. Thus, it uses +the relay interface to export whatever is logged to userspace, which then +stores it. Analysis and reporting is done asynchronously, that is, after the +data is collected and stored. By design, it allows one to log and analyse +on different machines and different arches. + +As of writing this, the ABI is not considered stable, though it might not +change much. However, no guarantees are made about compatibility yet. When +deemed stable, the ABI should still allow easy extension while maintaining +backward compatibility. This is described further in Documentation/ABI. + +Summary of design goals: + - allow logging and analysis to be done across different machines + - be fast and anticipate usage in high-load environments (*) + - be reasonably extensible + - make it possible for GNU/Linux distributions to have kmemtrace + included in their repositories + +(*) - one of the reasons Pekka Enberg's original userspace data analysis + tool's code was rewritten from Perl to C (although this is more than a + simple conversion) + + +III. Quick usage guide +====================== + +1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable +CONFIG_KMEMTRACE). + +2) Get the userspace tool and build it: +$ git-clone git://repo.or.cz/kmemtrace-user.git # current repository +$ cd kmemtrace-user/ +$ ./autogen.sh +$ ./configure +$ make + +3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the +'single' runlevel (so that relay buffers don't fill up easily), and run +kmemtrace: +# '$' does not mean user, but root here. +$ mount -t debugfs none /sys/kernel/debug +$ mount -t proc none /proc +$ cd path/to/kmemtrace-user/ +$ ./kmemtraced +Wait a bit, then stop it with CTRL+C. +$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't + # overrun, should + # be zero. +$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to + check its correctness] +$ ./kmemtrace-report + +Now you should have a nice and short summary of how the allocator performs. + +IV. FAQ and known issues +======================== + +Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix +this? Should I worry? +A: If it's non-zero, this affects kmemtrace's accuracy, depending on how +large the number is. You can fix it by supplying a higher +'kmemtrace.subbufs=N' kernel parameter. +--- + +Q: kmemtrace_check reports errors, how do I fix this? Should I worry? +A: This is a bug and should be reported. It can occur for a variety of +reasons: + - possible bugs in relay code + - possible misuse of relay by kmemtrace + - timestamps being collected unorderly +Or you may fix it yourself and send us a patch. +--- + +Q: kmemtrace_report shows many errors, how do I fix this? Should I worry? +A: This is a known issue and I'm working on it. These might be true errors +in kernel code, which may have inconsistent behavior (e.g. allocating memory +with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed +out this behavior may work with SLAB, but may fail with other allocators. + +It may also be due to lack of tracing in some unusual allocator functions. + +We don't want bug reports regarding this issue yet. +--- + +V. See also +=========== + +Documentation/kernel-parameters.txt +Documentation/ABI/testing/debugfs-kmemtrace + diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index df3227605d59..75b56e8fc651 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c @@ -31,6 +31,8 @@ struct slabinfo { int hwcache_align, object_size, objs_per_slab; int sanity_checks, slab_size, store_user, trace; int order, poison, reclaim_account, red_zone; + int defrag, ctor; + int defrag_ratio, remote_node_defrag_ratio; unsigned long partial, objects, slabs, objects_partial, objects_total; unsigned long alloc_fastpath, alloc_slowpath; unsigned long free_fastpath, free_slowpath; @@ -39,6 +41,9 @@ struct slabinfo { unsigned long cpuslab_flush, deactivate_full, deactivate_empty; unsigned long deactivate_to_head, deactivate_to_tail; unsigned long deactivate_remote_frees, order_fallback; + unsigned long shrink_calls, shrink_attempt_defrag, shrink_empty_slab; + unsigned long shrink_slab_skipped, shrink_slab_reclaimed; + unsigned long shrink_object_reclaim_failed; int numa[MAX_NODES]; int numa_partial[MAX_NODES]; } slabinfo[MAX_SLABS]; @@ -64,6 +69,8 @@ int show_slab = 0; int skip_zero = 1; int show_numa = 0; int show_track = 0; +int show_defrag = 0; +int show_ctor = 0; int show_first_alias = 0; int validate = 0; int shrink = 0; @@ -75,6 +82,7 @@ int sort_active = 0; int set_debug = 0; int show_ops = 0; int show_activity = 0; +int show_defragcount = 0; /* Debug options */ int sanity = 0; @@ -100,20 +108,23 @@ void fatal(const char *x, ...) void usage(void) { printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n" - "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" + "slabinfo [-aCdDefFhnpvtsz] [-d debugopts] [slab-regexp]\n" "-a|--aliases Show aliases\n" "-A|--activity Most active slabs first\n" "-d<options>|--debug=<options> Set/Clear Debug options\n" + "-C|--ctor Show slabs with ctors\n" "-D|--display-active Switch line format to activity\n" "-e|--empty Show empty slabs\n" "-f|--first-alias Show first alias\n" + "-F|--defrag Show defragmentable caches\n" + "-G|--display-defrag Display defrag counters\n" "-h|--help Show usage information\n" "-i|--inverted Inverted list\n" "-l|--slabs Show slabs\n" "-n|--numa Show NUMA information\n" - "-o|--ops Show kmem_cache_ops\n" + "-o|--ops Show kmem_cache_ops\n" "-s|--shrink Shrink slabs\n" - "-r|--report Detailed report on single slabs\n" + "-r|--report Detailed report on single slabs\n" "-S|--Size Sort by size\n" "-t|--tracking Show alloc/free information\n" "-T|--Totals Show summary information\n" @@ -294,9 +305,11 @@ void first_line(void) { if (show_activity) printf("Name Objects Alloc Free %%Fast Fallb O\n"); + else if (show_defragcount) + printf("Name Objects DefragRQ Slabs Success Empty Skipped Failed\n"); else printf("Name Objects Objsize Space " - "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n"); + "Slabs/Part/Cpu O/S O %%Ra %%Ef Flg\n"); } /* @@ -345,7 +358,7 @@ void slab_numa(struct slabinfo *s, int mode) return; if (!line) { - printf("\n%-21s:", mode ? "NUMA nodes" : "Slab"); + printf("\n%-21s: Rto ", mode ? "NUMA nodes" : "Slab"); for(node = 0; node <= highest_node; node++) printf(" %4d", node); printf("\n----------------------"); @@ -354,6 +367,7 @@ void slab_numa(struct slabinfo *s, int mode) printf("\n"); } printf("%-21s ", mode ? "All slabs" : s->name); + printf("%3d ", s->remote_node_defrag_ratio); for(node = 0; node <= highest_node; node++) { char b[20]; @@ -459,22 +473,28 @@ void slab_stats(struct slabinfo *s) printf("Total %8lu %8lu\n\n", total_alloc, total_free); - if (s->cpuslab_flush) - printf("Flushes %8lu\n", s->cpuslab_flush); - - if (s->alloc_refill) - printf("Refill %8lu\n", s->alloc_refill); + if (s->cpuslab_flush || s->alloc_refill) + printf("CPU Slab : Flushes=%lu Refills=%lu\n", + s->cpuslab_flush, s->alloc_refill); total = s->deactivate_full + s->deactivate_empty + s->deactivate_to_head + s->deactivate_to_tail; if (total) - printf("Deactivate Full=%lu(%lu%%) Empty=%lu(%lu%%) " + printf("Deactivate: Full=%lu(%lu%%) Empty=%lu(%lu%%) " "ToHead=%lu(%lu%%) ToTail=%lu(%lu%%)\n", s->deactivate_full, (s->deactivate_full * 100) / total, s->deactivate_empty, (s->deactivate_empty * 100) / total, s->deactivate_to_head, (s->deactivate_to_head * 100) / total, s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total); + + if (s->shrink_calls) + printf("Shrink : Calls=%lu Attempts=%lu Empty=%lu Successful=%lu\n", + s->shrink_calls, s->shrink_attempt_defrag, + s->shrink_empty_slab, s->shrink_slab_reclaimed); + if (s->shrink_slab_skipped || s->shrink_object_reclaim_failed) + printf("Defrag : Slabs skipped=%lu Object reclaim failed=%lu\n", + s->shrink_slab_skipped, s->shrink_object_reclaim_failed); } void report(struct slabinfo *s) @@ -492,6 +512,8 @@ void report(struct slabinfo *s) printf("** Slabs are destroyed via RCU\n"); if (s->reclaim_account) printf("** Reclaim accounting active\n"); + if (s->defrag) + printf("** Defragmentation at %d%%\n", s->defrag_ratio); printf("\nSizes (bytes) Slabs Debug Memory\n"); printf("------------------------------------------------------------------------\n"); @@ -539,6 +561,12 @@ void slabcache(struct slabinfo *s) if (show_empty && s->slabs) return; + if (show_defrag && !s->defrag) + return; + + if (show_ctor && !s->ctor) + return; + store_size(size_str, slab_size(s)); snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, s->partial, s->cpu_slabs); @@ -550,6 +578,10 @@ void slabcache(struct slabinfo *s) *p++ = '*'; if (s->cache_dma) *p++ = 'd'; + if (s->defrag) + *p++ = 'F'; + if (s->ctor) + *p++ = 'C'; if (s->hwcache_align) *p++ = 'A'; if (s->poison) @@ -579,12 +611,18 @@ void slabcache(struct slabinfo *s) total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0, total_free ? (s->free_fastpath * 100 / total_free) : 0, s->order_fallback, s->order); - } + } else + if (show_defragcount) + printf("%-21s %8ld %7lu %7lu %7lu %7lu %7lu %7lu\n", + s->name, s->objects, s->shrink_calls, s->shrink_attempt_defrag, + s->shrink_slab_reclaimed, s->shrink_empty_slab, + s->shrink_slab_skipped, s->shrink_object_reclaim_failed); else printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n", s->name, s->objects, s->object_size, size_str, dist_str, s->objs_per_slab, s->order, - s->slabs ? (s->partial * 100) / s->slabs : 100, + s->slabs ? (s->partial * 100) / + (s->slabs * s->objs_per_slab) : 100, s->slabs ? (s->objects * s->object_size * 100) / (s->slabs * (page_size << s->order)) : 100, flags); @@ -1190,7 +1228,24 @@ void read_slab_dir(void) slab->deactivate_to_tail = get_obj("deactivate_to_tail"); slab->deactivate_remote_frees = get_obj("deactivate_remote_frees"); slab->order_fallback = get_obj("order_fallback"); + slab->shrink_calls = get_obj("shrink_calls"); + slab->shrink_attempt_defrag = get_obj("shrink_attempt_defrag"); + slab->shrink_empty_slab = get_obj("shrink_empty_slab"); + slab->shrink_slab_skipped = get_obj("shrink_slab_skipped"); + slab->shrink_slab_reclaimed = get_obj("shrink_slab_reclaimed"); + slab->shrink_object_reclaim_failed = + get_obj("shrink_object_reclaim_failed"); + slab->defrag_ratio = get_obj("defrag_ratio"); + slab->remote_node_defrag_ratio = + get_obj("remote_node_defrag_ratio"); chdir(".."); + if (read_slab_obj(slab, "ops")) { + if (strstr(buffer, "ctor :")) + slab->ctor = 1; + if (strstr(buffer, "kick :")) + slab->defrag = 1; + } + if (slab->name[0] == ':') alias_targets++; slab++; @@ -1241,10 +1296,13 @@ void output_slabs(void) struct option opts[] = { { "aliases", 0, NULL, 'a' }, { "activity", 0, NULL, 'A' }, + { "ctor", 0, NULL, 'C' }, { "debug", 2, NULL, 'd' }, { "display-activity", 0, NULL, 'D' }, + { "display-defrag", 0, NULL, 'G' }, { "empty", 0, NULL, 'e' }, { "first-alias", 0, NULL, 'f' }, + { "defrag", 0, NULL, 'F' }, { "help", 0, NULL, 'h' }, { "inverted", 0, NULL, 'i'}, { "numa", 0, NULL, 'n' }, @@ -1267,7 +1325,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); - while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS", + while ((c = getopt_long(argc, argv, "aACd::DefFGhil1noprstvzTS", opts, NULL)) != -1) switch (c) { case '1': @@ -1293,6 +1351,9 @@ int main(int argc, char *argv[]) case 'f': show_first_alias = 1; break; + case 'G': + show_defragcount = 1; + break; case 'h': usage(); return 0; @@ -1323,6 +1384,12 @@ int main(int argc, char *argv[]) case 'z': skip_zero = 0; break; + case 'C': + show_ctor = 1; + break; + case 'F': + show_defrag = 1; + break; case 'T': show_totals = 1; break; diff --git a/MAINTAINERS b/MAINTAINERS index 569e6000c7e2..2bd484e1581c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2560,6 +2560,12 @@ M: jason.wessel@windriver.com L: kgdb-bugreport@lists.sourceforge.net S: Maintained +KMEMTRACE +P: Eduard - Gabriel Munteanu +M: eduard.munteanu@linux360.ro +L: linux-kernel@vger.kernel.org +S: Maintained + KPROBES P: Ananth N Mavinakayanahalli M: ananth@in.ibm.com diff --git a/fs/buffer.c b/fs/buffer.c index 6569fda5cfed..706817a3173d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3318,6 +3318,107 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); +/* + * Writeback a page to clean the dirty state + */ +static void trigger_write(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int rc; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + .range_start = 0, + .range_end = LLONG_MAX, + .nonblocking = 1, + .for_reclaim = 0 + }; + + if (PageWriteback(page)) + goto unlock; + + if (!mapping->a_ops->writepage) + /* No write method for the address space */ + goto unlock; + + if (!clear_page_dirty_for_io(page)) + /* Someone else already triggered a write */ + goto unlock; + + rc = mapping->a_ops->writepage(page, &wbc); + if (rc < 0) + /* I/O Error writing */ + return; + + if (rc == AOP_WRITEPAGE_ACTIVATE) +unlock: unlock_page(page); +} + +/* + * Get references on buffers. + * + * We obtain references on the page that uses the buffer. v[i] will point to + * the corresponding page after get_buffers() is through. + * + * We are safe from the underlying page being removed simply by doing + * a get_page_unless_zero. The buffer head removal may race at will. + * try_to_free_buffes will later take appropriate locks to remove the + * buffers if they are still there. + */ +static void *get_buffers(struct kmem_cache *s, int nr, void **v) +{ + struct page *page; + struct buffer_head *bh; + int i, j; + int n = 0; + + for (i = 0; i < nr; i++) { + bh = v[i]; + v[i] = NULL; + + page = bh->b_page; + + if (page && PagePrivate(page)) { + for (j = 0; j < n; j++) + if (page == v[j]) + continue; + } + + if (get_page_unless_zero(page)) + v[n++] = page; + } + return NULL; +} + +/* + * Despite its name: kick_buffers operates on a list of pointers to + * page structs that was set up by get_buffer(). + */ +static void kick_buffers(struct kmem_cache *s, int nr, void **v, + void *private) +{ + struct page *page; + int i; + + for (i = 0; i < nr; i++) { + page = v[i]; + + if (!page) + continue; + + if (trylock_page(page)) { + if (PageDirty(page)) + trigger_write(page); + else { + if (PagePrivate(page)) + try_to_free_buffers(page); + unlock_page(page); + } + } + put_page(page); + } +} + static void init_buffer_head(void *data) { @@ -3336,6 +3437,7 @@ void __init buffer_init(void) (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_MEM_SPREAD), init_buffer_head); + kmem_cache_setup_defrag(bh_cachep, get_buffers, kick_buffers); /* * Limit the bh occupancy to 10% of ZONE_NORMAL diff --git a/fs/dcache.c b/fs/dcache.c index a1d86c7f3e66..bb653fa5a1a4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include <linux/seqlock.h> #include <linux/swap.h> #include <linux/bootmem.h> +#include <linux/backing-dev.h> #include "internal.h" @@ -142,15 +143,6 @@ static void dentry_lru_add_tail(struct dentry *dentry) static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { - list_del(&dentry->d_lru); - dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; - } -} - -static void dentry_lru_del_init(struct dentry *dentry) -{ - if (likely(!list_empty(&dentry->d_lru))) { list_del_init(&dentry->d_lru); dentry->d_sb->s_nr_dentry_unused--; dentry_stat.nr_unused--; @@ -173,7 +165,10 @@ static struct dentry *d_kill(struct dentry *dentry) list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ - /*drops the locks, at that point nobody can reach this dentry */ + /* + * drops the locks, at that point nobody (aside from defrag) + * can reach this dentry + */ dentry_iput(dentry); if (IS_ROOT(dentry)) parent = NULL; @@ -320,7 +315,7 @@ int d_invalidate(struct dentry * dentry) static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); return dentry; } @@ -436,7 +431,7 @@ static void prune_one_dentry(struct dentry * dentry) if (dentry->d_op && dentry->d_op->d_delete) dentry->d_op->d_delete(dentry); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); __d_drop(dentry); dentry = d_kill(dentry); spin_lock(&dcache_lock); @@ -496,7 +491,7 @@ restart: } while (!list_empty(&tmp)) { dentry = list_entry(tmp.prev, struct dentry, d_lru); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); spin_lock(&dentry->d_lock); /* * We found an inuse dentry which was not removed from @@ -625,7 +620,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* detach this root from the system */ spin_lock(&dcache_lock); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); __d_drop(dentry); spin_unlock(&dcache_lock); @@ -639,7 +634,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) spin_lock(&dcache_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { - dentry_lru_del_init(loop); + dentry_lru_del(loop); __d_drop(loop); cond_resched_lock(&dcache_lock); } @@ -822,7 +817,7 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); /* * move only zero ref count dentries to the end * of the unused list for prune_dcache @@ -904,6 +899,16 @@ static struct shrinker dcache_shrinker = { .seeks = DEFAULT_SEEKS, }; +static void dcache_ctor(void *p) +{ + struct dentry *dentry = p; + + spin_lock_init(&dentry->d_lock); + dentry->d_inode = NULL; + INIT_LIST_HEAD(&dentry->d_lru); + INIT_LIST_HEAD(&dentry->d_alias); +} + /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate @@ -941,8 +946,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) atomic_set(&dentry->d_count, 1); dentry->d_flags = DCACHE_UNHASHED; - spin_lock_init(&dentry->d_lock); - dentry->d_inode = NULL; dentry->d_parent = NULL; dentry->d_sb = NULL; dentry->d_op = NULL; @@ -952,9 +955,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_cookie = NULL; #endif INIT_HLIST_NODE(&dentry->d_hash); - INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); - INIT_LIST_HEAD(&dentry->d_alias); if (parent) { dentry->d_parent = dget(parent); @@ -2278,19 +2279,110 @@ static void __init dcache_init_early(void) INIT_HLIST_HEAD(&dentry_hashtable[loop]); } +/* + * The slab allocator is holding off frees. We can safely examine + * the object without the danger of it vanishing from under us. + */ +static void *get_dentries(struct kmem_cache *s, int nr, void **v) +{ + struct dentry *dentry; + int i; + + spin_lock(&dcache_lock); + for (i = 0; i < nr; i++) { + dentry = v[i]; + + /* + * Three sorts of dentries cannot be reclaimed: + * + * 1. dentries that are in the process of being allocated + * or being freed. In that case the dentry is neither + * on the LRU nor hashed. + * + * 2. Fake hashed entries as used for anonymous dentries + * and pipe I/O. The fake hashed entries have d_flags + * set to indicate a hashed entry. However, the + * d_hash field indicates that the entry is not hashed. + * + * 3. dentries that have a backing store that is not + * writable. This is true for tmpsfs and other in + * memory filesystems. Removing dentries from them + * would loose dentries for good. + */ + if ((d_unhashed(dentry) && list_empty(&dentry->d_lru)) || + (!d_unhashed(dentry) && hlist_unhashed(&dentry->d_hash)) || + (dentry->d_inode && + !mapping_cap_writeback_dirty(dentry->d_inode->i_mapping))) + /* Ignore this dentry */ + v[i] = NULL; + else + /* dget_locked will remove the dentry from the LRU */ + dget_locked(dentry); + } + spin_unlock(&dcache_lock); + return NULL; +} + +/* + * Slab has dropped all the locks. Get rid of the refcount obtained + * earlier and also free the object. + */ +static void kick_dentries(struct kmem_cache *s, + int nr, void **v, void *private) +{ + struct dentry *dentry; + int i; + + /* + * First invalidate the dentries without holding the dcache lock + */ + for (i = 0; i < nr; i++) { + dentry = v[i]; + + if (dentry) + d_invalidate(dentry); + } + + /* + * If we are the last one holding a reference then the dentries can + * be freed. We need the dcache_lock. + */ + spin_lock(&dcache_lock); + for (i = 0; i < nr; i++) { + dentry = v[i]; + if (!dentry) + continue; + + spin_lock(&dentry->d_lock); + if (atomic_read(&dentry->d_count) > 1) { + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + dput(dentry); + spin_lock(&dcache_lock); + continue; + } + + prune_one_dentry(dentry); + } + spin_unlock(&dcache_lock); + + /* + * dentries are freed using RCU so we need to wait until RCU + * operations are complete. + */ + synchronize_rcu(); +} + static void __init dcache_init(void) { int loop; - /* - * A constructor could be added for stable state like the lists, - * but it is probably not worth it because of the cache nature - * of the dcache. - */ - dentry_cache = KMEM_CACHE(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); - + dentry_cache = kmem_cache_create("dentry_cache", sizeof(struct dentry), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD, + dcache_ctor); + register_shrinker(&dcache_shrinker); + kmem_cache_setup_defrag(dentry_cache, get_dentries, kick_dentries); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 3e5637fc3779..55f68b421547 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -58,7 +58,7 @@ static void drop_slab(void) int nr_objects; do { - nr_objects = shrink_slab(1000, GFP_KERNEL, 1000); + nr_objects = shrink_slab(1000, GFP_KERNEL, 1000, NULL); } while (nr_objects > 10); } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 647cd888ac87..cf26ff18a795 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -171,6 +171,12 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } +static void *ext2_get_inodes(struct kmem_cache *s, int nr, void **v) +{ + return fs_get_inodes(s, nr, v, + offsetof(struct ext2_inode_info, vfs_inode)); +} + static int init_inodecache(void) { ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", @@ -180,6 +186,9 @@ static int init_inodecache(void) init_once); if (ext2_inode_cachep == NULL) return -ENOMEM; + + kmem_cache_setup_defrag(ext2_inode_cachep, + ext2_get_inodes, kick_inodes); return 0; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 89667d2ebf1d..e5dc7b062d62 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -489,6 +489,12 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } +static void *ext3_get_inodes(struct kmem_cache *s, int nr, void **v) +{ + return fs_get_inodes(s, nr, v, + offsetof(struct ext3_inode_info, vfs_inode)); +} + static int init_inodecache(void) { ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", @@ -498,6 +504,8 @@ static int init_inodecache(void) init_once); if (ext3_inode_cachep == NULL) return -ENOMEM; + kmem_cache_setup_defrag(ext3_inode_cachep, + ext3_get_inodes, kick_inodes); return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7b3ba424f87a..fb91b242f829 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -556,6 +556,12 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } +static void *ext4_get_inodes(struct kmem_cache *s, int nr, void **v) +{ + return fs_get_inodes(s, nr, v, + offsetof(struct ext4_inode_info, vfs_inode)); +} + static int init_inodecache(void) { ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", @@ -565,6 +571,8 @@ static int init_inodecache(void) init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; + kmem_cache_setup_defrag(ext4_inode_cachep, + ext4_get_inodes, kick_inodes); return 0; } diff --git a/fs/inode.c b/fs/inode.c index 098a2443196f..aab7feb17ad6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1407,6 +1407,128 @@ static int __init set_ihash_entries(char *str) __setup("ihash_entries=", set_ihash_entries); /* + * Obtain a refcount on a list of struct inodes pointed to by v. If the + * inode is in the process of being freed then zap the v[] entry so that + * we skip the freeing attempts later. + * + * This is a generic function for the ->get slab defrag callback. + */ +void *get_inodes(struct kmem_cache *s, int nr, void **v) +{ + int i; + + spin_lock(&inode_lock); + for (i = 0; i < nr; i++) { + struct inode *inode = v[i]; + + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) + v[i] = NULL; + else + __iget(inode); + } + spin_unlock(&inode_lock); + return NULL; +} +EXPORT_SYMBOL(get_inodes); + +/* + * Function for filesystems that embedd struct inode into their own + * fs inode. The offset is the offset of the struct inode in the fs inode. + * + * The function adds to the pointers in v[] in order to make them point to + * struct inode. Then get_inodes() is used to get the refcount. + * The converted v[] pointers can then also be passed to the kick() callback + * without further processing. + */ +void *fs_get_inodes(struct kmem_cache *s, int nr, void **v, + unsigned long offset) +{ + int i; + + for (i = 0; i < nr; i++) + v[i] += offset; + + return get_inodes(s, nr, v); +} +EXPORT_SYMBOL(fs_get_inodes); + +/* + * Generic callback function slab defrag ->kick methods. Takes the + * array with inodes where we obtained refcounts using fs_get_inodes() + * or get_inodes() and tries to free them. + */ +void kick_inodes(struct kmem_cache *s, int nr, void **v, void *private) +{ + struct inode *inode; + int i; + int abort = 0; + LIST_HEAD(freeable); + int active; + + for (i = 0; i < nr; i++) { + inode = v[i]; + if (!inode) + continue; + + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + if (remove_inode_buffers(inode)) + /* + * Should we really be doing this? Or + * limit the writeback here to only a few pages? + * + * Possibly an expensive operation but we + * cannot reclaim the inode if the pages + * are still present. + */ + invalidate_mapping_pages(&inode->i_data, + 0, -1); + } + + /* Invalidate children and dentry */ + if (S_ISDIR(inode->i_mode)) { + struct dentry *d = d_find_alias(inode); + + if (d) { + d_invalidate(d); + dput(d); + } + } + + if (inode->i_state & I_DIRTY) + write_inode_now(inode, 1); + + d_prune_aliases(inode); + } + + mutex_lock(&iprune_mutex); + for (i = 0; i < nr; i++) { + inode = v[i]; + + if (!inode) + /* inode is alrady being freed */ + continue; + + active = inode->i_sb->s_flags & MS_ACTIVE; + iput(inode); + if (abort || !active) + continue; + + spin_lock(&inode_lock); + abort = !can_unuse(inode); + + if (!abort) { + list_move(&inode->i_list, &freeable); + inode->i_state |= I_FREEING; + inodes_stat.nr_unused--; + } + spin_unlock(&inode_lock); + } + dispose_list(&freeable); + mutex_unlock(&iprune_mutex); +} +EXPORT_SYMBOL(kick_inodes); + +/* * Initialize the waitqueues and inode hash table. */ void __init inode_init_early(void) @@ -1445,6 +1567,7 @@ void __init inode_init(void) SLAB_MEM_SPREAD), init_once); register_shrinker(&icache_shrinker); + kmem_cache_setup_defrag(inode_cachep, get_inodes, kick_inodes); /* Hash may have been set up in inode_init_early */ if (!hashdist) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2543fd00c658..bcb674275348 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -106,6 +106,12 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } +static void *proc_get_inodes(struct kmem_cache *s, int nr, void **v) +{ + return fs_get_inodes(s, nr, v, + offsetof(struct proc_inode, vfs_inode)); +} + void __init proc_init_inodecache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", @@ -113,6 +119,8 @@ void __init proc_init_inodecache(void) 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_PANIC), init_once); + kmem_cache_setup_defrag(proc_inode_cachep, + proc_get_inodes, kick_inodes); } static const struct super_operations proc_sops = { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 663a91f5dce8..0d4c1a2f0f74 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -532,6 +532,12 @@ static void init_once(void *foo) #endif } +static void *reiserfs_get_inodes(struct kmem_cache *s, int nr, void **v) +{ + return fs_get_inodes(s, nr, v, + offsetof(struct reiserfs_inode_info, vfs_inode)); +} + static int init_inodecache(void) { reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", @@ -542,6 +548,8 @@ static int init_inodecache(void) init_once); if (reiserfs_inode_cachep == NULL) return -ENOMEM; + kmem_cache_setup_defrag(reiserfs_inode_cachep, + reiserfs_get_inodes, kick_inodes); return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 64f5ef8d0a55..a5e0b52463cb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1909,6 +1909,12 @@ static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } +/* Helper functions for inode defragmentation support in filesystems */ +extern void kick_inodes(struct kmem_cache *, int, void **, void *); +extern void *get_inodes(struct kmem_cache *, int nr, void **); +extern void *fs_get_inodes(struct kmem_cache *, int nr, void **, + unsigned long offset); + extern struct file * get_empty_filp(void); extern void file_move(struct file *f, struct list_head *list); extern void file_kill(struct file *f); diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h new file mode 100644 index 000000000000..5bea8ead6a6b --- /dev/null +++ b/include/linux/kmemtrace.h @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include <linux/types.h> +#include <linux/marker.h> + +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + +#ifdef CONFIG_KMEMTRACE + +extern void kmemtrace_init(void); + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", + type_id, call_site, (unsigned long) ptr, + (unsigned long) bytes_req, (unsigned long) bytes_alloc, + (unsigned long) gfp_flags, node); +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + trace_mark(kmemtrace_free, "type_id %d call_site %lu ptr %lu", + type_id, call_site, (unsigned long) ptr); +} + +#else /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_init(void) +{ +} + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ +} + +#endif /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_mark_alloc_node(type_id, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/include/linux/mm.h b/include/linux/mm.h index ffee2f743418..76b407cf8493 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1263,8 +1263,7 @@ int in_gate_area_no_task(unsigned long addr); int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages); - + unsigned long lru_pages, struct zone *z); #ifndef CONFIG_MMU #define randomize_va_space 0 #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 35a7b5e19465..d9e765269136 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -327,6 +327,7 @@ struct zone { unsigned long recent_scanned[2]; unsigned long pages_scanned; /* since last reclaim */ + unsigned long slab_defrag_counter; /* since last defrag */ unsigned long flags; /* zone flags, see below */ /* Zone statistics */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b12f93a3c345..487cd3b6bcfe 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -117,6 +117,7 @@ enum pageflags { /* SLUB */ PG_slub_frozen = PG_active, PG_slub_debug = PG_error, + PG_slub_kickable = PG_dirty, }; #ifndef __GENERATING_BOUNDS_H @@ -201,6 +202,7 @@ __PAGEFLAG(SlobFree, slob_free) __PAGEFLAG(SlubFrozen, slub_frozen) __PAGEFLAG(SlubDebug, slub_debug) +__PAGEFLAG(SlubKickable, slub_kickable) /* * Only test-and-set exist for PG_writeback. The unconditional operators are diff --git a/include/linux/slab.h b/include/linux/slab.h index ba965c84ae06..5f5a9e7f4539 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -102,6 +102,59 @@ void kfree(const void *); size_t ksize(const void *); /* + * Function prototypes passed to kmem_cache_defrag() to enable defragmentation + * and targeted reclaim in slab caches. + */ + +/* + * kmem_cache_defrag_get_func() is called with locks held so that the slab + * objects cannot be freed. We are in an atomic context and no slab + * operations may be performed. The purpose of kmem_cache_defrag_get_func() + * is to obtain a stable refcount on the objects, so that they cannot be + * removed until kmem_cache_kick_func() has handled them. + * + * Parameters passed are the number of objects to process and an array of + * pointers to objects for which we need references. + * + * Returns a pointer that is passed to the kick function. If any objects + * cannot be moved then the pointer may indicate a failure and + * then kick can simply remove the references that were already obtained. + * + * The object pointer array passed is also passed to kmem_cache_defrag_kick(). + * The function may remove objects from the array by setting pointers to + * NULL. This is useful if we can determine that an object is already about + * to be removed. In that case it is often impossible to obtain the necessary + * refcount. + */ +typedef void *kmem_defrag_get_func(struct kmem_cache *, int, void **); + +/* + * kmem_cache_defrag_kick_func is called with no locks held and interrupts + * enabled. Sleeping is possible. Any operation may be performed in kick(). + * kmem_cache_defrag should free all the objects in the pointer array. + * + * Parameters passed are the number of objects in the array, the array of + * pointers to the objects and the pointer returned by kmem_cache_defrag_get(). + * + * Success is checked by examining the number of remaining objects in the slab. + */ +typedef void kmem_defrag_kick_func(struct kmem_cache *, int, void **, void *); + +/* + * kmem_cache_setup_defrag() is used to setup callbacks for a slab cache. + * kmem_cache_defrag() performs the actual defragmentation. + */ +#ifdef CONFIG_SLUB +void kmem_cache_setup_defrag(struct kmem_cache *, kmem_defrag_get_func, + kmem_defrag_kick_func); +int kmem_cache_defrag(int node); +#else +static inline void kmem_cache_setup_defrag(struct kmem_cache *s, + kmem_defrag_get_func get, kmem_defrag_kick_func kiok) {} +static inline int kmem_cache_defrag(int node) { return 0; } +#endif + +/* * Allocator specific definitions. These are mainly used to establish optimized * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by * selecting the appropriate general cache at compile time. @@ -225,9 +278,9 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep, * request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_track_caller(size_t, gfp_t, void*); +extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, __builtin_return_address(0)) + __kmalloc_track_caller(size, flags, _RET_IP_) #else #define kmalloc_track_caller(size, flags) \ __kmalloc(size, flags) @@ -243,10 +296,10 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, void*); * allocation request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *); +extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ - __builtin_return_address(0)) + _RET_IP_) #else #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node(size, flags, node) diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 39c3a5eb8ebe..7555ce99f6d2 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,6 +14,7 @@ #include <asm/page.h> /* kmalloc_sizes.h needs PAGE_SIZE */ #include <asm/cache.h> /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include <linux/compiler.h> +#include <linux/kmemtrace.h> /* Size description struct for general caches. */ struct cache_sizes { @@ -28,8 +29,26 @@ extern struct cache_sizes malloc_sizes[]; void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); -static inline void *kmalloc(size_t size, gfp_t flags) +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags); +extern size_t slab_buffer_size(struct kmem_cache *cachep); +#else +static __always_inline void * +kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) { + return kmem_cache_alloc(cachep, flags); +} +static inline size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return 0; +} +#endif + +static __always_inline void *kmalloc(size_t size, gfp_t flags) +{ + struct kmem_cache *cachep; + void *ret; + if (__builtin_constant_p(size)) { int i = 0; @@ -50,10 +69,17 @@ static inline void *kmalloc(size_t size, gfp_t flags) found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep, - flags); + cachep = malloc_sizes[i].cs_dmacachep; + else #endif - return kmem_cache_alloc(malloc_sizes[i].cs_cachep, flags); + cachep = malloc_sizes[i].cs_cachep; + + ret = kmem_cache_alloc_notrace(cachep, flags); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, + size, slab_buffer_size(cachep), flags); + + return ret; } return __kmalloc(size, flags); } @@ -62,8 +88,25 @@ found: extern void *__kmalloc_node(size_t size, gfp_t flags, int node); extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid); +#else +static __always_inline void * +kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return kmem_cache_alloc_node(cachep, flags, nodeid); +} +#endif + +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + struct kmem_cache *cachep; + void *ret; + if (__builtin_constant_p(size)) { int i = 0; @@ -84,11 +127,18 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - return kmem_cache_alloc_node(malloc_sizes[i].cs_dmacachep, - flags, node); + cachep = malloc_sizes[i].cs_dmacachep; + else #endif - return kmem_cache_alloc_node(malloc_sizes[i].cs_cachep, - flags, node); + cachep = malloc_sizes[i].cs_cachep; + + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, + ret, size, slab_buffer_size(cachep), + flags, node); + + return ret; } return __kmalloc_node(size, flags, node); } diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h index 59a3fa476ab9..0ec00b39d006 100644 --- a/include/linux/slob_def.h +++ b/include/linux/slob_def.h @@ -3,14 +3,15 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags) { return kmem_cache_alloc_node(cachep, flags, -1); } void *__kmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { return __kmalloc_node(size, flags, node); } @@ -23,12 +24,12 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) * kmalloc is the normal method of allocating memory * in the kernel. */ -static inline void *kmalloc(size_t size, gfp_t flags) +static __always_inline void *kmalloc(size_t size, gfp_t flags) { return __kmalloc_node(size, flags, -1); } -static inline void *__kmalloc(size_t size, gfp_t flags) +static __always_inline void *__kmalloc(size_t size, gfp_t flags) { return kmalloc(size, flags); } diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..5f8dfcf74b71 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,6 +10,7 @@ #include <linux/gfp.h> #include <linux/workqueue.h> #include <linux/kobject.h> +#include <linux/kmemtrace.h> enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -30,6 +31,12 @@ enum stat_item { DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ ORDER_FALLBACK, /* Number of times fallback was necessary */ + SHRINK_CALLS, /* Number of invocations of kmem_cache_shrink */ + SHRINK_ATTEMPT_DEFRAG, /* Slabs that were attempted to be reclaimed */ + SHRINK_EMPTY_SLAB, /* Shrink encountered and freed empty slab */ + SHRINK_SLAB_SKIPPED, /* Slab reclaim skipped an slab (busy etc) */ + SHRINK_SLAB_RECLAIMED, /* Successfully reclaimed slabs */ + SHRINK_OBJECT_RECLAIM_FAILED, /* Callbacks signaled busy objects */ NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { @@ -87,8 +94,18 @@ struct kmem_cache { gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(void *); + kmem_defrag_get_func *get; + kmem_defrag_kick_func *kick; + int inuse; /* Offset to metadata */ int align; /* Alignment */ + int defrag_ratio; /* + * Ratio used to check the percentage of + * objects allocate in a slab page. + * If less than this ratio is allocated + * then reclaim attempts are made. + */ + const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SLUB_DEBUG @@ -204,13 +221,31 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size) void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags); +#else +static __always_inline void * +kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return kmem_cache_alloc(s, gfpflags); +} +#endif + static __always_inline void *kmalloc_large(size_t size, gfp_t flags) { - return (void *)__get_free_pages(flags | __GFP_COMP, get_order(size)); + unsigned int order = get_order(size); + void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, + size, PAGE_SIZE << order, flags); + + return ret; } static __always_inline void *kmalloc(size_t size, gfp_t flags) { + void *ret; + if (__builtin_constant_p(size)) { if (size > PAGE_SIZE) return kmalloc_large(size, flags); @@ -221,7 +256,13 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) if (!s) return ZERO_SIZE_PTR; - return kmem_cache_alloc(s, flags); + ret = kmem_cache_alloc_notrace(s, flags); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + _THIS_IP_, ret, + size, s->size, flags); + + return ret; } } return __kmalloc(size, flags); @@ -231,8 +272,24 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) void *__kmalloc_node(size_t size, gfp_t flags, int node); void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node); +#else +static __always_inline void * +kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return kmem_cache_alloc_node(s, gfpflags, node); +} +#endif + static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + void *ret; + if (__builtin_constant_p(size) && size <= PAGE_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); @@ -240,7 +297,13 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) if (!s) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node(s, flags, node); + ret = kmem_cache_alloc_node_notrace(s, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _THIS_IP_, ret, + size, s->size, flags, node); + + return ret; } return __kmalloc_node(size, flags, node); } diff --git a/include/linux/swap.h b/include/linux/swap.h index a3af95b2cb6d..e9584191067f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -217,6 +217,9 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; +extern int slab_defrag_limit; +extern int slab_defrag_counter; + extern int remove_mapping(struct address_space *mapping, struct page *page); extern long vm_total_pages; diff --git a/init/Kconfig b/init/Kconfig index de6287439dbd..9255bfdf8675 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -771,8 +771,7 @@ config SLAB help The regular slab allocator that is established and known to work well in all environments. It organizes cache hot objects in - per cpu and per node queues. SLAB is the default choice for - a slab allocator. + per cpu and per node queues. config SLUB bool "SLUB (Unqueued Allocator)" @@ -781,7 +780,8 @@ config SLUB instead of managing queues of cached objects (SLAB approach). Per cpu caching is realized using slabs of objects instead of queues of objects. SLUB can use memory efficiently - and has enhanced diagnostics. + and has enhanced diagnostics. SLUB is the default choice for + a slab allocator. config SLOB depends on EMBEDDED diff --git a/init/main.c b/init/main.c index 8e48761a5523..b8e56411a1c8 100644 --- a/init/main.c +++ b/init/main.c @@ -70,6 +70,7 @@ #include <asm/setup.h> #include <asm/sections.h> #include <asm/cacheflush.h> +#include <linux/kmemtrace.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> @@ -660,6 +661,7 @@ asmlinkage void __init start_kernel(void) enable_debug_pagealloc(); cpu_hotplug_init(); kmem_cache_init(); + kmemtrace_init(); debug_objects_mem_init(); idr_init_cache(); setup_per_cpu_pageset(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 65d4a9ba79e4..348026447a22 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1088,6 +1088,26 @@ static struct ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "slab_defrag_limit", + .data = &slab_defrag_limit, + .maxlen = sizeof(slab_defrag_limit), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &one_hundred, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "slab_defrag_count", + .data = &slab_defrag_counter, + .maxlen = sizeof(slab_defrag_counter), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT { .ctl_name = VM_LEGACY_VA_LAYOUT, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1e3fd3e3436a..7854bc8ac596 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -813,6 +813,26 @@ config FIREWIRE_OHCI_REMOTE_DMA If unsure, say N. +config KMEMTRACE + bool "Kernel memory tracer (kmemtrace)" + depends on RELAY && DEBUG_FS && MARKERS + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. + menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/Makefile b/mm/Makefile index c06b45a1ff5f..3782eb66d4b3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c new file mode 100644 index 000000000000..2a70a805027c --- /dev/null +++ b/mm/kmemtrace.c @@ -0,0 +1,333 @@ +/* + * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#include <linux/string.h> +#include <linux/debugfs.h> +#include <linux/relay.h> +#include <linux/module.h> +#include <linux/marker.h> +#include <linux/gfp.h> +#include <linux/kmemtrace.h> + +#define KMEMTRACE_SUBBUF_SIZE 524288 +#define KMEMTRACE_DEF_N_SUBBUFS 20 + +static struct rchan *kmemtrace_chan; +static u32 kmemtrace_buf_overruns; + +static unsigned int kmemtrace_n_subbufs; + +/* disabled by default */ +static unsigned int kmemtrace_enabled; + +/* + * The sequence number is used for reordering kmemtrace packets + * in userspace, since they are logged as per-CPU data. + * + * atomic_t should always be a 32-bit signed integer. Wraparound is not + * likely to occur, but userspace can deal with it by expecting a certain + * sequence number in the next packet that will be read. + */ +static atomic_t kmemtrace_seq_num; + +#define KMEMTRACE_ABI_VERSION 1 + +static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION; + +enum kmemtrace_event_id { + KMEMTRACE_EVENT_ALLOC = 0, + KMEMTRACE_EVENT_FREE, +}; + +struct kmemtrace_event { + u8 event_id; + u8 type_id; + u16 event_size; + s32 seq_num; + u64 call_site; + u64 ptr; +} __attribute__ ((__packed__)); + +struct kmemtrace_stats_alloc { + u64 bytes_req; + u64 bytes_alloc; + u32 gfp_flags; + s32 numa_node; +} __attribute__ ((__packed__)); + +static void kmemtrace_probe_alloc(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + struct kmemtrace_stats_alloc *stats; + void *buf; + + local_irq_save(flags); + + buf = relay_reserve(kmemtrace_chan, + sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc)); + if (!buf) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + + ev = buf; + ev->event_id = KMEMTRACE_EVENT_ALLOC; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + + stats = buf + sizeof(struct kmemtrace_event); + stats->bytes_req = va_arg(*args, unsigned long); + stats->bytes_alloc = va_arg(*args, unsigned long); + stats->gfp_flags = va_arg(*args, unsigned long); + stats->numa_node = va_arg(*args, int); + +failed: + local_irq_restore(flags); +} + +static void kmemtrace_probe_free(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + + local_irq_save(flags); + + ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event)); + if (!ev) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + ev->event_id = KMEMTRACE_EVENT_FREE; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + +failed: + local_irq_restore(flags); +} + +static struct dentry * +kmemtrace_create_buf_file(const char *filename, struct dentry *parent, + int mode, struct rchan_buf *buf, int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static int kmemtrace_remove_buf_file(struct dentry *dentry) +{ + debugfs_remove(dentry); + + return 0; +} + +static int kmemtrace_subbuf_start(struct rchan_buf *buf, + void *subbuf, + void *prev_subbuf, + size_t prev_padding) +{ + if (relay_buf_full(buf)) { + /* + * We know it's not SMP-safe, but neither + * debugfs_create_u32() is. + */ + kmemtrace_buf_overruns++; + return 0; + } + + return 1; +} + +static struct rchan_callbacks relay_callbacks = { + .create_buf_file = kmemtrace_create_buf_file, + .remove_buf_file = kmemtrace_remove_buf_file, + .subbuf_start = kmemtrace_subbuf_start, +}; + +static struct dentry *kmemtrace_dir; +static struct dentry *kmemtrace_overruns_dentry; +static struct dentry *kmemtrace_abi_version_dentry; + +static struct dentry *kmemtrace_enabled_dentry; + +static int kmemtrace_start_probes(void) +{ + int err; + + err = marker_probe_register("kmemtrace_alloc", "type_id %d " + "call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu " + "gfp_flags %lu node %d", + kmemtrace_probe_alloc, NULL); + if (err) + return err; + err = marker_probe_register("kmemtrace_free", "type_id %d " + "call_site %lu ptr %lu", + kmemtrace_probe_free, NULL); + + return err; +} + +static void kmemtrace_stop_probes(void) +{ + marker_probe_unregister("kmemtrace_alloc", + kmemtrace_probe_alloc, NULL); + marker_probe_unregister("kmemtrace_free", + kmemtrace_probe_free, NULL); +} + +static int kmemtrace_enabled_get(void *data, u64 *val) +{ + *val = *((int *) data); + + return 0; +} + +static int kmemtrace_enabled_set(void *data, u64 val) +{ + u64 old_val = kmemtrace_enabled; + + *((int *) data) = !!val; + + if (old_val == val) + return 0; + if (val) + kmemtrace_start_probes(); + else + kmemtrace_stop_probes(); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops, + kmemtrace_enabled_get, + kmemtrace_enabled_set, "%llu\n"); + +static void kmemtrace_cleanup(void) +{ + if (kmemtrace_enabled_dentry) + debugfs_remove(kmemtrace_enabled_dentry); + + kmemtrace_stop_probes(); + + if (kmemtrace_abi_version_dentry) + debugfs_remove(kmemtrace_abi_version_dentry); + if (kmemtrace_overruns_dentry) + debugfs_remove(kmemtrace_overruns_dentry); + + relay_close(kmemtrace_chan); + kmemtrace_chan = NULL; + + if (kmemtrace_dir) + debugfs_remove(kmemtrace_dir); +} + +static int __init kmemtrace_setup_late(void) +{ + if (!kmemtrace_chan) + goto failed; + + kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL); + if (!kmemtrace_dir) + goto cleanup; + + kmemtrace_abi_version_dentry = + debugfs_create_u32("abi_version", S_IRUSR, + kmemtrace_dir, &kmemtrace_abi_version); + kmemtrace_overruns_dentry = + debugfs_create_u32("total_overruns", S_IRUSR, + kmemtrace_dir, &kmemtrace_buf_overruns); + if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry) + goto cleanup; + + kmemtrace_enabled_dentry = + debugfs_create_file("enabled", S_IRUSR | S_IWUSR, + kmemtrace_dir, &kmemtrace_enabled, + &kmemtrace_enabled_fops); + if (!kmemtrace_enabled_dentry) + goto cleanup; + + if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir)) + goto cleanup; + + printk(KERN_INFO "kmemtrace: fully up.\n"); + + return 0; + +cleanup: + kmemtrace_cleanup(); +failed: + return 1; +} +late_initcall(kmemtrace_setup_late); + +static int __init kmemtrace_set_boot_enabled(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "yes")) + kmemtrace_enabled = 1; + else if (!strcmp(str, "no")) + kmemtrace_enabled = 0; + else + return -EINVAL; + + return 0; +} +early_param("kmemtrace.enable", kmemtrace_set_boot_enabled); + +static int __init kmemtrace_set_subbufs(char *str) +{ + get_option(&str, &kmemtrace_n_subbufs); + return 0; +} +early_param("kmemtrace.subbufs", kmemtrace_set_subbufs); + +void kmemtrace_init(void) +{ + if (!kmemtrace_n_subbufs) + kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS; + + kmemtrace_chan = relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE, + kmemtrace_n_subbufs, &relay_callbacks, + NULL); + if (!kmemtrace_chan) { + printk(KERN_ERR "kmemtrace: could not open relay channel.\n"); + return; + } + + if (!kmemtrace_enabled) { + printk(KERN_INFO "kmemtrace: disabled. Pass " + "kemtrace.enable=yes as kernel parameter for " + "boot-time tracing.\n"); + return; + } + if (kmemtrace_start_probes()) { + printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); + kmemtrace_cleanup(); + return; + } + + printk(KERN_INFO "kmemtrace: enabled.\n"); +} + diff --git a/mm/slab.c b/mm/slab.c index 09187517f9dc..1fcb32b1dad5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -112,6 +112,7 @@ #include <linux/rtmutex.h> #include <linux/reciprocal_div.h> #include <linux/debugobjects.h> +#include <linux/kmemtrace.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -568,6 +569,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#ifdef CONFIG_KMEMTRACE +size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return cachep->buffer_size; +} +EXPORT_SYMBOL(slab_buffer_size); +#endif + /* * Do not go above this order unless 0 objects fit into the slab. */ @@ -2997,7 +3006,7 @@ retry: * there must be at least one object available for * allocation. */ - BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num); + BUG_ON(slabp->inuse >= cachep->num); while (slabp->inuse < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); @@ -3613,10 +3622,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, flags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +{ + return __cache_alloc(cachep, flags, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + /** * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. * @cachep: the cache we're checking against @@ -3661,23 +3683,47 @@ out: #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + void *ret = __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, + flags, nodeid); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; + void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node(cachep, flags, node); + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags, node); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, @@ -3686,9 +3732,9 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(__kmalloc_node); void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, void *caller) + int node, unsigned long caller) { - return __do_kmalloc_node(size, flags, node, caller); + return __do_kmalloc_node(size, flags, node, (void *)caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); #else @@ -3710,6 +3756,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller) { struct kmem_cache *cachep; + void *ret; /* If you want to save a few bytes .text space: replace * __ with kmem_. @@ -3719,20 +3766,26 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = __find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return __cache_alloc(cachep, flags, caller); + ret = __cache_alloc(cachep, flags, caller); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, __builtin_return_address(0)); } EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) { - return __do_kmalloc(size, flags, caller); + return __do_kmalloc(size, flags, (void *)caller); } EXPORT_SYMBOL(__kmalloc_track_caller); @@ -3762,6 +3815,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_obj_freed(objp, obj_size(cachep)); __cache_free(cachep, objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp); } EXPORT_SYMBOL(kmem_cache_free); @@ -3788,6 +3843,8 @@ void kfree(const void *objp) debug_check_no_obj_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp); } EXPORT_SYMBOL(kfree); diff --git a/mm/slob.c b/mm/slob.c index cb675d126791..55de44ae5d30 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,6 +65,7 @@ #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/list.h> +#include <linux/kmemtrace.h> #include <asm/atomic.h> /* @@ -463,27 +464,38 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + void *ret; if (size < PAGE_SIZE - align) { if (!size) return ZERO_SIZE_PTR; m = slob_alloc(size + align, gfp, align, node); + if (!m) return NULL; *m = size; - return (void *)m + align; + ret = (void *)m + align; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, size + align, gfp, node); } else { - void *ret; + unsigned int order = get_order(size); - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_page(gfp | __GFP_COMP, order, node); if (ret) { struct page *page; page = virt_to_page(ret); page->private = size; } - return ret; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << order, gfp, node); } + + return ret; } EXPORT_SYMBOL(__kmalloc_node); @@ -501,6 +513,8 @@ void kfree(const void *block) slob_free(m, *m + align); } else put_page(&sp->page); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block); } EXPORT_SYMBOL(kfree); @@ -569,10 +583,19 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; - if (c->size < PAGE_SIZE) + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - else + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); + } else { b = slob_new_page(flags, get_order(c->size), node); + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + PAGE_SIZE << get_order(c->size), + flags, node); + } if (c->ctor) c->ctor(b); @@ -608,6 +631,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } else { __kmem_cache_free(b, c->size); } + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c index 7ad489af9561..3c5bba3b4dc7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -24,6 +24,7 @@ #include <linux/kallsyms.h> #include <linux/memory.h> #include <linux/math64.h> +#include <linux/kmemtrace.h> /* * Lock order: @@ -128,10 +129,10 @@ /* * Maximum number of desirable partial slabs. - * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * More slabs cause kmem_cache_shrink to sort the slabs by objects + * and triggers slab defragmentation. */ -#define MAX_PARTIAL 10 +#define MAX_PARTIAL 20 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) @@ -153,6 +154,10 @@ #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +#define OO_SHIFT 16 +#define OO_MASK ((1 << OO_SHIFT) - 1) +#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ + /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ @@ -174,11 +179,14 @@ static enum { static DECLARE_RWSEM(slub_lock); static LIST_HEAD(slab_caches); +/* Maximum objects in defragmentable slabs */ +static unsigned int max_defrag_slab_objects; + /* * Tracking user of a slab. */ struct track { - void *addr; /* Called from address */ + unsigned long addr; /* Called from address */ int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -290,7 +298,7 @@ static inline struct kmem_cache_order_objects oo_make(int order, unsigned long size) { struct kmem_cache_order_objects x = { - (order << 16) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + (PAGE_SIZE << order) / size }; return x; @@ -298,12 +306,12 @@ static inline struct kmem_cache_order_objects oo_make(int order, static inline int oo_order(struct kmem_cache_order_objects x) { - return x.x >> 16; + return x.x >> OO_SHIFT; } static inline int oo_objects(struct kmem_cache_order_objects x) { - return x.x & ((1 << 16) - 1); + return x.x & OO_MASK; } #ifdef CONFIG_SLUB_DEBUG @@ -367,7 +375,7 @@ static struct track *get_track(struct kmem_cache *s, void *object, } static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, void *addr) + enum track_item alloc, unsigned long addr) { struct track *p; @@ -391,8 +399,8 @@ static void init_tracking(struct kmem_cache *s, void *object) if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); + set_track(s, object, TRACK_FREE, 0UL); + set_track(s, object, TRACK_ALLOC, 0UL); } static void print_track(const char *s, struct track *t) @@ -401,7 +409,7 @@ static void print_track(const char *s, struct track *t) return; printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, t->addr, jiffies - t->when, t->cpu, t->pid); + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); } static void print_tracking(struct kmem_cache *s, void *object) @@ -764,8 +772,8 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) } max_objects = (PAGE_SIZE << compound_order(page)) / s->size; - if (max_objects > 65535) - max_objects = 65535; + if (max_objects > MAX_OBJS_PER_PAGE) + max_objects = MAX_OBJS_PER_PAGE; if (page->objects != max_objects) { slab_err(s, page, "Wrong number of objects. Found %d but " @@ -866,7 +874,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, } static int alloc_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto bad; @@ -906,7 +914,7 @@ bad: } static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@ -1029,10 +1037,10 @@ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -1128,6 +1136,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) SLAB_STORE_USER | SLAB_TRACE)) __SetPageSlubDebug(page); + if (s->kick) + __SetPageSlubKickable(page); + start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) @@ -1168,6 +1179,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); + __ClearPageSlubKickable(page); __ClearPageSlab(page); reset_page_mapcount(page); __free_pages(page, order); @@ -1378,6 +1390,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) if (SLABDEBUG && PageSlubDebug(page) && (s->flags & SLAB_STORE_USER)) add_full(n, page); + if (s->kick) + __SetPageSlubKickable(page); } slab_unlock(page); } else { @@ -1499,8 +1513,8 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. */ -static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) { void **object; struct page *new; @@ -1584,7 +1598,7 @@ debug: * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) + gfp_t gfpflags, int node, unsigned long addr) { void **object; struct kmem_cache_cpu *c; @@ -1613,18 +1627,46 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); + void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags, node); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return slab_alloc(s, gfpflags, node, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + /* * Slow patch handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -1634,7 +1676,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr, unsigned int offset) + void *x, unsigned long addr, unsigned int offset) { void *prior; void **object = (void *)x; @@ -1704,7 +1746,7 @@ debug: * with all sorts of special processing. */ static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, void *addr) + struct page *page, void *x, unsigned long addr) { void **object = (void *)x; struct kmem_cache_cpu *c; @@ -1731,11 +1773,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); - slab_free(s, page, x, __builtin_return_address(0)); + slab_free(s, page, x, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab object the object resides */ +/* Figure out on which slab page the object resides */ static struct page *get_object_page(const void *x) { struct page *page = virt_to_head_page(x); @@ -1807,8 +1851,8 @@ static inline int slab_order(int size, int min_objects, int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > 65535) - return get_order(size * 65535) - 1; + if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); @@ -2313,6 +2357,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, goto error; s->refcount = 1; + s->defrag_ratio = 30; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -2519,7 +2564,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, flags, NULL)) goto panic; - list_add(&s->list, &slab_caches); + list_add_tail(&s->list, &slab_caches); up_write(&slub_lock); if (sysfs_slab_add(s)) goto panic; @@ -2582,7 +2627,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) goto unlock_out; } - list_add(&s->list, &slab_caches); + list_add_tail(&s->list, &slab_caches); kmalloc_caches_dma[index] = s; schedule_work(&sysfs_add_work); @@ -2650,6 +2695,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, flags); @@ -2659,7 +2705,12 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, __builtin_return_address(0)); + ret = slab_alloc(s, flags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags); + + return ret; } EXPORT_SYMBOL(__kmalloc); @@ -2678,16 +2729,30 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, flags, node); + if (unlikely(size > PAGE_SIZE)) { + ret = kmalloc_large_node(size, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); + + return ret; + } s = get_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, __builtin_return_address(0)); + ret = slab_alloc(s, flags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags, node); + + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2744,81 +2809,291 @@ void kfree(const void *x) put_page(page); return; } - slab_free(page->slab, page, object, __builtin_return_address(0)); + slab_free(page->slab, page, object, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x); } EXPORT_SYMBOL(kfree); /* - * kmem_cache_shrink removes empty slabs from the partial lists and sorts - * the remaining slabs by the number of items in use. The slabs with the - * most items in use come first. New allocations will then fill those up - * and thus they can be removed from the partial lists. + * Allocate a slab scratch space that is sufficient to keep at least + * max_defrag_slab_objects pointers to individual objects and also a bitmap + * for max_defrag_slab_objects. + */ +static inline void *alloc_scratch(void) +{ + return kmalloc(max_defrag_slab_objects * sizeof(void *) + + BITS_TO_LONGS(max_defrag_slab_objects) * sizeof(unsigned long), + GFP_KERNEL); +} + +void kmem_cache_setup_defrag(struct kmem_cache *s, + kmem_defrag_get_func get, kmem_defrag_kick_func kick) +{ + int max_objects = oo_objects(s->max); + + /* + * Defragmentable slabs must have a ctor otherwise objects may be + * in an undetermined state after they are allocated. + */ + BUG_ON(!s->ctor); + s->get = get; + s->kick = kick; + down_write(&slub_lock); + list_move(&s->list, &slab_caches); + if (max_objects > max_defrag_slab_objects) + max_defrag_slab_objects = max_objects; + up_write(&slub_lock); +} +EXPORT_SYMBOL(kmem_cache_setup_defrag); + +/* + * Vacate all objects in the given slab. * - * The slabs with the least items are placed last. This results in them - * being allocated from last increasing the chance that the last objects - * are freed in them. + * The scratch aread passed to list function is sufficient to hold + * struct listhead times objects per slab. We use it to hold void ** times + * objects per slab plus a bitmap for each object. */ -int kmem_cache_shrink(struct kmem_cache *s) +static int kmem_cache_vacate(struct page *page, void *scratch) { - int node; - int i; - struct kmem_cache_node *n; - struct page *page; - struct page *t; - int objects = oo_objects(s->max); - struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); + void **vector = scratch; + void *p; + void *addr = page_address(page); + struct kmem_cache *s; + unsigned long *map; + int leftover; + int count; + void *private; unsigned long flags; + unsigned long objects; + struct kmem_cache_cpu *c; - if (!slabs_by_inuse) - return -ENOMEM; + local_irq_save(flags); + slab_lock(page); - flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); + BUG_ON(!PageSlab(page)); /* Must be s slab page */ + BUG_ON(!PageSlubFrozen(page)); /* Slab must have been frozen earlier */ - if (!n->nr_partial) - continue; + s = page->slab; + objects = page->objects; + map = scratch + objects * sizeof(void **); + if (!page->inuse || !s->kick || !PageSlubKickable(page)) + goto out; - for (i = 0; i < objects; i++) - INIT_LIST_HEAD(slabs_by_inuse + i); + /* Determine used objects */ + bitmap_fill(map, objects); + for_each_free_object(p, s, page->freelist) + __clear_bit(slab_index(p, s, addr), map); - spin_lock_irqsave(&n->list_lock, flags); + /* Build vector of pointers to objects */ + count = 0; + memset(vector, 0, objects * sizeof(void **)); + for_each_object(p, s, addr, objects) + if (test_bit(slab_index(p, s, addr), map)) + vector[count++] = p; - /* - * Build lists indexed by the items in use in each slab. - * - * Note that concurrent frees may occur while we hold the - * list_lock. page->inuse here is the upper limit. - */ - list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse && slab_trylock(page)) { + private = s->get(s, count, vector); + + /* + * Got references. Now we can drop the slab lock. The slab + * is frozen so it cannot vanish from under us nor will + * allocations be performed on the slab. However, unlocking the + * slab will allow concurrent slab_frees to proceed. + */ + slab_unlock(page); + local_irq_restore(flags); + + /* + * Perform the KICK callbacks to remove the objects. + */ + s->kick(s, count, vector, private); + + local_irq_save(flags); + slab_lock(page); +out: + /* + * Check the result and unfreeze the slab + */ + leftover = page->inuse; + c = get_cpu_slab(s, smp_processor_id()); + if (leftover) { + /* Unsuccessful reclaim. Avoid future reclaim attempts. */ + stat(c, SHRINK_OBJECT_RECLAIM_FAILED); + __ClearPageSlubKickable(page); + } else + stat(c, SHRINK_SLAB_RECLAIMED); + unfreeze_slab(s, page, leftover > 0); + local_irq_restore(flags); + return leftover; +} + +/* + * Remove objects from a list of slab pages that have been gathered. + * Must be called with slabs that have been isolated before. + * + * kmem_cache_reclaim() is never called from an atomic context. It + * allocates memory for temporary storage. We are holding the + * slub_lock semaphore which prevents another call into + * the defrag logic. + */ +int kmem_cache_reclaim(struct list_head *zaplist) +{ + int freed = 0; + void **scratch; + struct page *page; + struct page *page2; + + if (list_empty(zaplist)) + return 0; + + scratch = alloc_scratch(); + if (!scratch) + return 0; + + list_for_each_entry_safe(page, page2, zaplist, lru) { + list_del(&page->lru); + if (kmem_cache_vacate(page, scratch) == 0) + freed++; + } + kfree(scratch); + return freed; +} + +/* + * Shrink the slab cache on a particular node of the cache + * by releasing slabs with zero objects and trying to reclaim + * slabs with less than the configured percentage of objects allocated. + */ +static unsigned long __kmem_cache_shrink(struct kmem_cache *s, int node, + unsigned long limit) +{ + unsigned long flags; + struct page *page, *page2; + LIST_HEAD(zaplist); + int freed = 0; + struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_cpu *c; + + if (n->nr_partial <= limit) + return 0; + + spin_lock_irqsave(&n->list_lock, flags); + c = get_cpu_slab(s, smp_processor_id()); + stat(c, SHRINK_CALLS); + list_for_each_entry_safe(page, page2, &n->partial, lru) { + if (!slab_trylock(page)) + /* Busy slab. Get out of the way */ + continue; + + if (page->inuse) { + if (!PageSlubKickable(page) || page->inuse * 100 >= + s->defrag_ratio * page->objects) { + slab_unlock(page); /* - * Must hold slab lock here because slab_free - * may have freed the last object and be - * waiting to release the slab. + * Slab contains enough objects + * or we alrady tried reclaim before and + * it failed. Skip this one. */ - list_del(&page->lru); + continue; + } + + list_move(&page->lru, &zaplist); + if (s->kick) { + stat(c, SHRINK_ATTEMPT_DEFRAG); n->nr_partial--; - slab_unlock(page); - discard_slab(s, page); - } else { - list_move(&page->lru, - slabs_by_inuse + page->inuse); + __SetPageSlubFrozen(page); } + slab_unlock(page); + } else { + /* Empty slab page */ + stat(c, SHRINK_EMPTY_SLAB); + list_del(&page->lru); + n->nr_partial--; + slab_unlock(page); + discard_slab(s, page); + freed++; } + } + if (!s->kick) /* - * Rebuild the partial list with the slabs filled up most - * first and the least used slabs at the end. + * No defrag methods. By simply putting the zaplist at the + * end of the partial list we can let them simmer longer + * and thus increase the chance of all objects being + * reclaimed. + * + * We have effectively sorted the partial list and put + * the slabs with more objects first. As soon as they + * are allocated they are going to be removed from the + * partial list. */ - for (i = objects - 1; i >= 0; i--) - list_splice(slabs_by_inuse + i, n->partial.prev); + list_splice(&zaplist, n->partial.prev); - spin_unlock_irqrestore(&n->list_lock, flags); + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (s->kick) + freed += kmem_cache_reclaim(&zaplist); + + return freed; +} + +/* + * Defrag slabs conditional on the amount of fragmentation in a page. + */ +int kmem_cache_defrag(int node) +{ + struct kmem_cache *s; + unsigned long slabs = 0; + + /* + * kmem_cache_defrag may be called from the reclaim path which may be + * called for any page allocator alloc. So there is the danger that we + * get called in a situation where slub already acquired the slub_lock + * for other purposes. + */ + if (!down_read_trylock(&slub_lock)) + return 0; + + list_for_each_entry(s, &slab_caches, list) { + unsigned long reclaimed = 0; + + /* + * Defragmentable caches come first. If the slab cache is not + * defragmentable then we can stop traversing the list. + */ + if (!s->kick) + break; + + if (node == -1) { + int nid; + + for_each_node_state(nid, N_NORMAL_MEMORY) + reclaimed += __kmem_cache_shrink(s, nid, + MAX_PARTIAL); + } else + reclaimed = __kmem_cache_shrink(s, node, MAX_PARTIAL); + + slabs += reclaimed; } + up_read(&slub_lock); + return slabs; +} +EXPORT_SYMBOL(kmem_cache_defrag); + +/* + * kmem_cache_shrink removes empty slabs from the partial lists. + * If the slab cache supports defragmentation then objects are + * reclaimed. + */ +int kmem_cache_shrink(struct kmem_cache *s) +{ + int node; + + flush_all(s); + for_each_node_state(node, N_NORMAL_MEMORY) + __kmem_cache_shrink(s, node, 0); - kfree(slabs_by_inuse); return 0; } EXPORT_SYMBOL(kmem_cache_shrink); @@ -3041,7 +3316,7 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; - if (s->ctor) + if (s->ctor || s->kick || s->get) return 1; /* @@ -3130,7 +3405,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, if (s) { if (kmem_cache_open(s, GFP_KERNEL, name, size, align, flags, ctor)) { - list_add(&s->list, &slab_caches); + list_add_tail(&s->list, &slab_caches); up_write(&slub_lock); if (sysfs_slab_add(s)) goto err; @@ -3200,9 +3475,10 @@ static struct notifier_block __cpuinitdata slab_notifier = { #endif -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, gfpflags); @@ -3212,13 +3488,20 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, -1, caller); + + /* Honor the call site pointer we recieved. */ + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, caller, ret, size, + s->size, gfpflags); + + return ret; } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, void *caller) + int node, unsigned long caller) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large_node(size, gfpflags, node); @@ -3228,7 +3511,13 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc(s, gfpflags, node, caller); + + /* Honor the call site pointer we recieved. */ + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, caller, ret, + size, s->size, gfpflags, node); + + return ret; } #ifdef CONFIG_SLUB_DEBUG @@ -3427,7 +3716,7 @@ static void resiliency_test(void) {}; struct location { unsigned long count; - void *addr; + unsigned long addr; long long sum_time; long min_time; long max_time; @@ -3475,7 +3764,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, { long start, end, pos; struct location *l; - void *caddr; + unsigned long caddr; unsigned long age = jiffies - track->when; start = -1; @@ -3815,16 +4104,32 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR(order); -static ssize_t ctor_show(struct kmem_cache *s, char *buf) +static ssize_t ops_show(struct kmem_cache *s, char *buf) { + int x = 0; + if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); + x += sprintf(buf + x, "ctor : "); + x += sprint_symbol(buf + x, (unsigned long)s->ctor); + x += sprintf(buf + x, "\n"); + } - return n + sprintf(buf + n, "\n"); + if (s->get) { + x += sprintf(buf + x, "get : "); + x += sprint_symbol(buf + x, + (unsigned long)s->get); + x += sprintf(buf + x, "\n"); } - return 0; + + if (s->kick) { + x += sprintf(buf + x, "kick : "); + x += sprint_symbol(buf + x, + (unsigned long)s->kick); + x += sprintf(buf + x, "\n"); + } + return x; } -SLAB_ATTR_RO(ctor); +SLAB_ATTR_RO(ops); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { @@ -4044,6 +4349,27 @@ static ssize_t free_calls_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(free_calls); +static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->defrag_ratio); +} + +static ssize_t defrag_ratio_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + unsigned long ratio; + int err; + + err = strict_strtoul(buf, 10, &ratio); + if (err) + return err; + + if (ratio < 100) + s->defrag_ratio = ratio; + return length; +} +SLAB_ATTR(defrag_ratio); + #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { @@ -4123,6 +4449,12 @@ STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(SHRINK_CALLS, shrink_calls); +STAT_ATTR(SHRINK_ATTEMPT_DEFRAG, shrink_attempt_defrag); +STAT_ATTR(SHRINK_EMPTY_SLAB, shrink_empty_slab); +STAT_ATTR(SHRINK_SLAB_SKIPPED, shrink_slab_skipped); +STAT_ATTR(SHRINK_SLAB_RECLAIMED, shrink_slab_reclaimed); +STAT_ATTR(SHRINK_OBJECT_RECLAIM_FAILED, shrink_object_reclaim_failed); #endif static struct attribute *slab_attrs[] = { @@ -4136,7 +4468,7 @@ static struct attribute *slab_attrs[] = { &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, - &ctor_attr.attr, + &ops_attr.attr, &aliases_attr.attr, &align_attr.attr, &sanity_checks_attr.attr, @@ -4151,6 +4483,7 @@ static struct attribute *slab_attrs[] = { &shrink_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, + &defrag_ratio_attr.attr, #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, #endif @@ -4176,6 +4509,12 @@ static struct attribute *slab_attrs[] = { &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, &order_fallback_attr.attr, + &shrink_calls_attr.attr, + &shrink_attempt_defrag_attr.attr, + &shrink_empty_slab_attr.attr, + &shrink_slab_skipped_attr.attr, + &shrink_slab_reclaimed_attr.attr, + &shrink_object_reclaim_failed_attr.attr, #endif NULL }; diff --git a/mm/vmscan.c b/mm/vmscan.c index 3b5860294bb6..c8e26a810f8a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -151,6 +151,14 @@ void unregister_shrinker(struct shrinker *shrinker) EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 + +/* + * Trigger a call into slab defrag if the sum of the returns from + * shrinkers cross this value. + */ +int slab_defrag_limit = 1000; +int slab_defrag_counter; + /* * Call the shrink functions to age shrinkable caches * @@ -168,10 +176,18 @@ EXPORT_SYMBOL(unregister_shrinker); * are eligible for the caller's allocation attempt. It is used for balancing * slab reclaim versus page reclaim. * + * zone is the zone for which we are shrinking the slabs. If the intent + * is to do a global shrink then zone may be NULL. Specification of a + * zone is currently only used to limit slab defragmentation to a NUMA node. + * The performace of shrink_slab would be better (in particular under NUMA) + * if it could be targeted as a whole to the zone that is under memory + * pressure but the VFS infrastructure does not allow that at the present + * time. + * * Returns the number of slab objects which we shrunk. */ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages) + unsigned long lru_pages, struct zone *zone) { struct shrinker *shrinker; unsigned long ret = 0; @@ -228,6 +244,39 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, shrinker->nr += total_scan; } up_read(&shrinker_rwsem); + + + /* Avoid dirtying cachelines */ + if (!ret) + return 0; + + /* + * "ret" doesnt really contain the freed object count. The shrinkers + * fake it. Gotta go with what we are getting though. + * + * Handling of the defrag_counter is also racy. If we get the + * wrong counts then we may unnecessarily do a defrag pass or defer + * one. "ret" is already faked. So this is just increasing + * the already existing fuzziness to get some notion as to when + * to initiate slab defrag which will hopefully be okay. + */ + if (zone) { + /* balance_pgdat running on a zone so we only scan one node */ + zone->slab_defrag_counter += ret; + if (zone->slab_defrag_counter > slab_defrag_limit && + (gfp_mask & __GFP_FS)) { + zone->slab_defrag_counter = 0; + kmem_cache_defrag(zone_to_nid(zone)); + } + } else { + /* Direct (and thus global) reclaim. Scan all nodes */ + slab_defrag_counter += ret; + if (slab_defrag_counter > slab_defrag_limit && + (gfp_mask & __GFP_FS)) { + slab_defrag_counter = 0; + kmem_cache_defrag(-1); + } + } return ret; } @@ -1583,7 +1632,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * over limit cgroups */ if (scan_global_lru(sc)) { - shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages, NULL); if (reclaim_state) { nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -1817,7 +1866,7 @@ loop_again: nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, - lru_pages); + lru_pages, zone); nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone_is_all_unreclaimable(zone)) @@ -2057,7 +2106,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); + shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL); if (!reclaim_state.reclaimed_slab) break; @@ -2095,7 +2144,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_lru_pages()); + global_lru_pages(), NULL); ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; @@ -2112,7 +2161,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!ret) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages(), NULL); ret += reclaim_state.reclaimed_slab; } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); } @@ -2274,7 +2323,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Note that shrink_slab will free memory on all zones and may * take a long time. */ - while (shrink_slab(sc.nr_scanned, gfp_mask, order) && + while (shrink_slab(sc.nr_scanned, gfp_mask, order, + zone) && zone_page_state(zone, NR_SLAB_RECLAIMABLE) > slab_reclaimable - nr_pages) ; diff --git a/mm/vmstat.c b/mm/vmstat.c index c3ccfda23adc..c4b7280e6e70 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -774,11 +774,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, #endif } seq_printf(m, + "\n slab_defrag_count: %lu" "\n all_unreclaimable: %u" "\n prev_priority: %i" "\n start_pfn: %lu" "\n inactive_ratio: %u", - zone_is_all_unreclaimable(zone), + zone->slab_defrag_counter, + zone_is_all_unreclaimable(zone), zone->prev_priority, zone->zone_start_pfn, zone->inactive_ratio); |