summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile5
-rw-r--r--mm/allocpercpu.c28
-rw-r--r--mm/backing-dev.c533
-rw-r--r--mm/hugetlb.c1
-rw-r--r--mm/kmemleak-test.c6
-rw-r--r--mm/page-writeback.c162
-rw-r--r--mm/pdflush.c269
-rw-r--r--mm/percpu.c1316
-rw-r--r--mm/quicklist.c5
-rw-r--r--mm/slqb.c3765
-rw-r--r--mm/slub.c45
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/vmscan.c2
13 files changed, 5421 insertions, 717 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..b2b96c27d520 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
vmalloc.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
- maccess.o page_alloc.o page-writeback.o pdflush.o \
+ maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o $(mmu-y)
@@ -28,12 +28,13 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
+obj-$(CONFIG_SLQB) += slqb.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
-ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
obj-$(CONFIG_SMP) += percpu.o
else
obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index dfdee6a47359..df34ceae0c67 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -5,6 +5,8 @@
*/
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/bootmem.h>
+#include <asm/sections.h>
#ifndef cache_line_size
#define cache_line_size() L1_CACHE_BYTES
@@ -147,3 +149,29 @@ void free_percpu(void *__pdata)
kfree(__percpu_disguise(__pdata));
}
EXPORT_SYMBOL_GPL(free_percpu);
+
+/*
+ * Generic percpu area setup.
+ */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+
+EXPORT_SYMBOL(__per_cpu_offset);
+
+void __init setup_per_cpu_areas(void)
+{
+ unsigned long size, i;
+ char *ptr;
+ unsigned long nr_possible_cpus = num_possible_cpus();
+
+ /* Copy section for each CPU (we discard the original) */
+ size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+ ptr = alloc_bootmem_pages(size * nr_possible_cpus);
+
+ for_each_possible_cpu(i) {
+ __per_cpu_offset[i] = ptr - __per_cpu_start;
+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+ ptr += size;
+ }
+}
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c86edd244294..036b07ba393b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
#include <linux/wait.h>
#include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
EXPORT_SYMBOL(default_unplug_io_fn);
struct backing_dev_info default_backing_dev_info = {
+ .name = "default",
.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
.state = 0,
.capabilities = BDI_CAP_MAP_COPY,
@@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = {
EXPORT_SYMBOL_GPL(default_backing_dev_info);
static struct class *bdi_class;
+DEFINE_SPINLOCK(bdi_lock);
+LIST_HEAD(bdi_list);
+LIST_HEAD(bdi_pending_list);
+
+static struct task_struct *sync_supers_tsk;
+static struct timer_list sync_supers_timer;
+
+static int bdi_sync_supers(void *);
+static void sync_supers_timer_fn(unsigned long);
+static void arm_supers_timer(void);
+
+static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@@ -37,9 +53,29 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
+ struct bdi_writeback *wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
+ unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
+ struct inode *inode;
+
+ /*
+ * inode lock is enough here, the bdi->wb_list is protected by
+ * RCU on the reader side
+ */
+ nr_wb = nr_dirty = nr_io = nr_more_io = 0;
+ spin_lock(&inode_lock);
+ list_for_each_entry(wb, &bdi->wb_list, list) {
+ nr_wb++;
+ list_for_each_entry(inode, &wb->b_dirty, i_list)
+ nr_dirty++;
+ list_for_each_entry(inode, &wb->b_io, i_list)
+ nr_io++;
+ list_for_each_entry(inode, &wb->b_more_io, i_list)
+ nr_more_io++;
+ }
+ spin_unlock(&inode_lock);
get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
@@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiReclaimable: %8lu kB\n"
"BdiDirtyThresh: %8lu kB\n"
"DirtyThresh: %8lu kB\n"
- "BackgroundThresh: %8lu kB\n",
+ "BackgroundThresh: %8lu kB\n"
+ "WriteBack threads:%8lu\n"
+ "b_dirty: %8lu\n"
+ "b_io: %8lu\n"
+ "b_more_io: %8lu\n"
+ "bdi_list: %8u\n"
+ "state: %8lx\n"
+ "wb_mask: %8lx\n"
+ "wb_list: %8u\n"
+ "wb_cnt: %8u\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
- K(bdi_thresh),
- K(dirty_thresh),
- K(background_thresh));
+ K(bdi_thresh), K(dirty_thresh),
+ K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
+ !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
+ !list_empty(&bdi->wb_list), bdi->wb_cnt);
#undef K
return 0;
@@ -185,6 +231,13 @@ static int __init default_bdi_init(void)
{
int err;
+ sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
+ BUG_ON(IS_ERR(sync_supers_tsk));
+
+ init_timer(&sync_supers_timer);
+ setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
+ arm_supers_timer();
+
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +246,390 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
+static int wb_assign_nr(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+{
+ unsigned long mask = BDI_MAX_FLUSHERS - 1;
+ unsigned int nr;
+
+ do {
+ if ((bdi->wb_mask & mask) == mask)
+ return 1;
+
+ nr = find_first_zero_bit(&bdi->wb_mask, BDI_MAX_FLUSHERS);
+ } while (test_and_set_bit(nr, &bdi->wb_mask));
+
+ wb->nr = nr;
+
+ spin_lock(&bdi->wb_lock);
+ bdi->wb_cnt++;
+ spin_unlock(&bdi->wb_lock);
+
+ return 0;
+}
+
+static void bdi_put_wb(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+{
+ /*
+ * If this is the default wb thread exiting, leave the bit set
+ * in the wb mask as we set that before it's created as well. This
+ * is done to make sure that assigned work with no thread has at
+ * least one receipient.
+ */
+ if (wb == &bdi->wb)
+ clear_bit(BDI_wb_alloc, &bdi->state);
+ else {
+ clear_bit(wb->nr, &bdi->wb_mask);
+ kfree(wb);
+ spin_lock(&bdi->wb_lock);
+ bdi->wb_cnt--;
+ spin_unlock(&bdi->wb_lock);
+ }
+}
+
+static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+ memset(wb, 0, sizeof(*wb));
+
+ wb->bdi = bdi;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+
+ return wb_assign_nr(bdi, wb);
+}
+
+static struct bdi_writeback *bdi_new_wb(struct backing_dev_info *bdi)
+{
+ struct bdi_writeback *wb;
+
+ /*
+ * Default bdi->wb is already assigned, so just return it
+ */
+ if (!test_and_set_bit(BDI_wb_alloc, &bdi->state))
+ wb = &bdi->wb;
+ else {
+ wb = kmalloc(sizeof(struct bdi_writeback), GFP_KERNEL);
+ if (wb) {
+ if (bdi_wb_init(wb, bdi)) {
+ kfree(wb);
+ wb = NULL;
+ }
+ }
+ }
+
+ return wb;
+}
+
+static void bdi_task_init(struct backing_dev_info *bdi,
+ struct bdi_writeback *wb)
+{
+ struct task_struct *tsk = current;
+ int was_empty;
+
+ /*
+ * Add us to the active bdi_list. If we are adding threads beyond
+ * the default embedded bdi_writeback, then we need to start using
+ * proper locking. Check the list for empty first, then set the
+ * BDI_wblist_lock flag if there's > 1 entry on the list now
+ */
+ spin_lock(&bdi->wb_lock);
+
+ was_empty = list_empty(&bdi->wb_list);
+ list_add_tail_rcu(&wb->list, &bdi->wb_list);
+ if (!was_empty)
+ set_bit(BDI_wblist_lock, &bdi->state);
+
+ spin_unlock(&bdi->wb_lock);
+
+ tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ set_freezable();
+
+ /*
+ * Our parent may run at a different priority, just set us to normal
+ */
+ set_user_nice(tsk, 0);
+}
+
+static int bdi_start_fn(void *ptr)
+{
+ struct bdi_writeback *wb = ptr;
+ struct backing_dev_info *bdi = wb->bdi;
+ int ret;
+
+ /*
+ * Add us to the active bdi_list
+ */
+ spin_lock(&bdi_lock);
+ list_add(&bdi->bdi_list, &bdi_list);
+ spin_unlock(&bdi_lock);
+
+ bdi_task_init(bdi, wb);
+
+ /*
+ * Clear pending bit and wakeup anybody waiting to tear us down
+ */
+ clear_bit(BDI_pending, &bdi->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&bdi->state, BDI_pending);
+
+ ret = bdi_writeback_task(wb);
+
+ /*
+ * Remove us from the list
+ */
+ spin_lock(&bdi->wb_lock);
+ list_del_rcu(&wb->list);
+ spin_unlock(&bdi->wb_lock);
+
+ /*
+ * wait for rcu grace period to end, so we can free wb
+ */
+ synchronize_srcu(&bdi->srcu);
+
+ /*
+ * Flush any work that raced with us exiting. No new work
+ * will be added, since this bdi isn't discoverable anymore.
+ */
+ if (!list_empty(&bdi->work_list))
+ wb_do_writeback(wb, 1);
+
+ wb->task = NULL;
+ bdi_put_wb(bdi, wb);
+ return ret;
+}
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+ struct bdi_writeback *wb;
+ int ret = 0;
+
+ if (!bdi_wblist_needs_lock(bdi))
+ ret = wb_has_dirty_io(&bdi->wb);
+ else {
+ int idx;
+
+ idx = srcu_read_lock(&bdi->srcu);
+
+ list_for_each_entry_rcu(wb, &bdi->wb_list, list) {
+ ret = wb_has_dirty_io(wb);
+ if (ret)
+ break;
+ }
+
+ srcu_read_unlock(&bdi->srcu, idx);
+ }
+
+ return ret;
+}
+
+static void bdi_flush_io(struct backing_dev_info *bdi)
+{
+ struct writeback_control wbc = {
+ .bdi = bdi,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = NULL,
+ .range_cyclic = 1,
+ .nr_to_write = 1024,
+ };
+
+ generic_sync_bdi_inodes(NULL, &wbc);
+}
+
+/*
+ * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * or we risk deadlocking on ->s_umount. The longer term solution would be
+ * to implement sync_supers_bdi() or similar and simply do it from the
+ * bdi writeback tasks individually.
+ */
+static int bdi_sync_supers(void *unused)
+{
+ set_user_nice(current, 0);
+
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+
+ /*
+ * Do this periodically, like kupdated() did before.
+ */
+ sync_supers();
+ }
+
+ return 0;
+}
+
+static void arm_supers_timer(void)
+{
+ unsigned long next;
+
+ next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
+ mod_timer(&sync_supers_timer, round_jiffies_up(next));
+}
+
+static void sync_supers_timer_fn(unsigned long unused)
+{
+ wake_up_process(sync_supers_tsk);
+ arm_supers_timer();
+}
+
+static int bdi_forker_task(void *ptr)
+{
+ struct bdi_writeback *me = ptr;
+
+ bdi_task_init(me->bdi, me);
+
+ for (;;) {
+ struct backing_dev_info *bdi, *tmp;
+ struct bdi_writeback *wb;
+
+ /*
+ * Temporary measure, we want to make sure we don't see
+ * dirty data on the default backing_dev_info
+ */
+ if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
+ wb_do_writeback(me, 0);
+
+ spin_lock(&bdi_lock);
+
+ /*
+ * Check if any existing bdi's have dirty data without
+ * a thread registered. If so, set that up.
+ */
+ list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
+ if (bdi->wb.task)
+ continue;
+ if (list_empty(&bdi->work_list) &&
+ !bdi_has_dirty_io(bdi))
+ continue;
+
+ bdi_add_default_flusher_task(bdi);
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (list_empty(&bdi_pending_list)) {
+ unsigned long wait;
+
+ spin_unlock(&bdi_lock);
+ wait = msecs_to_jiffies(dirty_writeback_interval * 10);
+ schedule_timeout(wait);
+ try_to_freeze();
+ continue;
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * This is our real job - check for pending entries in
+ * bdi_pending_list, and create the tasks that got added
+ */
+ bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
+ bdi_list);
+ list_del_init(&bdi->bdi_list);
+ spin_unlock(&bdi_lock);
+
+ wb = bdi_new_wb(bdi);
+ if (!wb)
+ goto readd_flush;
+
+ wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
+ dev_name(bdi->dev));
+ /*
+ * If task creation fails, then readd the bdi to
+ * the pending list and force writeout of the bdi
+ * from this forker thread. That will free some memory
+ * and we can try again.
+ */
+ if (IS_ERR(wb->task)) {
+ wb->task = NULL;
+ bdi_put_wb(bdi, wb);
+readd_flush:
+ /*
+ * Add this 'bdi' to the back, so we get
+ * a chance to flush other bdi's to free
+ * memory.
+ */
+ spin_lock(&bdi_lock);
+ list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+ spin_unlock(&bdi_lock);
+
+ bdi_flush_io(bdi);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * bdi_lock held on entry
+ */
+static void bdi_add_one_flusher_task(struct backing_dev_info *bdi,
+ int(*func)(struct backing_dev_info *))
+{
+ if (!bdi_cap_writeback_dirty(bdi))
+ return;
+
+ if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
+ printk("bdi %p/%s is not registered!\n", bdi, bdi->name);
+ return;
+ }
+
+ /*
+ * Check with the helper whether to proceed adding a task. Will only
+ * abort if we two or more simultanous calls to
+ * bdi_add_default_flusher_task() occured, further additions will block
+ * waiting for previous additions to finish.
+ */
+ if (!func(bdi)) {
+ list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+
+ /*
+ * We are now on the pending list, wake up bdi_forker_task()
+ * to finish the job and add us back to the active bdi_list
+ */
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
+}
+
+static int flusher_add_helper_block(struct backing_dev_info *bdi)
+{
+ spin_unlock(&bdi_lock);
+ wait_on_bit_lock(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&bdi_lock);
+ return 0;
+}
+
+static int flusher_add_helper_test(struct backing_dev_info *bdi)
+{
+ return test_and_set_bit(BDI_pending, &bdi->state);
+}
+
+/*
+ * Add the default flusher task that gets created for any bdi
+ * that has dirty data pending writeout
+ */
+void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
+{
+ bdi_add_one_flusher_task(bdi, flusher_add_helper_test);
+}
+
+/**
+ * bdi_add_flusher_task - add one more flusher task to this @bdi
+ * @bdi: the bdi
+ *
+ * Add an additional flusher task to this @bdi. Will block waiting on
+ * previous additions, if any.
+ *
+ */
+void bdi_add_flusher_task(struct backing_dev_info *bdi)
+{
+ spin_lock(&bdi_lock);
+ bdi_add_one_flusher_task(bdi, flusher_add_helper_block);
+ spin_unlock(&bdi_lock);
+}
+EXPORT_SYMBOL(bdi_add_flusher_task);
+
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
@@ -211,9 +648,42 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
goto exit;
}
+ spin_lock(&bdi_lock);
+ list_add_tail(&bdi->bdi_list, &bdi_list);
+ spin_unlock(&bdi_lock);
+
bdi->dev = dev;
- bdi_debug_register(bdi, dev_name(dev));
+ /*
+ * Just start the forker thread for our default backing_dev_info,
+ * and add other bdi's to the list. They will get a thread created
+ * on-demand when they need it.
+ */
+ if (bdi_cap_flush_forker(bdi)) {
+ struct bdi_writeback *wb;
+
+ wb = bdi_new_wb(bdi);
+ if (!wb) {
+ ret = -ENOMEM;
+ goto remove_err;
+ }
+
+ wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+ dev_name(dev));
+ if (IS_ERR(wb->task)) {
+ wb->task = NULL;
+ bdi_put_wb(bdi, wb);
+ ret = -ENOMEM;
+remove_err:
+ spin_lock(&bdi_lock);
+ list_del(&bdi->bdi_list);
+ spin_unlock(&bdi_lock);
+ goto exit;
+ }
+ }
+
+ bdi_debug_register(bdi, dev_name(dev));
+ set_bit(BDI_registered, &bdi->state);
exit:
return ret;
}
@@ -225,9 +695,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
}
EXPORT_SYMBOL(bdi_register_dev);
+/*
+ * Remove bdi from the global list and shutdown any threads we have running
+ */
+static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+{
+ struct bdi_writeback *wb;
+
+ if (!bdi_cap_writeback_dirty(bdi))
+ return;
+
+ /*
+ * If setup is pending, wait for that to complete first
+ */
+ wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
+
+ /*
+ * Make sure nobody finds us on the bdi_list anymore
+ */
+ spin_lock(&bdi_lock);
+ list_del(&bdi->bdi_list);
+ spin_unlock(&bdi_lock);
+
+ /*
+ * Finally, kill the kernel threads. We don't need to be RCU
+ * safe anymore, since the bdi is gone from visibility.
+ */
+ list_for_each_entry(wb, &bdi->wb_list, list)
+ kthread_stop(wb->task);
+}
+
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ if (!bdi_cap_flush_forker(bdi))
+ bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
device_unregister(bdi->dev);
bdi->dev = NULL;
@@ -237,14 +740,21 @@ EXPORT_SYMBOL(bdi_unregister);
int bdi_init(struct backing_dev_info *bdi)
{
- int i;
- int err;
+ int i, err;
bdi->dev = NULL;
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = PROP_FRAC_BASE;
+ spin_lock_init(&bdi->wb_lock);
+ bdi->wb_mask = 0;
+ bdi->wb_cnt = 0;
+ INIT_LIST_HEAD(&bdi->bdi_list);
+ INIT_LIST_HEAD(&bdi->wb_list);
+ INIT_LIST_HEAD(&bdi->work_list);
+
+ bdi_wb_init(&bdi->wb, bdi);
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -252,10 +762,15 @@ int bdi_init(struct backing_dev_info *bdi)
goto err;
}
+ err = init_srcu_struct(&bdi->srcu);
+ if (err)
+ goto err;
+
bdi->dirty_exceeded = 0;
err = prop_local_init_percpu(&bdi->completions);
if (err) {
+ cleanup_srcu_struct(&bdi->srcu);
err:
while (i--)
percpu_counter_destroy(&bdi->bdi_stat[i]);
@@ -269,8 +784,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
int i;
+ WARN_ON(bdi_has_dirty_io(bdi));
+
bdi_unregister(bdi);
+ cleanup_srcu_struct(&bdi->srcu);
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0351e31f474..03a9c8b760d3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
return 1UL << (hstate->order + PAGE_SHIFT);
}
+EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
/*
* Return the page size being used by the MMU to back a VMA. In the majority
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index d5292fc6f523..177a5169bbde 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -36,7 +36,7 @@ struct test_node {
};
static LIST_HEAD(test_list);
-static DEFINE_PER_CPU(void *, test_pointer);
+static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
/*
* Some very simple testing. This function needs to be extended for
@@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void)
}
for_each_possible_cpu(i) {
- per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL);
+ per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
pr_info("kmemleak: kmalloc(129) = %p\n",
- per_cpu(test_pointer, i));
+ per_cpu(kmemleak_test_pointer, i));
}
return 0;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..3d0948234c7d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
#include <linux/pagevec.h>
/*
- * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES 1024
-
-/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
-static void background_writeout(unsigned long _min_pages);
-
/*
* Scale the writeback cache size proportional to the relative writeout speeds.
*
@@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
/*
*
*/
-static DEFINE_SPINLOCK(bdi_lock);
static unsigned int bdi_min_ratio;
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
- unsigned long flags;
- spin_lock_irqsave(&bdi_lock, flags);
+ spin_lock(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
@@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
ret = -EINVAL;
}
}
- spin_unlock_irqrestore(&bdi_lock, flags);
+ spin_unlock(&bdi_lock);
return ret;
}
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
- unsigned long flags;
int ret = 0;
if (max_ratio > 100)
return -EINVAL;
- spin_lock_irqsave(&bdi_lock, flags);
+ spin_lock(&bdi_lock);
if (bdi->min_ratio > max_ratio) {
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
}
- spin_unlock_irqrestore(&bdi_lock, flags);
+ spin_unlock(&bdi_lock);
return ret;
}
@@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
* up.
*/
if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes(&wbc);
+ generic_sync_bdi_inodes(NULL, &wbc);
pages_written += write_chunk - wbc.nr_to_write;
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
@@ -597,7 +583,7 @@ static void balance_dirty_pages(struct address_space *mapping)
(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+ global_page_state(NR_UNSTABLE_NFS)
> background_thresh)))
- pdflush_operation(background_writeout, 0);
+ bdi_start_writeback(bdi, NULL, 0, WB_SYNC_NONE);
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -610,6 +596,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
}
}
+static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
* @mapping: address_space which was dirtied
@@ -627,7 +615,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
- static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
unsigned long ratelimit;
unsigned long *p;
@@ -640,7 +627,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
* tasks in balance_dirty_pages(). Period.
*/
preempt_disable();
- p = &__get_cpu_var(ratelimits);
+ p = &__get_cpu_var(bdp_ratelimits);
*p += nr_pages_dirtied;
if (unlikely(*p >= ratelimit)) {
*p = 0;
@@ -682,152 +669,53 @@ void throttle_vm_writeout(gfp_t gfp_mask)
}
/*
- * writeback at least _min_pages, and keep writing until the amount of dirty
- * memory is less than the background threshold, or until we're all clean.
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
*/
-static void background_writeout(unsigned long _min_pages)
+void wakeup_flusher_threads(long nr_pages)
{
- long min_pages = _min_pages;
struct writeback_control wbc = {
- .bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
- .nr_to_write = 0,
- .nonblocking = 1,
.range_cyclic = 1,
};
- for ( ; ; ) {
- unsigned long background_thresh;
- unsigned long dirty_thresh;
-
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
- && min_pages <= 0)
- break;
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- wbc.pages_skipped = 0;
- writeback_inodes(&wbc);
- min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
- /* Wrote less than expected */
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- else
- break;
- }
- }
-}
-
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
- * -1 if all pdflush threads were busy.
- */
-int wakeup_pdflush(long nr_pages)
-{
if (nr_pages == 0)
nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- return pdflush_operation(background_writeout, nr_pages);
+ wbc.nr_to_write = nr_pages;
+ bdi_writeback_all(NULL, &wbc);
}
-static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);
-static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
/*
- * Periodic writeback of "old" data.
- *
- * Define "old": the first time one of an inode's pages is dirtied, we mark the
- * dirtying-time in the inode's address_space. So this periodic writeback code
- * just walks the superblock inode list, writing back any inodes which are
- * older than a specific point in time.
- *
- * Try to run once per dirty_writeback_interval. But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
- * one-second gap.
- *
- * older_than_this takes precedence over nr_to_write. So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
- */
-static void wb_kupdate(unsigned long arg)
-{
- unsigned long oldest_jif;
- unsigned long start_jif;
- unsigned long next_jif;
- long nr_to_write;
- struct writeback_control wbc = {
- .bdi = NULL,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = &oldest_jif,
- .nr_to_write = 0,
- .nonblocking = 1,
- .for_kupdate = 1,
- .range_cyclic = 1,
- };
-
- sync_supers();
-
- oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
- start_jif = jiffies;
- next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
- nr_to_write = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
- while (nr_to_write > 0) {
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- writeback_inodes(&wbc);
- if (wbc.nr_to_write > 0) {
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- else
- break; /* All the old data is written */
- }
- nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- }
- if (time_before(next_jif, jiffies + HZ))
- next_jif = jiffies + HZ;
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, next_jif);
-}
-
-/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, file, buffer, length, ppos);
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, jiffies +
- msecs_to_jiffies(dirty_writeback_interval * 10));
- else
- del_timer(&wb_timer);
return 0;
}
-static void wb_timer_fn(unsigned long unused)
+static void do_laptop_sync(struct work_struct *work)
{
- if (pdflush_operation(wb_kupdate, 0) < 0)
- mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
-}
-
-static void laptop_flush(unsigned long unused)
-{
- sys_sync();
+ wakeup_flusher_threads(0);
+ kfree(work);
}
static void laptop_timer_fn(unsigned long unused)
{
- pdflush_operation(laptop_flush, 0);
+ struct work_struct *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(work, do_laptop_sync);
+ schedule_work(work);
+ }
}
/*
@@ -910,8 +798,6 @@ void __init page_writeback_init(void)
{
int shift;
- mod_timer(&wb_timer,
- jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * mm/pdflush.c - worker threads for writing back filesystem data
- *
- * Copyright (C) 2002, Linus Torvalds.
- *
- * 09Apr2002 Andrew Morton
- * Initial version
- * 29Feb2004 kaos@sgi.com
- * Move worker thread creation to kthread to avoid chewing
- * up stack space with nested calls to kernel_thread.
- */
-
-#include <linux/sched.h>
-#include <linux/list.h>
-#include <linux/signal.h>
-#include <linux/spinlock.h>
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/fs.h> /* Needed by writeback.h */
-#include <linux/writeback.h> /* Prototypes pdflush_operation() */
-#include <linux/kthread.h>
-#include <linux/cpuset.h>
-#include <linux/freezer.h>
-
-
-/*
- * Minimum and maximum number of pdflush instances
- */
-#define MIN_PDFLUSH_THREADS 2
-#define MAX_PDFLUSH_THREADS 8
-
-static void start_one_pdflush_thread(void);
-
-
-/*
- * The pdflush threads are worker threads for writing back dirty data.
- * Ideally, we'd like one thread per active disk spindle. But the disk
- * topology is very hard to divine at this level. Instead, we take
- * care in various places to prevent more than one pdflush thread from
- * performing writeback against a single filesystem. pdflush threads
- * have the PF_FLUSHER flag set in current->flags to aid in this.
- */
-
-/*
- * All the pdflush threads. Protected by pdflush_lock
- */
-static LIST_HEAD(pdflush_list);
-static DEFINE_SPINLOCK(pdflush_lock);
-
-/*
- * The count of currently-running pdflush threads. Protected
- * by pdflush_lock.
- *
- * Readable by sysctl, but not writable. Published to userspace at
- * /proc/sys/vm/nr_pdflush_threads.
- */
-int nr_pdflush_threads = 0;
-
-/*
- * The time at which the pdflush thread pool last went empty
- */
-static unsigned long last_empty_jifs;
-
-/*
- * The pdflush thread.
- *
- * Thread pool management algorithm:
- *
- * - The minimum and maximum number of pdflush instances are bound
- * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
- *
- * - If there have been no idle pdflush instances for 1 second, create
- * a new one.
- *
- * - If the least-recently-went-to-sleep pdflush thread has been asleep
- * for more than one second, terminate a thread.
- */
-
-/*
- * A structure for passing work to a pdflush thread. Also for passing
- * state information between pdflush threads. Protected by pdflush_lock.
- */
-struct pdflush_work {
- struct task_struct *who; /* The thread */
- void (*fn)(unsigned long); /* A callback function */
- unsigned long arg0; /* An argument to the callback */
- struct list_head list; /* On pdflush_list, when idle */
- unsigned long when_i_went_to_sleep;
-};
-
-static int __pdflush(struct pdflush_work *my_work)
-{
- current->flags |= PF_FLUSHER | PF_SWAPWRITE;
- set_freezable();
- my_work->fn = NULL;
- my_work->who = current;
- INIT_LIST_HEAD(&my_work->list);
-
- spin_lock_irq(&pdflush_lock);
- for ( ; ; ) {
- struct pdflush_work *pdf;
-
- set_current_state(TASK_INTERRUPTIBLE);
- list_move(&my_work->list, &pdflush_list);
- my_work->when_i_went_to_sleep = jiffies;
- spin_unlock_irq(&pdflush_lock);
- schedule();
- try_to_freeze();
- spin_lock_irq(&pdflush_lock);
- if (!list_empty(&my_work->list)) {
- /*
- * Someone woke us up, but without removing our control
- * structure from the global list. swsusp will do this
- * in try_to_freeze()->refrigerator(). Handle it.
- */
- my_work->fn = NULL;
- continue;
- }
- if (my_work->fn == NULL) {
- printk("pdflush: bogus wakeup\n");
- continue;
- }
- spin_unlock_irq(&pdflush_lock);
-
- (*my_work->fn)(my_work->arg0);
-
- spin_lock_irq(&pdflush_lock);
-
- /*
- * Thread creation: For how long have there been zero
- * available threads?
- *
- * To throttle creation, we reset last_empty_jifs.
- */
- if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
- if (list_empty(&pdflush_list)) {
- if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
- last_empty_jifs = jiffies;
- nr_pdflush_threads++;
- spin_unlock_irq(&pdflush_lock);
- start_one_pdflush_thread();
- spin_lock_irq(&pdflush_lock);
- }
- }
- }
-
- my_work->fn = NULL;
-
- /*
- * Thread destruction: For how long has the sleepiest
- * thread slept?
- */
- if (list_empty(&pdflush_list))
- continue;
- if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
- continue;
- pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
- if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
- /* Limit exit rate */
- pdf->when_i_went_to_sleep = jiffies;
- break; /* exeunt */
- }
- }
- nr_pdflush_threads--;
- spin_unlock_irq(&pdflush_lock);
- return 0;
-}
-
-/*
- * Of course, my_work wants to be just a local in __pdflush(). It is
- * separated out in this manner to hopefully prevent the compiler from
- * performing unfortunate optimisations against the auto variables. Because
- * these are visible to other tasks and CPUs. (No problem has actually
- * been observed. This is just paranoia).
- */
-static int pdflush(void *dummy)
-{
- struct pdflush_work my_work;
- cpumask_var_t cpus_allowed;
-
- /*
- * Since the caller doesn't even check kthread_run() worked, let's not
- * freak out too much if this fails.
- */
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
- return 0;
- }
-
- /*
- * pdflush can spend a lot of time doing encryption via dm-crypt. We
- * don't want to do that at keventd's priority.
- */
- set_user_nice(current, 0);
-
- /*
- * Some configs put our parent kthread in a limited cpuset,
- * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
- * Our needs are more modest - cut back to our cpusets cpus_allowed.
- * This is needed as pdflush's are dynamically created and destroyed.
- * The boottime pdflush's are easily placed w/o these 2 lines.
- */
- cpuset_cpus_allowed(current, cpus_allowed);
- set_cpus_allowed_ptr(current, cpus_allowed);
- free_cpumask_var(cpus_allowed);
-
- return __pdflush(&my_work);
-}
-
-/*
- * Attempt to wake up a pdflush thread, and get it to do some work for you.
- * Returns zero if it indeed managed to find a worker thread, and passed your
- * payload to it.
- */
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
-{
- unsigned long flags;
- int ret = 0;
-
- BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
-
- spin_lock_irqsave(&pdflush_lock, flags);
- if (list_empty(&pdflush_list)) {
- ret = -1;
- } else {
- struct pdflush_work *pdf;
-
- pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
- list_del_init(&pdf->list);
- if (list_empty(&pdflush_list))
- last_empty_jifs = jiffies;
- pdf->fn = fn;
- pdf->arg0 = arg0;
- wake_up_process(pdf->who);
- }
- spin_unlock_irqrestore(&pdflush_lock, flags);
-
- return ret;
-}
-
-static void start_one_pdflush_thread(void)
-{
- struct task_struct *k;
-
- k = kthread_run(pdflush, NULL, "pdflush");
- if (unlikely(IS_ERR(k))) {
- spin_lock_irq(&pdflush_lock);
- nr_pdflush_threads--;
- spin_unlock_irq(&pdflush_lock);
- }
-}
-
-static int __init pdflush_init(void)
-{
- int i;
-
- /*
- * Pre-set nr_pdflush_threads... If we fail to create,
- * the count will be decremented.
- */
- nr_pdflush_threads = MIN_PDFLUSH_THREADS;
-
- for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
- start_one_pdflush_thread();
- return 0;
-}
-
-module_init(pdflush_init);
diff --git a/mm/percpu.c b/mm/percpu.c
index b70f2acd8853..b3d0bcff8c7c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
*
* This is percpu allocator which can handle both static and dynamic
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
- * chunk is consisted of num_possible_cpus() units and the first chunk
- * is used for static percpu variables in the kernel image (special
- * boot time alloc/init handling necessary as these areas need to be
- * brought up before allocation services are running). Unit grows as
- * necessary and all units grow or shrink in unison. When a chunk is
- * filled up, another chunk is allocated. ie. in vmalloc area
+ * chunk is consisted of boot-time determined number of units and the
+ * first chunk is used for static percpu variables in the kernel image
+ * (special boot time alloc/init handling necessary as these areas
+ * need to be brought up before allocation services are running).
+ * Unit grows as necessary and all units grow or shrink in unison.
+ * When a chunk is filled up, another chunk is allocated. ie. in
+ * vmalloc area
*
* c0 c1 c2
* ------------------- ------------------- ------------
@@ -22,11 +23,13 @@
*
* Allocation is done in offset-size areas of single unit space. Ie,
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
- * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
- * percpu base registers pcpu_unit_size apart.
+ * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
+ * cpus. On NUMA, the mapping can be non-linear and even sparse.
+ * Percpu access can be done by configuring percpu base registers
+ * according to cpu to unit mapping and pcpu_unit_size.
*
- * There are usually many small percpu allocations many of them as
- * small as 4 bytes. The allocator organizes chunks into lists
+ * There are usually many small percpu allocations many of them being
+ * as small as 4 bytes. The allocator organizes chunks into lists
* according to free size and tries to allocate from the fullest one.
* Each chunk keeps the maximum contiguous area size hint which is
* guaranteed to be eqaul to or larger than the maximum contiguous
@@ -43,7 +46,7 @@
*
* To use this allocator, arch code should do the followings.
*
- * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
*
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back if they need to be
@@ -56,6 +59,7 @@
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/list.h>
+#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -94,20 +98,27 @@ struct pcpu_chunk {
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
bool immutable; /* no [de]population allowed */
- struct page **page; /* points to page array */
- struct page *page_ar[]; /* #cpus * UNIT_PAGES */
+ unsigned long populated[]; /* populated bitmap */
};
static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
+static int pcpu_nr_units __read_mostly;
static int pcpu_chunk_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;
+/* cpus with the lowest and highest unit numbers */
+static unsigned int pcpu_first_unit_cpu __read_mostly;
+static unsigned int pcpu_last_unit_cpu __read_mostly;
+
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
+/* cpu -> unit map */
+const int *pcpu_unit_map __read_mostly;
+
/*
* The first chunk which always exists. Note that unlike other
* chunks, this one can be allocated and mapped in several different
@@ -129,9 +140,9 @@ static int pcpu_reserved_chunk_limit;
* Synchronization rules.
*
* There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
- * protects allocation/reclaim paths, chunks and chunk->page arrays.
- * The latter is a spinlock and protects the index data structures -
- * chunk slots, chunks and area maps in chunks.
+ * protects allocation/reclaim paths, chunks, populated bitmap and
+ * vmalloc mapping. The latter is a spinlock and protects the index
+ * data structures - chunk slots, chunks and area maps in chunks.
*
* During allocation, pcpu_alloc_mutex is kept locked all the time and
* pcpu_lock is grabbed and released as necessary. All actual memory
@@ -178,13 +189,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
- return cpu * pcpu_unit_pages + page_idx;
-}
-
-static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
- unsigned int cpu, int page_idx)
-{
- return &chunk->page[pcpu_page_idx(cpu, page_idx)];
+ return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
@@ -194,10 +199,13 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
}
-static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
- int page_idx)
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
{
- return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+ /* must not be used on pre-mapped chunk */
+ WARN_ON(chunk->immutable);
+
+ return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}
/* set the pointer to a chunk in a page struct */
@@ -212,6 +220,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
return (struct pcpu_chunk *)page->index;
}
+static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+ *rs = find_next_zero_bit(chunk->populated, end, *rs);
+ *re = find_next_bit(chunk->populated, end, *rs + 1);
+}
+
+static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+ *rs = find_next_bit(chunk->populated, end, *rs);
+ *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
+}
+
+/*
+ * (Un)populated page region iterators. Iterate over (un)populated
+ * page regions betwen @start and @end in @chunk. @rs and @re should
+ * be integer variables and will be set to start and end page index of
+ * the current region.
+ */
+#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+
+#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+
/**
* pcpu_mem_alloc - allocate memory
* @size: bytes to allocate
@@ -290,13 +326,21 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
void *first_start = pcpu_first_chunk->vm->addr;
/* is it in the first chunk? */
- if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
+ if (addr >= first_start && addr < first_start + pcpu_unit_size) {
/* is it in the reserved area? */
if (addr < first_start + pcpu_reserved_chunk_limit)
return pcpu_reserved_chunk;
return pcpu_first_chunk;
}
+ /*
+ * The address is relative to unit0 which might be unused and
+ * thus unmapped. Offset the address to the unit space of the
+ * current processor before looking it up in the vmalloc
+ * space. Note that any possible cpu id can be used here, so
+ * there's no need to worry about preemption or cpu hotplug.
+ */
+ addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
return pcpu_get_page_chunk(vmalloc_to_page(addr));
}
@@ -545,125 +589,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
}
/**
- * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
* @chunk: chunk of interest
- * @page_start: page index of the first page to unmap
- * @page_end: page index of the last page to unmap + 1
- * @flush_tlb: whether to flush tlb or not
+ * @bitmapp: output parameter for bitmap
+ * @may_alloc: may allocate the array
*
- * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
- * If @flush is true, vcache is flushed before unmapping and tlb
- * after.
+ * Returns pointer to array of pointers to struct page and bitmap,
+ * both of which can be indexed with pcpu_page_idx(). The returned
+ * array is cleared to zero and *@bitmapp is copied from
+ * @chunk->populated. Note that there is only one array and bitmap
+ * and access exclusion is the caller's responsibility.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
+ * Otherwise, don't care.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL on failure.
*/
-static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
- bool flush_tlb)
+static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
+ unsigned long **bitmapp,
+ bool may_alloc)
{
- unsigned int last = num_possible_cpus() - 1;
- unsigned int cpu;
+ static struct page **pages;
+ static unsigned long *bitmap;
+ size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+ size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
+ sizeof(unsigned long);
+
+ if (!pages || !bitmap) {
+ if (may_alloc && !pages)
+ pages = pcpu_mem_alloc(pages_size);
+ if (may_alloc && !bitmap)
+ bitmap = pcpu_mem_alloc(bitmap_size);
+ if (!pages || !bitmap)
+ return NULL;
+ }
- /* unmap must not be done on immutable chunk */
- WARN_ON(chunk->immutable);
+ memset(pages, 0, pages_size);
+ bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
- /*
- * Each flushing trial can be very expensive, issue flush on
- * the whole region at once rather than doing it for each cpu.
- * This could be an overkill but is more scalable.
- */
- flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
+ *bitmapp = bitmap;
+ return pages;
+}
- for_each_possible_cpu(cpu)
- unmap_kernel_range_noflush(
- pcpu_chunk_addr(chunk, cpu, page_start),
- (page_end - page_start) << PAGE_SHIFT);
-
- /* ditto as flush_cache_vunmap() */
- if (flush_tlb)
- flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start and @page_end) in @pages for all units.
+ * The pages were allocated for @chunk.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page = pages[pcpu_page_idx(cpu, i)];
+
+ if (page)
+ __free_page(page);
+ }
+ }
}
/**
- * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
- * @chunk: chunk to depopulate
- * @off: offset to the area to depopulate
- * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
- *
- * For each cpu, depopulate and unmap pages [@page_start,@page_end)
- * from @chunk. If @flush is true, vcache is flushed before unmapping
- * and tlb after.
- *
- * CONTEXT:
- * pcpu_alloc_mutex.
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk. Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
*/
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
- bool flush)
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
{
- int page_start = PFN_DOWN(off);
- int page_end = PFN_UP(off + size);
- int unmap_start = -1;
- int uninitialized_var(unmap_end);
+ const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
unsigned int cpu;
int i;
- for (i = page_start; i < page_end; i++) {
- for_each_possible_cpu(cpu) {
- struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+ *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+ if (!*pagep) {
+ pcpu_free_pages(chunk, pages, populated,
+ page_start, page_end);
+ return -ENOMEM;
+ }
+ }
+ }
+ return 0;
+}
- if (!*pagep)
- continue;
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped. Flush cache. As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu. This could be an overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vunmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
- __free_page(*pagep);
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+ unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+}
- /*
- * If it's partial depopulation, it might get
- * populated or depopulated again. Mark the
- * page gone.
- */
- *pagep = NULL;
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished. The caller should call
+ * proper pre/post flush functions.
+ */
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
- unmap_start = unmap_start < 0 ? i : unmap_start;
- unmap_end = i + 1;
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page;
+
+ page = pcpu_chunk_page(chunk, cpu, i);
+ WARN_ON(!page);
+ pages[pcpu_page_idx(cpu, i)] = page;
}
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ page_end - page_start);
}
- if (unmap_start >= 0)
- pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+ for (i = page_start; i < page_end; i++)
+ __clear_bit(i, populated);
}
/**
- * pcpu_map - map pages into a pcpu_chunk
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
+ * TLB for the regions. This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_tlb_kernel_range(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+static int __pcpu_map_pages(unsigned long addr, struct page **pages,
+ int nr_pages)
+{
+ return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
+ PAGE_KERNEL, pages);
+}
+
+/**
+ * pcpu_map_pages - map pages into a pcpu_chunk
* @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @populated: populated bitmap
* @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1
*
- * For each cpu, map pages [@page_start,@page_end) into @chunk.
- * vcache is flushed afterwards.
+ * For each cpu, map pages [@page_start,@page_end) into @chunk. The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting corresponding bits in
+ * @chunk->populated bitmap and whatever is necessary for reverse
+ * lookup (addr -> chunk).
*/
-static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
{
- unsigned int last = num_possible_cpus() - 1;
- unsigned int cpu;
- int err;
-
- /* map must not be done on immutable chunk */
- WARN_ON(chunk->immutable);
+ unsigned int cpu, tcpu;
+ int i, err;
for_each_possible_cpu(cpu) {
- err = map_kernel_range_noflush(
- pcpu_chunk_addr(chunk, cpu, page_start),
- (page_end - page_start) << PAGE_SHIFT,
- PAGE_KERNEL,
- pcpu_chunk_pagep(chunk, cpu, page_start));
+ err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ &pages[pcpu_page_idx(cpu, page_start)],
+ page_end - page_start);
if (err < 0)
- return err;
+ goto err;
+ }
+
+ /* mapping successful, link chunk and mark populated */
+ for (i = page_start; i < page_end; i++) {
+ for_each_possible_cpu(cpu)
+ pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+ chunk);
+ __set_bit(i, populated);
}
- /* flush at once, please read comments in pcpu_unmap() */
- flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
return 0;
+
+err:
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+ page_end - page_start);
+ }
+ return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk. If @flush is true, vcache is flushed before unmapping
+ * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ struct page **pages;
+ unsigned long *populated;
+ int rs, re;
+
+ /* quick path, check whether it's empty already */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ if (rs == page_start && re == page_end)
+ return;
+ break;
+ }
+
+ /* immutable chunks can't be depopulated */
+ WARN_ON(chunk->immutable);
+
+ /*
+ * If control reaches here, there must have been at least one
+ * successful population attempt so the temp pages array must
+ * be available now.
+ */
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+ BUG_ON(!pages);
+
+ /* unmap and free */
+ pcpu_pre_unmap_flush(chunk, page_start, page_end);
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+
+ /* no need to flush tlb, vmalloc will handle it lazily */
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
}
/**
@@ -680,50 +926,60 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
- const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
- int map_start = -1;
- int uninitialized_var(map_end);
+ int free_end = page_start, unmap_end = page_start;
+ struct page **pages;
+ unsigned long *populated;
unsigned int cpu;
- int i;
+ int rs, re, rc;
- for (i = page_start; i < page_end; i++) {
- if (pcpu_chunk_page_occupied(chunk, i)) {
- if (map_start >= 0) {
- if (pcpu_map(chunk, map_start, map_end))
- goto err;
- map_start = -1;
- }
- continue;
- }
+ /* quick path, check whether all pages are already there */
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
+ if (rs == page_start && re == page_end)
+ goto clear;
+ break;
+ }
- map_start = map_start < 0 ? i : map_start;
- map_end = i + 1;
+ /* need to allocate and map pages, this chunk can't be immutable */
+ WARN_ON(chunk->immutable);
- for_each_possible_cpu(cpu) {
- struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
+ if (!pages)
+ return -ENOMEM;
- *pagep = alloc_pages_node(cpu_to_node(cpu),
- alloc_mask, 0);
- if (!*pagep)
- goto err;
- pcpu_set_page_chunk(*pagep, chunk);
- }
+ /* alloc and map */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_free;
+ free_end = re;
}
- if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
- goto err;
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_map_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_unmap;
+ unmap_end = re;
+ }
+ pcpu_post_map_flush(chunk, page_start, page_end);
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+clear:
for_each_possible_cpu(cpu)
- memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
- size);
-
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
return 0;
-err:
- /* likely under heavy memory pressure, give memory back */
- pcpu_depopulate_chunk(chunk, off, size, true);
- return -ENOMEM;
+
+err_unmap:
+ pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+ pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
+err_free:
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+ return rc;
}
static void free_pcpu_chunk(struct pcpu_chunk *chunk)
@@ -747,7 +1003,6 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
- chunk->page = chunk->page_ar;
chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
if (!chunk->vm) {
@@ -847,6 +1102,7 @@ area_found:
mutex_unlock(&pcpu_alloc_mutex);
+ /* return address relative to unit0 */
return __addr_to_pcpu_ptr(chunk->vm->addr + off);
fail_unlock:
@@ -928,7 +1184,7 @@ static void pcpu_reclaim(struct work_struct *work)
mutex_unlock(&pcpu_alloc_mutex);
list_for_each_entry_safe(chunk, next, &todo, list) {
- pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+ pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
free_pcpu_chunk(chunk);
}
}
@@ -976,26 +1232,16 @@ EXPORT_SYMBOL_GPL(free_percpu);
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
- * @get_page_fn: callback to fetch page pointer
* @static_size: the size of static percpu area in bytes
- * @reserved_size: the size of reserved percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes, 0 for none
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
- * @base_addr: mapped address, NULL for auto
- * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
+ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
+ * @base_addr: mapped address
+ * @unit_map: cpu -> unit map, NULL for sequential mapping
*
* Initialize the first percpu chunk which contains the kernel static
* perpcu area. This function is to be called from arch percpu area
- * setup path. The first two parameters are mandatory. The rest are
- * optional.
- *
- * @get_page_fn() should return pointer to percpu page given cpu
- * number and page number. It should at least return enough pages to
- * cover the static area. The returned pages for static area should
- * have been initialized with valid data. If @unit_size is specified,
- * it can also return pages after the static area. NULL return
- * indicates end of pages for the cpu. Note that @get_page_fn() must
- * return the same number of pages for all cpus.
+ * setup path.
*
* @reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
@@ -1010,17 +1256,12 @@ EXPORT_SYMBOL_GPL(free_percpu);
* non-negative value makes percpu leave alone the area beyond
* @static_size + @reserved_size + @dyn_size.
*
- * @unit_size, if non-negative, specifies unit size and must be
- * aligned to PAGE_SIZE and equal to or larger than @static_size +
- * @reserved_size + if non-negative, @dyn_size.
- *
- * Non-null @base_addr means that the caller already allocated virtual
- * region for the first chunk and mapped it. percpu must not mess
- * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
- * @populate_pte_fn doesn't make any sense.
+ * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
+ * equal to or larger than @static_size + @reserved_size + if
+ * non-negative, @dyn_size.
*
- * @populate_pte_fn is used to populate the pagetable. NULL means the
- * caller already populated the pagetable.
+ * The caller should have mapped the first chunk at @base_addr and
+ * copied static data to each unit.
*
* If the first chunk ends up with both reserved and dynamic areas, it
* is served by two chunks - one to serve the core static and reserved
@@ -1033,47 +1274,83 @@ EXPORT_SYMBOL_GPL(free_percpu);
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
*/
-size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
- size_t static_size, size_t reserved_size,
- ssize_t dyn_size, ssize_t unit_size,
- void *base_addr,
- pcpu_populate_pte_fn_t populate_pte_fn)
+size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
+ ssize_t dyn_size, size_t unit_size,
+ void *base_addr, const int *unit_map)
{
static struct vm_struct first_vm;
static int smap[2], dmap[2];
size_t size_sum = static_size + reserved_size +
(dyn_size >= 0 ? dyn_size : 0);
struct pcpu_chunk *schunk, *dchunk = NULL;
- unsigned int cpu;
- int nr_pages;
- int err, i;
+ unsigned int cpu, tcpu;
+ int i;
- /* santiy checks */
+ /* sanity checks */
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
- if (unit_size >= 0) {
- BUG_ON(unit_size < size_sum);
- BUG_ON(unit_size & ~PAGE_MASK);
- BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
- } else
- BUG_ON(base_addr);
- BUG_ON(base_addr && populate_pte_fn);
-
- if (unit_size >= 0)
- pcpu_unit_pages = unit_size >> PAGE_SHIFT;
- else
- pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
- PFN_UP(size_sum));
+ BUG_ON(!base_addr);
+ BUG_ON(unit_size < size_sum);
+ BUG_ON(unit_size & ~PAGE_MASK);
+ BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
+
+ /* determine number of units and verify and initialize pcpu_unit_map */
+ if (unit_map) {
+ int first_unit = INT_MAX, last_unit = INT_MIN;
+
+ for_each_possible_cpu(cpu) {
+ int unit = unit_map[cpu];
+
+ BUG_ON(unit < 0);
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ /* the mapping should be one-to-one */
+ BUG_ON(unit_map[tcpu] == unit);
+ }
+
+ if (unit < first_unit) {
+ pcpu_first_unit_cpu = cpu;
+ first_unit = unit;
+ }
+ if (unit > last_unit) {
+ pcpu_last_unit_cpu = cpu;
+ last_unit = unit;
+ }
+ }
+ pcpu_nr_units = last_unit + 1;
+ pcpu_unit_map = unit_map;
+ } else {
+ int *identity_map;
+
+ /* #units == #cpus, identity mapped */
+ identity_map = alloc_bootmem(num_possible_cpus() *
+ sizeof(identity_map[0]));
+ for_each_possible_cpu(cpu)
+ identity_map[cpu] = cpu;
+
+ pcpu_first_unit_cpu = 0;
+ pcpu_last_unit_cpu = pcpu_nr_units - 1;
+ pcpu_nr_units = num_possible_cpus();
+ pcpu_unit_map = identity_map;
+ }
+
+ /* determine basic parameters */
+ pcpu_unit_pages = unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
- pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
- pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
- + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
+ pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
+ pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
+ BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
if (dyn_size < 0)
dyn_size = pcpu_unit_size - static_size - reserved_size;
+ first_vm.flags = VM_ALLOC;
+ first_vm.size = pcpu_chunk_size;
+ first_vm.addr = base_addr;
+
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
@@ -1095,7 +1372,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
schunk->vm = &first_vm;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
- schunk->page = schunk->page_ar;
+ schunk->immutable = true;
+ bitmap_fill(schunk->populated, pcpu_unit_pages);
if (reserved_size) {
schunk->free_size = reserved_size;
@@ -1113,93 +1391,39 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
/* init dynamic chunk if necessary */
if (dyn_size) {
- dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
+ dchunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&dchunk->list);
dchunk->vm = &first_vm;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
- dchunk->page = schunk->page_ar; /* share page map with schunk */
+ dchunk->immutable = true;
+ bitmap_fill(dchunk->populated, pcpu_unit_pages);
dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
dchunk->map[dchunk->map_used++] = dchunk->free_size;
}
- /* allocate vm address */
- first_vm.flags = VM_ALLOC;
- first_vm.size = pcpu_chunk_size;
-
- if (!base_addr)
- vm_area_register_early(&first_vm, PAGE_SIZE);
- else {
- /*
- * Pages already mapped. No need to remap into
- * vmalloc area. In this case the first chunks can't
- * be mapped or unmapped by percpu and are marked
- * immutable.
- */
- first_vm.addr = base_addr;
- schunk->immutable = true;
- if (dchunk)
- dchunk->immutable = true;
- }
-
- /* assign pages */
- nr_pages = -1;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < pcpu_unit_pages; i++) {
- struct page *page = get_page_fn(cpu, i);
-
- if (!page)
- break;
- *pcpu_chunk_pagep(schunk, cpu, i) = page;
- }
-
- BUG_ON(i < PFN_UP(static_size));
-
- if (nr_pages < 0)
- nr_pages = i;
- else
- BUG_ON(nr_pages != i);
- }
-
- /* map them */
- if (populate_pte_fn) {
- for_each_possible_cpu(cpu)
- for (i = 0; i < nr_pages; i++)
- populate_pte_fn(pcpu_chunk_addr(schunk,
- cpu, i));
-
- err = pcpu_map(schunk, 0, nr_pages);
- if (err)
- panic("failed to setup static percpu area, err=%d\n",
- err);
- }
-
/* link the first chunk in */
pcpu_first_chunk = dchunk ?: schunk;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* we're done */
- pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
+ pcpu_base_addr = schunk->vm->addr;
return pcpu_unit_size;
}
-/*
- * Embedding first chunk setup helper.
- */
-static void *pcpue_ptr __initdata;
-static size_t pcpue_size __initdata;
-static size_t pcpue_unit_size __initdata;
-
-static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
+static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
+ ssize_t *dyn_sizep)
{
- size_t off = (size_t)pageno << PAGE_SHIFT;
+ size_t size_sum;
- if (off >= pcpue_size)
- return NULL;
+ size_sum = PFN_ALIGN(static_size + reserved_size +
+ (*dyn_sizep >= 0 ? *dyn_sizep : 0));
+ if (*dyn_sizep != 0)
+ *dyn_sizep = size_sum - static_size - reserved_size;
- return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
+ return size_sum;
}
/**
@@ -1207,7 +1431,6 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
@@ -1219,9 +1442,9 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
* page size.
*
* When @dyn_size is positive, dynamic area might be larger than
- * specified to fill page alignment. Also, when @dyn_size is auto,
- * @dyn_size does not fill the whole first chunk but only what's
- * necessary for page alignment after static and reserved areas.
+ * specified to fill page alignment. When @dyn_size is auto,
+ * @dyn_size is just big enough to fill page alignment after static
+ * and reserved areas.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned to the bootmem allocator.
@@ -1231,28 +1454,21 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
- ssize_t dyn_size, ssize_t unit_size)
+ ssize_t dyn_size)
{
- size_t chunk_size;
+ size_t size_sum, unit_size, chunk_size;
+ void *base;
unsigned int cpu;
/* determine parameters and allocate */
- pcpue_size = PFN_ALIGN(static_size + reserved_size +
- (dyn_size >= 0 ? dyn_size : 0));
- if (dyn_size != 0)
- dyn_size = pcpue_size - static_size - reserved_size;
-
- if (unit_size >= 0) {
- BUG_ON(unit_size < pcpue_size);
- pcpue_unit_size = unit_size;
- } else
- pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
-
- chunk_size = pcpue_unit_size * num_possible_cpus();
-
- pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS));
- if (!pcpue_ptr) {
+ size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+
+ unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+ chunk_size = unit_size * num_possible_cpus();
+
+ base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS));
+ if (!base) {
pr_warning("PERCPU: failed to allocate %zu bytes for "
"embedding\n", chunk_size);
return -ENOMEM;
@@ -1260,18 +1476,540 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
/* return the leftover and copy */
for_each_possible_cpu(cpu) {
- void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
+ void *ptr = base + cpu * unit_size;
- free_bootmem(__pa(ptr + pcpue_size),
- pcpue_unit_size - pcpue_size);
+ free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
memcpy(ptr, __per_cpu_load, static_size);
}
/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
- pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+ size_sum >> PAGE_SHIFT, base, static_size);
+
+ return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+ unit_size, base, NULL);
+}
+
+/**
+ * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
+ * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
+ * @populate_pte_fn: function to populate pte
+ *
+ * This is a helper to ease setting up embedded first percpu chunk and
+ * can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * This is the basic allocator. Static percpu area is allocated
+ * page-by-page into vmalloc area.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access on success, -errno on failure.
+ */
+ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn,
+ pcpu_fc_populate_pte_fn_t populate_pte_fn)
+{
+ static struct vm_struct vm;
+ int unit_pages;
+ size_t pages_size;
+ struct page **pages;
+ unsigned int cpu;
+ int i, j;
+ ssize_t ret;
+
+ unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
+ PCPU_MIN_UNIT_SIZE));
+
+ /* unaligned allocations can't be freed, round up to page size */
+ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
+ sizeof(pages[0]));
+ pages = alloc_bootmem(pages_size);
+
+ /* allocate pages */
+ j = 0;
+ for_each_possible_cpu(cpu)
+ for (i = 0; i < unit_pages; i++) {
+ void *ptr;
+
+ ptr = alloc_fn(cpu, PAGE_SIZE);
+ if (!ptr) {
+ pr_warning("PERCPU: failed to allocate "
+ "4k page for cpu%u\n", cpu);
+ goto enomem;
+ }
+ pages[j++] = virt_to_page(ptr);
+ }
+
+ /* allocate vm area, map the pages and copy static data */
+ vm.flags = VM_ALLOC;
+ vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
+ vm_area_register_early(&vm, PAGE_SIZE);
+
+ for_each_possible_cpu(cpu) {
+ unsigned long unit_addr = (unsigned long)vm.addr +
+ (cpu * unit_pages << PAGE_SHIFT);
+
+ for (i = 0; i < unit_pages; i++)
+ populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
+
+ /* pte already populated, the following shouldn't fail */
+ ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
+ unit_pages);
+ if (ret < 0)
+ panic("failed to map percpu area, err=%zd\n", ret);
+
+ /*
+ * FIXME: Archs with virtual cache should flush local
+ * cache for the linear mapping here - something
+ * equivalent to flush_cache_vmap() on the local cpu.
+ * flush_cache_vmap() can't be used as most supporting
+ * data structures are not set up yet.
+ */
+
+ /* copy static data */
+ memcpy((void *)unit_addr, __per_cpu_load, static_size);
+ }
+
+ /* we're ready, commit */
+ pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
+ unit_pages, static_size);
+
+ ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
+ unit_pages << PAGE_SHIFT, vm.addr, NULL);
+ goto out_free_ar;
+
+enomem:
+ while (--j >= 0)
+ free_fn(page_address(pages[j]), PAGE_SIZE);
+ ret = -ENOMEM;
+out_free_ar:
+ free_bootmem(__pa(pages), pages_size);
+ return ret;
+}
+
+/*
+ * Large page remapping first chunk setup helper
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+
+/**
+ * pcpu_lpage_build_unit_map - build unit_map for large page remapping
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
+ * @unit_sizep: out parameter for unit size
+ * @unit_map: unit_map to be filled
+ * @cpu_distance_fn: callback to determine distance between cpus
+ *
+ * This function builds cpu -> unit map and determine other parameters
+ * considering needed percpu size, large page size and distances
+ * between CPUs in NUMA.
+ *
+ * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
+ * may share units in the same large page. The returned configuration
+ * is guaranteed to have CPUs on different nodes on different large
+ * pages and >=75% usage of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
+ * returns the number of units to be allocated. -errno on failure.
+ */
+int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
+ ssize_t *dyn_sizep, size_t *unit_sizep,
+ size_t lpage_size, int *unit_map,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+ static int group_map[NR_CPUS] __initdata;
+ static int group_cnt[NR_CPUS] __initdata;
+ int group_cnt_max = 0;
+ size_t size_sum, min_unit_size, alloc_size;
+ int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int last_allocs;
+ unsigned int cpu, tcpu;
+ int group, unit;
+
+ /*
+ * Determine min_unit_size, alloc_size and max_upa such that
+ * alloc_size is multiple of lpage_size and is the smallest
+ * which can accomodate 4k aligned segments which are equal to
+ * or larger than min_unit_size.
+ */
+ size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
+ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+ alloc_size = roundup(min_unit_size, lpage_size);
+ upa = alloc_size / min_unit_size;
+ while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ upa--;
+ max_upa = upa;
+
+ /* group cpus according to their proximity */
+ for_each_possible_cpu(cpu) {
+ group = 0;
+ next_group:
+ for_each_possible_cpu(tcpu) {
+ if (cpu == tcpu)
+ break;
+ if (group_map[tcpu] == group &&
+ (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+ cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+ group++;
+ goto next_group;
+ }
+ }
+ group_map[cpu] = group;
+ group_cnt[group]++;
+ group_cnt_max = max(group_cnt_max, group_cnt[group]);
+ }
+
+ /*
+ * Expand unit size until address space usage goes over 75%
+ * and then as much as possible without using more address
+ * space.
+ */
+ last_allocs = INT_MAX;
+ for (upa = max_upa; upa; upa--) {
+ int allocs = 0, wasted = 0;
+
+ if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ continue;
+
+ for (group = 0; group_cnt[group]; group++) {
+ int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+ allocs += this_allocs;
+ wasted += this_allocs * upa - group_cnt[group];
+ }
+
+ /*
+ * Don't accept if wastage is over 25%. The
+ * greater-than comparison ensures upa==1 always
+ * passes the following check.
+ */
+ if (wasted > num_possible_cpus() / 3)
+ continue;
+
+ /* and then don't consume more memory */
+ if (allocs > last_allocs)
+ break;
+ last_allocs = allocs;
+ best_upa = upa;
+ }
+ *unit_sizep = alloc_size / best_upa;
- return pcpu_setup_first_chunk(pcpue_get_page, static_size,
- reserved_size, dyn_size,
- pcpue_unit_size, pcpue_ptr, NULL);
+ /* assign units to cpus accordingly */
+ unit = 0;
+ for (group = 0; group_cnt[group]; group++) {
+ for_each_possible_cpu(cpu)
+ if (group_map[cpu] == group)
+ unit_map[cpu] = unit++;
+ unit = roundup(unit, best_upa);
+ }
+
+ return unit; /* unit contains aligned number of units */
+}
+
+struct pcpul_ent {
+ void *ptr;
+ void *map_addr;
+};
+
+static size_t pcpul_size;
+static size_t pcpul_lpage_size;
+static int pcpul_nr_lpages;
+static struct pcpul_ent *pcpul_map;
+
+static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
+ unsigned int *cpup)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ if (unit_map[cpu] == unit) {
+ if (cpup)
+ *cpup = cpu;
+ return true;
+ }
+
+ return false;
+}
+
+static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
+ size_t reserved_size, size_t dyn_size,
+ size_t unit_size, size_t lpage_size,
+ const int *unit_map, int nr_units)
+{
+ int width = 1, v = nr_units;
+ char empty_str[] = "--------";
+ int upl, lpl; /* units per lpage, lpage per line */
+ unsigned int cpu;
+ int lpage, unit;
+
+ while (v /= 10)
+ width++;
+ empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
+
+ upl = max_t(int, lpage_size / unit_size, 1);
+ lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
+
+ printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
+ static_size, reserved_size, dyn_size, unit_size, lpage_size);
+
+ for (lpage = 0, unit = 0; unit < nr_units; unit++) {
+ if (!(unit % upl)) {
+ if (!(lpage++ % lpl)) {
+ printk("\n");
+ printk("%spcpu-lpage: ", lvl);
+ } else
+ printk("| ");
+ }
+ if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+ printk("%0*d ", width, cpu);
+ else
+ printk("%s ", empty_str);
+ }
+ printk("\n");
+}
+
+/**
+ * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: free size for dynamic allocation in bytes
+ * @unit_size: unit size in bytes
+ * @lpage_size: the size of a large page
+ * @unit_map: cpu -> unit mapping
+ * @nr_units: the number of units
+ * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
+ * @free_fn: function to free percpu memory, @size <= lpage_size
+ * @map_fn: function to map percpu lpage, always called with lpage_size
+ *
+ * This allocator uses large page to build and map the first chunk.
+ * Unlike other helpers, the caller should always specify @dyn_size
+ * and @unit_size. These parameters along with @unit_map and
+ * @nr_units can be determined using pcpu_lpage_build_unit_map().
+ * This two stage initialization is to allow arch code to evaluate the
+ * parameters before committing to it.
+ *
+ * Large pages are allocated as directed by @unit_map and other
+ * parameters and mapped to vmalloc space. Unused holes are returned
+ * to the page allocator. Note that these holes end up being actively
+ * mapped twice - once to the physical mapping and to the vmalloc area
+ * for the first percpu chunk. Depending on architecture, this might
+ * cause problem when changing page attributes of the returned area.
+ * These double mapped areas can be detected using
+ * pcpu_lpage_remapped().
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access on success, -errno on failure.
+ */
+ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
+ size_t dyn_size, size_t unit_size,
+ size_t lpage_size, const int *unit_map,
+ int nr_units,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn,
+ pcpu_fc_map_fn_t map_fn)
+{
+ static struct vm_struct vm;
+ size_t chunk_size = unit_size * nr_units;
+ size_t map_size;
+ unsigned int cpu;
+ ssize_t ret;
+ int i, j, unit;
+
+ pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
+ unit_size, lpage_size, unit_map, nr_units);
+
+ BUG_ON(chunk_size % lpage_size);
+
+ pcpul_size = static_size + reserved_size + dyn_size;
+ pcpul_lpage_size = lpage_size;
+ pcpul_nr_lpages = chunk_size / lpage_size;
+
+ /* allocate pointer array and alloc large pages */
+ map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
+ pcpul_map = alloc_bootmem(map_size);
+
+ /* allocate all pages */
+ for (i = 0; i < pcpul_nr_lpages; i++) {
+ size_t offset = i * lpage_size;
+ int first_unit = offset / unit_size;
+ int last_unit = (offset + lpage_size - 1) / unit_size;
+ void *ptr;
+
+ /* find out which cpu is mapped to this unit */
+ for (unit = first_unit; unit <= last_unit; unit++)
+ if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+ goto found;
+ continue;
+ found:
+ ptr = alloc_fn(cpu, lpage_size);
+ if (!ptr) {
+ pr_warning("PERCPU: failed to allocate large page "
+ "for cpu%u\n", cpu);
+ goto enomem;
+ }
+
+ pcpul_map[i].ptr = ptr;
+ }
+
+ /* return unused holes */
+ for (unit = 0; unit < nr_units; unit++) {
+ size_t start = unit * unit_size;
+ size_t end = start + unit_size;
+ size_t off, next;
+
+ /* don't free used part of occupied unit */
+ if (pcpul_unit_to_cpu(unit, unit_map, NULL))
+ start += pcpul_size;
+
+ /* unit can span more than one page, punch the holes */
+ for (off = start; off < end; off = next) {
+ void *ptr = pcpul_map[off / lpage_size].ptr;
+ next = min(roundup(off + 1, lpage_size), end);
+ if (ptr)
+ free_fn(ptr + off % lpage_size, next - off);
+ }
+ }
+
+ /* allocate address, map and copy */
+ vm.flags = VM_ALLOC;
+ vm.size = chunk_size;
+ vm_area_register_early(&vm, unit_size);
+
+ for (i = 0; i < pcpul_nr_lpages; i++) {
+ if (!pcpul_map[i].ptr)
+ continue;
+ pcpul_map[i].map_addr = vm.addr + i * lpage_size;
+ map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
+ }
+
+ for_each_possible_cpu(cpu)
+ memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
+ static_size);
+
+ /* we're ready, commit */
+ pr_info("PERCPU: Remapped at %p with large pages, static data "
+ "%zu bytes\n", vm.addr, static_size);
+
+ ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+ unit_size, vm.addr, unit_map);
+
+ /*
+ * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped
+ * lpages are pushed to the end and trimmed.
+ */
+ for (i = 0; i < pcpul_nr_lpages - 1; i++)
+ for (j = i + 1; j < pcpul_nr_lpages; j++) {
+ struct pcpul_ent tmp;
+
+ if (!pcpul_map[j].ptr)
+ continue;
+ if (pcpul_map[i].ptr &&
+ pcpul_map[i].ptr < pcpul_map[j].ptr)
+ continue;
+
+ tmp = pcpul_map[i];
+ pcpul_map[i] = pcpul_map[j];
+ pcpul_map[j] = tmp;
+ }
+
+ while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
+ pcpul_nr_lpages--;
+
+ return ret;
+
+enomem:
+ for (i = 0; i < pcpul_nr_lpages; i++)
+ if (pcpul_map[i].ptr)
+ free_fn(pcpul_map[i].ptr, lpage_size);
+ free_bootmem(__pa(pcpul_map), map_size);
+ return -ENOMEM;
+}
+
+/**
+ * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
+ * @kaddr: the kernel address in question
+ *
+ * Determine whether @kaddr falls in the pcpul recycled area. This is
+ * used by pageattr to detect VM aliases and break up the pcpu large
+ * page mapping such that the same physical page is not mapped under
+ * different attributes.
+ *
+ * The recycled area is always at the tail of a partially used large
+ * page.
+ *
+ * RETURNS:
+ * Address of corresponding remapped pcpu address if match is found;
+ * otherwise, NULL.
+ */
+void *pcpu_lpage_remapped(void *kaddr)
+{
+ unsigned long lpage_mask = pcpul_lpage_size - 1;
+ void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
+ unsigned long offset = (unsigned long)kaddr & lpage_mask;
+ int left = 0, right = pcpul_nr_lpages - 1;
+ int pos;
+
+ /* pcpul in use at all? */
+ if (!pcpul_map)
+ return NULL;
+
+ /* okay, perform binary search */
+ while (left <= right) {
+ pos = (left + right) / 2;
+
+ if (pcpul_map[pos].ptr < lpage_addr)
+ left = pos + 1;
+ else if (pcpul_map[pos].ptr > lpage_addr)
+ right = pos - 1;
+ else
+ return pcpul_map[pos].map_addr + offset;
+ }
+
+ return NULL;
+}
+#endif
+
+/*
+ * Generic percpu area setup.
+ *
+ * The embedding helper is used because its behavior closely resembles
+ * the original non-dynamic generic percpu area setup. This is
+ * important because many archs have addressing restrictions and might
+ * fail if the percpu area is located far away from the previous
+ * location. As an added bonus, in non-NUMA cases, embedding is
+ * generally a good idea TLB-wise because percpu area can piggy back
+ * on the physical linear memory mapping which uses large page
+ * mappings on applicable archs.
+ */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
+void __init setup_per_cpu_areas(void)
+{
+ size_t static_size = __per_cpu_end - __per_cpu_start;
+ ssize_t unit_size;
+ unsigned long delta;
+ unsigned int cpu;
+
+ /*
+ * Always reserve area for module percpu variables. That's
+ * what the legacy allocator did.
+ */
+ unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
+ PERCPU_DYNAMIC_RESERVE);
+ if (unit_size < 0)
+ panic("Failed to initialized percpu areas.");
+
+ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+ for_each_possible_cpu(cpu)
+ __per_cpu_offset[cpu] = delta + cpu * unit_size;
}
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/quicklist.c b/mm/quicklist.c
index e66d07d1b4ff..6633965bb27b 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -19,7 +19,7 @@
#include <linux/module.h>
#include <linux/quicklist.h>
-DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
+DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
#define FRACTION_OF_NODE_MEM 16
@@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages)
int node = numa_node_id();
struct zone *zones = NODE_DATA(node)->node_zones;
int num_cpus_on_node;
- const struct cpumask *cpumask_on_node = cpumask_of_node(node);
node_free_pages =
#ifdef CONFIG_ZONE_DMA
@@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages)
max = node_free_pages / FRACTION_OF_NODE_MEM;
- num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
+ num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
max /= num_cpus_on_node;
return max(max, min_pages);
diff --git a/mm/slqb.c b/mm/slqb.c
new file mode 100644
index 000000000000..b986604cab7a
--- /dev/null
+++ b/mm/slqb.c
@@ -0,0 +1,3765 @@
+/*
+ * SLQB: A slab allocator that focuses on per-CPU scaling, and good performance
+ * with order-0 allocations. Fastpaths emphasis is placed on local allocaiton
+ * and freeing, but with a secondary goal of good remote freeing (freeing on
+ * another CPU from that which allocated).
+ *
+ * Using ideas and code from mm/slab.c, mm/slob.c, and mm/slub.c.
+ */
+
+#include <linux/mm.h>
+#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
+#include <linux/ctype.h>
+#include <linux/kallsyms.h>
+#include <linux/memory.h>
+
+/*
+ * TODO
+ * - fix up releasing of offlined data structures. Not a big deal because
+ * they don't get cumulatively leaked with successive online/offline cycles
+ * - allow OOM conditions to flush back per-CPU pages to common lists to be
+ * reused by other CPUs.
+ * - investiage performance with memoryless nodes. Perhaps CPUs can be given
+ * a default closest home node via which it can use fastpath functions.
+ * Perhaps it is not a big problem.
+ */
+
+/*
+ * slqb_page overloads struct page, and is used to manage some slob allocation
+ * aspects, however to avoid the horrible mess in include/linux/mm_types.h,
+ * we'll just define our own struct slqb_page type variant here.
+ */
+struct slqb_page {
+ union {
+ struct {
+ unsigned long flags; /* mandatory */
+ atomic_t _count; /* mandatory */
+ unsigned int inuse; /* Nr of objects */
+ struct kmem_cache_list *list; /* Pointer to list */
+ void **freelist; /* LIFO freelist */
+ union {
+ struct list_head lru; /* misc. list */
+ struct rcu_head rcu_head; /* for rcu freeing */
+ };
+ };
+ struct page page;
+ };
+};
+static inline void struct_slqb_page_wrong_size(void)
+{ BUILD_BUG_ON(sizeof(struct slqb_page) != sizeof(struct page)); }
+
+#define PG_SLQB_BIT (1 << PG_slab)
+
+/*
+ * slqb_min_order: minimum allocation order for slabs
+ */
+static int slqb_min_order;
+
+/*
+ * slqb_min_objects: minimum number of objects per slab. Increasing this
+ * will increase the allocation order for slabs with larger objects
+ */
+static int slqb_min_objects = 1;
+
+#ifdef CONFIG_NUMA
+static inline int slab_numa(struct kmem_cache *s)
+{
+ return s->flags & SLAB_NUMA;
+}
+#else
+static inline int slab_numa(struct kmem_cache *s)
+{
+ return 0;
+}
+#endif
+
+static inline int slab_hiwater(struct kmem_cache *s)
+{
+ return s->hiwater;
+}
+
+static inline int slab_freebatch(struct kmem_cache *s)
+{
+ return s->freebatch;
+}
+
+/*
+ * Lock order:
+ * kmem_cache_node->list_lock
+ * kmem_cache_remote_free->lock
+ *
+ * Data structures:
+ * SLQB is primarily per-cpu. For each kmem_cache, each CPU has:
+ *
+ * - A LIFO list of node-local objects. Allocation and freeing of node local
+ * objects goes first to this list.
+ *
+ * - 2 Lists of slab pages, free and partial pages. If an allocation misses
+ * the object list, it tries from the partial list, then the free list.
+ * After freeing an object to the object list, if it is over a watermark,
+ * some objects are freed back to pages. If an allocation misses these lists,
+ * a new slab page is allocated from the page allocator. If the free list
+ * reaches a watermark, some of its pages are returned to the page allocator.
+ *
+ * - A remote free queue, where objects freed that did not come from the local
+ * node are queued to. When this reaches a watermark, the objects are
+ * flushed.
+ *
+ * - A remotely freed queue, where objects allocated from this CPU are flushed
+ * to from other CPUs' remote free queues. kmem_cache_remote_free->lock is
+ * used to protect access to this queue.
+ *
+ * When the remotely freed queue reaches a watermark, a flag is set to tell
+ * the owner CPU to check it. The owner CPU will then check the queue on the
+ * next allocation that misses the object list. It will move all objects from
+ * this list onto the object list and then allocate one.
+ *
+ * This system of remote queueing is intended to reduce lock and remote
+ * cacheline acquisitions, and give a cooling off period for remotely freed
+ * objects before they are re-allocated.
+ *
+ * node specific allocations from somewhere other than the local node are
+ * handled by a per-node list which is the same as the above per-CPU data
+ * structures except for the following differences:
+ *
+ * - kmem_cache_node->list_lock is used to protect access for multiple CPUs to
+ * allocate from a given node.
+ *
+ * - There is no remote free queue. Nodes don't free objects, CPUs do.
+ */
+
+static inline void slqb_stat_inc(struct kmem_cache_list *list,
+ enum stat_item si)
+{
+#ifdef CONFIG_SLQB_STATS
+ list->stats[si]++;
+#endif
+}
+
+static inline void slqb_stat_add(struct kmem_cache_list *list,
+ enum stat_item si, unsigned long nr)
+{
+#ifdef CONFIG_SLQB_STATS
+ list->stats[si] += nr;
+#endif
+}
+
+static inline int slqb_page_to_nid(struct slqb_page *page)
+{
+ return page_to_nid(&page->page);
+}
+
+static inline void *slqb_page_address(struct slqb_page *page)
+{
+ return page_address(&page->page);
+}
+
+static inline struct zone *slqb_page_zone(struct slqb_page *page)
+{
+ return page_zone(&page->page);
+}
+
+static inline int virt_to_nid(const void *addr)
+{
+ return page_to_nid(virt_to_page(addr));
+}
+
+static inline struct slqb_page *virt_to_head_slqb_page(const void *addr)
+{
+ struct page *p;
+
+ p = virt_to_head_page(addr);
+ return (struct slqb_page *)p;
+}
+
+static inline void __free_slqb_pages(struct slqb_page *page, unsigned int order,
+ int pages)
+{
+ struct page *p = &page->page;
+
+ reset_page_mapcount(p);
+ p->mapping = NULL;
+ VM_BUG_ON(!(p->flags & PG_SLQB_BIT));
+ p->flags &= ~PG_SLQB_BIT;
+
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += pages;
+ __free_pages(p, order);
+}
+
+#ifdef CONFIG_SLQB_DEBUG
+static inline int slab_debug(struct kmem_cache *s)
+{
+ return s->flags &
+ (SLAB_DEBUG_FREE |
+ SLAB_RED_ZONE |
+ SLAB_POISON |
+ SLAB_STORE_USER |
+ SLAB_TRACE);
+}
+static inline int slab_poison(struct kmem_cache *s)
+{
+ return s->flags & SLAB_POISON;
+}
+#else
+static inline int slab_debug(struct kmem_cache *s)
+{
+ return 0;
+}
+static inline int slab_poison(struct kmem_cache *s)
+{
+ return 0;
+}
+#endif
+
+#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
+ SLAB_POISON | SLAB_STORE_USER)
+
+/* Internal SLQB flags */
+#define __OBJECT_POISON 0x80000000 /* Poison object */
+
+/* Not all arches define cache_line_size */
+#ifndef cache_line_size
+#define cache_line_size() L1_CACHE_BYTES
+#endif
+
+#ifdef CONFIG_SMP
+static struct notifier_block slab_notifier;
+#endif
+
+/*
+ * slqb_lock protects slab_caches list and serialises hotplug operations.
+ * hotplug operations take lock for write, other operations can hold off
+ * hotplug by taking it for read (or write).
+ */
+static DECLARE_RWSEM(slqb_lock);
+
+/*
+ * A list of all slab caches on the system
+ */
+static LIST_HEAD(slab_caches);
+
+/*
+ * Tracking user of a slab.
+ */
+struct track {
+ unsigned long addr; /* Called from address */
+ int cpu; /* Was running on cpu */
+ int pid; /* Pid context */
+ unsigned long when; /* When did the operation occur */
+};
+
+enum track_item { TRACK_ALLOC, TRACK_FREE };
+
+static struct kmem_cache kmem_cache_cache;
+
+#ifdef CONFIG_SLQB_SYSFS
+static int sysfs_slab_add(struct kmem_cache *s);
+static void sysfs_slab_remove(struct kmem_cache *s);
+#else
+static inline int sysfs_slab_add(struct kmem_cache *s)
+{
+ return 0;
+}
+static inline void sysfs_slab_remove(struct kmem_cache *s)
+{
+ kmem_cache_free(&kmem_cache_cache, s);
+}
+#endif
+
+/********************************************************************
+ * Core slab cache functions
+ *******************************************************************/
+
+static int __slab_is_available __read_mostly;
+int slab_is_available(void)
+{
+ return __slab_is_available;
+}
+
+static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
+{
+#ifdef CONFIG_SMP
+ VM_BUG_ON(!s->cpu_slab[cpu]);
+ return s->cpu_slab[cpu];
+#else
+ return &s->cpu_slab;
+#endif
+}
+
+static inline int check_valid_pointer(struct kmem_cache *s,
+ struct slqb_page *page, const void *object)
+{
+ void *base;
+
+ base = slqb_page_address(page);
+ if (object < base || object >= base + s->objects * s->size ||
+ (object - base) % s->size) {
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline void *get_freepointer(struct kmem_cache *s, void *object)
+{
+ return *(void **)(object + s->offset);
+}
+
+static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
+{
+ *(void **)(object + s->offset) = fp;
+}
+
+/* Loop over all objects in a slab */
+#define for_each_object(__p, __s, __addr) \
+ for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
+ __p += (__s)->size)
+
+/* Scan freelist */
+#define for_each_free_object(__p, __s, __free) \
+ for (__p = (__free); (__p) != NULL; __p = get_freepointer((__s),\
+ __p))
+
+#ifdef CONFIG_SLQB_DEBUG
+/*
+ * Debug settings:
+ */
+#ifdef CONFIG_SLQB_DEBUG_ON
+static int slqb_debug __read_mostly = DEBUG_DEFAULT_FLAGS;
+#else
+static int slqb_debug __read_mostly;
+#endif
+
+static char *slqb_debug_slabs;
+
+/*
+ * Object debugging
+ */
+static void print_section(char *text, u8 *addr, unsigned int length)
+{
+ int i, offset;
+ int newline = 1;
+ char ascii[17];
+
+ ascii[16] = 0;
+
+ for (i = 0; i < length; i++) {
+ if (newline) {
+ printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
+ newline = 0;
+ }
+ printk(KERN_CONT " %02x", addr[i]);
+ offset = i % 16;
+ ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
+ if (offset == 15) {
+ printk(KERN_CONT " %s\n", ascii);
+ newline = 1;
+ }
+ }
+ if (!newline) {
+ i %= 16;
+ while (i < 16) {
+ printk(KERN_CONT " ");
+ ascii[i] = ' ';
+ i++;
+ }
+ printk(KERN_CONT " %s\n", ascii);
+ }
+}
+
+static struct track *get_track(struct kmem_cache *s, void *object,
+ enum track_item alloc)
+{
+ struct track *p;
+
+ if (s->offset)
+ p = object + s->offset + sizeof(void *);
+ else
+ p = object + s->inuse;
+
+ return p + alloc;
+}
+
+static void set_track(struct kmem_cache *s, void *object,
+ enum track_item alloc, unsigned long addr)
+{
+ struct track *p;
+
+ if (s->offset)
+ p = object + s->offset + sizeof(void *);
+ else
+ p = object + s->inuse;
+
+ p += alloc;
+ if (addr) {
+ p->addr = addr;
+ p->cpu = raw_smp_processor_id();
+ p->pid = current ? current->pid : -1;
+ p->when = jiffies;
+ } else
+ memset(p, 0, sizeof(struct track));
+}
+
+static void init_tracking(struct kmem_cache *s, void *object)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ set_track(s, object, TRACK_FREE, 0UL);
+ set_track(s, object, TRACK_ALLOC, 0UL);
+}
+
+static void print_track(const char *s, struct track *t)
+{
+ if (!t->addr)
+ return;
+
+ printk(KERN_ERR "INFO: %s in ", s);
+ __print_symbol("%s", (unsigned long)t->addr);
+ printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
+}
+
+static void print_tracking(struct kmem_cache *s, void *object)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ print_track("Allocated", get_track(s, object, TRACK_ALLOC));
+ print_track("Freed", get_track(s, object, TRACK_FREE));
+}
+
+static void print_page_info(struct slqb_page *page)
+{
+ printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
+ page, page->inuse, page->freelist, page->flags);
+
+}
+
+#define MAX_ERR_STR 100
+static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+{
+ va_list args;
+ char buf[MAX_ERR_STR];
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ printk(KERN_ERR "========================================"
+ "=====================================\n");
+ printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+ printk(KERN_ERR "----------------------------------------"
+ "-------------------------------------\n\n");
+}
+
+static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+{
+ va_list args;
+ char buf[100];
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
+}
+
+static void print_trailer(struct kmem_cache *s, struct slqb_page *page, u8 *p)
+{
+ unsigned int off; /* Offset of last byte */
+ u8 *addr = slqb_page_address(page);
+
+ print_tracking(s, p);
+
+ print_page_info(page);
+
+ printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+ p, p - addr, get_freepointer(s, p));
+
+ if (p > addr + 16)
+ print_section("Bytes b4", p - 16, 16);
+
+ print_section("Object", p, min(s->objsize, 128));
+
+ if (s->flags & SLAB_RED_ZONE)
+ print_section("Redzone", p + s->objsize, s->inuse - s->objsize);
+
+ if (s->offset)
+ off = s->offset + sizeof(void *);
+ else
+ off = s->inuse;
+
+ if (s->flags & SLAB_STORE_USER)
+ off += 2 * sizeof(struct track);
+
+ if (off != s->size) {
+ /* Beginning of the filler is the free pointer */
+ print_section("Padding", p + off, s->size - off);
+ }
+
+ dump_stack();
+}
+
+static void object_err(struct kmem_cache *s, struct slqb_page *page,
+ u8 *object, char *reason)
+{
+ slab_bug(s, reason);
+ print_trailer(s, page, object);
+}
+
+static void slab_err(struct kmem_cache *s, struct slqb_page *page,
+ char *fmt, ...)
+{
+ slab_bug(s, fmt);
+ print_page_info(page);
+ dump_stack();
+}
+
+static void init_object(struct kmem_cache *s, void *object, int active)
+{
+ u8 *p = object;
+
+ if (s->flags & __OBJECT_POISON) {
+ memset(p, POISON_FREE, s->objsize - 1);
+ p[s->objsize - 1] = POISON_END;
+ }
+
+ if (s->flags & SLAB_RED_ZONE) {
+ memset(p + s->objsize,
+ active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
+ s->inuse - s->objsize);
+ }
+}
+
+static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
+{
+ while (bytes) {
+ if (*start != (u8)value)
+ return start;
+ start++;
+ bytes--;
+ }
+ return NULL;
+}
+
+static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+ void *from, void *to)
+{
+ slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
+ memset(from, data, to - from);
+}
+
+static int check_bytes_and_report(struct kmem_cache *s, struct slqb_page *page,
+ u8 *object, char *what,
+ u8 *start, unsigned int value, unsigned int bytes)
+{
+ u8 *fault;
+ u8 *end;
+
+ fault = check_bytes(start, value, bytes);
+ if (!fault)
+ return 1;
+
+ end = start + bytes;
+ while (end > fault && end[-1] == value)
+ end--;
+
+ slab_bug(s, "%s overwritten", what);
+ printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
+ fault, end - 1, fault[0], value);
+ print_trailer(s, page, object);
+
+ restore_bytes(s, what, value, fault, end);
+ return 0;
+}
+
+/*
+ * Object layout:
+ *
+ * object address
+ * Bytes of the object to be managed.
+ * If the freepointer may overlay the object then the free
+ * pointer is the first word of the object.
+ *
+ * Poisoning uses 0x6b (POISON_FREE) and the last byte is
+ * 0xa5 (POISON_END)
+ *
+ * object + s->objsize
+ * Padding to reach word boundary. This is also used for Redzoning.
+ * Padding is extended by another word if Redzoning is enabled and
+ * objsize == inuse.
+ *
+ * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
+ * 0xcc (RED_ACTIVE) for objects in use.
+ *
+ * object + s->inuse
+ * Meta data starts here.
+ *
+ * A. Free pointer (if we cannot overwrite object on free)
+ * B. Tracking data for SLAB_STORE_USER
+ * C. Padding to reach required alignment boundary or at mininum
+ * one word if debuggin is on to be able to detect writes
+ * before the word boundary.
+ *
+ * Padding is done using 0x5a (POISON_INUSE)
+ *
+ * object + s->size
+ * Nothing is used beyond s->size.
+ */
+
+static int check_pad_bytes(struct kmem_cache *s, struct slqb_page *page, u8 *p)
+{
+ unsigned long off = s->inuse; /* The end of info */
+
+ if (s->offset) {
+ /* Freepointer is placed after the object. */
+ off += sizeof(void *);
+ }
+
+ if (s->flags & SLAB_STORE_USER) {
+ /* We also have user information there */
+ off += 2 * sizeof(struct track);
+ }
+
+ if (s->size == off)
+ return 1;
+
+ return check_bytes_and_report(s, page, p, "Object padding",
+ p + off, POISON_INUSE, s->size - off);
+}
+
+static int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
+{
+ u8 *start;
+ u8 *fault;
+ u8 *end;
+ int length;
+ int remainder;
+
+ if (!(s->flags & SLAB_POISON))
+ return 1;
+
+ start = slqb_page_address(page);
+ end = start + (PAGE_SIZE << s->order);
+ length = s->objects * s->size;
+ remainder = end - (start + length);
+ if (!remainder)
+ return 1;
+
+ fault = check_bytes(start + length, POISON_INUSE, remainder);
+ if (!fault)
+ return 1;
+
+ while (end > fault && end[-1] == POISON_INUSE)
+ end--;
+
+ slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
+ print_section("Padding", start, length);
+
+ restore_bytes(s, "slab padding", POISON_INUSE, start, end);
+ return 0;
+}
+
+static int check_object(struct kmem_cache *s, struct slqb_page *page,
+ void *object, int active)
+{
+ u8 *p = object;
+ u8 *endobject = object + s->objsize;
+
+ if (s->flags & SLAB_RED_ZONE) {
+ unsigned int red =
+ active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
+
+ if (!check_bytes_and_report(s, page, object, "Redzone",
+ endobject, red, s->inuse - s->objsize))
+ return 0;
+ } else {
+ if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
+ check_bytes_and_report(s, page, p, "Alignment padding",
+ endobject, POISON_INUSE, s->inuse - s->objsize);
+ }
+ }
+
+ if (s->flags & SLAB_POISON) {
+ if (!active && (s->flags & __OBJECT_POISON)) {
+ if (!check_bytes_and_report(s, page, p, "Poison", p,
+ POISON_FREE, s->objsize - 1))
+ return 0;
+
+ if (!check_bytes_and_report(s, page, p, "Poison",
+ p + s->objsize - 1, POISON_END, 1))
+ return 0;
+ }
+
+ /*
+ * check_pad_bytes cleans up on its own.
+ */
+ check_pad_bytes(s, page, p);
+ }
+
+ return 1;
+}
+
+static int check_slab(struct kmem_cache *s, struct slqb_page *page)
+{
+ if (!(page->flags & PG_SLQB_BIT)) {
+ slab_err(s, page, "Not a valid slab page");
+ return 0;
+ }
+ if (page->inuse == 0) {
+ slab_err(s, page, "inuse before free / after alloc", s->name);
+ return 0;
+ }
+ if (page->inuse > s->objects) {
+ slab_err(s, page, "inuse %u > max %u",
+ s->name, page->inuse, s->objects);
+ return 0;
+ }
+ /* Slab_pad_check fixes things up after itself */
+ slab_pad_check(s, page);
+ return 1;
+}
+
+static void trace(struct kmem_cache *s, struct slqb_page *page,
+ void *object, int alloc)
+{
+ if (s->flags & SLAB_TRACE) {
+ printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
+ s->name,
+ alloc ? "alloc" : "free",
+ object, page->inuse,
+ page->freelist);
+
+ if (!alloc)
+ print_section("Object", (void *)object, s->objsize);
+
+ dump_stack();
+ }
+}
+
+static void setup_object_debug(struct kmem_cache *s, struct slqb_page *page,
+ void *object)
+{
+ if (!slab_debug(s))
+ return;
+
+ if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
+ return;
+
+ init_object(s, object, 0);
+ init_tracking(s, object);
+}
+
+static int alloc_debug_processing(struct kmem_cache *s,
+ void *object, unsigned long addr)
+{
+ struct slqb_page *page;
+ page = virt_to_head_slqb_page(object);
+
+ if (!check_slab(s, page))
+ goto bad;
+
+ if (!check_valid_pointer(s, page, object)) {
+ object_err(s, page, object, "Freelist Pointer check fails");
+ goto bad;
+ }
+
+ if (object && !check_object(s, page, object, 0))
+ goto bad;
+
+ /* Success perform special debug activities for allocs */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, object, TRACK_ALLOC, addr);
+ trace(s, page, object, 1);
+ init_object(s, object, 1);
+ return 1;
+
+bad:
+ return 0;
+}
+
+static int free_debug_processing(struct kmem_cache *s,
+ void *object, unsigned long addr)
+{
+ struct slqb_page *page;
+ page = virt_to_head_slqb_page(object);
+
+ if (!check_slab(s, page))
+ goto fail;
+
+ if (!check_valid_pointer(s, page, object)) {
+ slab_err(s, page, "Invalid object pointer 0x%p", object);
+ goto fail;
+ }
+
+ if (!check_object(s, page, object, 1))
+ return 0;
+
+ /* Special debug activities for freeing objects */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, object, TRACK_FREE, addr);
+ trace(s, page, object, 0);
+ init_object(s, object, 0);
+ return 1;
+
+fail:
+ slab_fix(s, "Object at 0x%p not freed", object);
+ return 0;
+}
+
+static int __init setup_slqb_debug(char *str)
+{
+ slqb_debug = DEBUG_DEFAULT_FLAGS;
+ if (*str++ != '=' || !*str) {
+ /*
+ * No options specified. Switch on full debugging.
+ */
+ goto out;
+ }
+
+ if (*str == ',') {
+ /*
+ * No options but restriction on slabs. This means full
+ * debugging for slabs matching a pattern.
+ */
+ goto check_slabs;
+ }
+
+ slqb_debug = 0;
+ if (*str == '-') {
+ /*
+ * Switch off all debugging measures.
+ */
+ goto out;
+ }
+
+ /*
+ * Determine which debug features should be switched on
+ */
+ for (; *str && *str != ','; str++) {
+ switch (tolower(*str)) {
+ case 'f':
+ slqb_debug |= SLAB_DEBUG_FREE;
+ break;
+ case 'z':
+ slqb_debug |= SLAB_RED_ZONE;
+ break;
+ case 'p':
+ slqb_debug |= SLAB_POISON;
+ break;
+ case 'u':
+ slqb_debug |= SLAB_STORE_USER;
+ break;
+ case 't':
+ slqb_debug |= SLAB_TRACE;
+ break;
+ default:
+ printk(KERN_ERR "slqb_debug option '%c' "
+ "unknown. skipped\n", *str);
+ }
+ }
+
+check_slabs:
+ if (*str == ',')
+ slqb_debug_slabs = str + 1;
+out:
+ return 1;
+}
+__setup("slqb_debug", setup_slqb_debug);
+
+static int __init setup_slqb_min_order(char *str)
+{
+ get_option(&str, &slqb_min_order);
+ slqb_min_order = min(slqb_min_order, MAX_ORDER - 1);
+
+ return 1;
+}
+__setup("slqb_min_order=", setup_slqb_min_order);
+
+static int __init setup_slqb_min_objects(char *str)
+{
+ get_option(&str, &slqb_min_objects);
+
+ return 1;
+}
+
+__setup("slqb_min_objects=", setup_slqb_min_objects);
+
+static unsigned long kmem_cache_flags(unsigned long objsize,
+ unsigned long flags, const char *name,
+ void (*ctor)(void *))
+{
+ /*
+ * Enable debugging if selected on the kernel commandline.
+ */
+ if (slqb_debug && (!slqb_debug_slabs ||
+ strncmp(slqb_debug_slabs, name,
+ strlen(slqb_debug_slabs)) == 0))
+ flags |= slqb_debug;
+
+ if (num_possible_nodes() > 1)
+ flags |= SLAB_NUMA;
+
+ return flags;
+}
+#else
+static inline void setup_object_debug(struct kmem_cache *s,
+ struct slqb_page *page, void *object)
+{
+}
+
+static inline int alloc_debug_processing(struct kmem_cache *s,
+ void *object, unsigned long addr)
+{
+ return 0;
+}
+
+static inline int free_debug_processing(struct kmem_cache *s,
+ void *object, unsigned long addr)
+{
+ return 0;
+}
+
+static inline int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
+{
+ return 1;
+}
+
+static inline int check_object(struct kmem_cache *s, struct slqb_page *page,
+ void *object, int active)
+{
+ return 1;
+}
+
+static inline void add_full(struct kmem_cache_node *n, struct slqb_page *page)
+{
+}
+
+static inline unsigned long kmem_cache_flags(unsigned long objsize,
+ unsigned long flags, const char *name, void (*ctor)(void *))
+{
+ if (num_possible_nodes() > 1)
+ flags |= SLAB_NUMA;
+ return flags;
+}
+
+static const int slqb_debug;
+#endif
+
+/*
+ * allocate a new slab (return its corresponding struct slqb_page)
+ */
+static struct slqb_page *allocate_slab(struct kmem_cache *s,
+ gfp_t flags, int node)
+{
+ struct slqb_page *page;
+ int pages = 1 << s->order;
+
+ flags |= s->allocflags;
+
+ page = (struct slqb_page *)alloc_pages_node(node, flags, s->order);
+ if (!page)
+ return NULL;
+
+ mod_zone_page_state(slqb_page_zone(page),
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+ pages);
+
+ return page;
+}
+
+/*
+ * Called once for each object on a new slab page
+ */
+static void setup_object(struct kmem_cache *s,
+ struct slqb_page *page, void *object)
+{
+ setup_object_debug(s, page, object);
+ if (unlikely(s->ctor))
+ s->ctor(object);
+}
+
+/*
+ * Allocate a new slab, set up its object list.
+ */
+static struct slqb_page *new_slab_page(struct kmem_cache *s,
+ gfp_t flags, int node, unsigned int colour)
+{
+ struct slqb_page *page;
+ void *start;
+ void *last;
+ void *p;
+
+ BUG_ON(flags & GFP_SLAB_BUG_MASK);
+
+ page = allocate_slab(s,
+ flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
+ if (!page)
+ goto out;
+
+ page->flags |= PG_SLQB_BIT;
+
+ start = page_address(&page->page);
+
+ if (unlikely(slab_poison(s)))
+ memset(start, POISON_INUSE, PAGE_SIZE << s->order);
+
+ start += colour;
+
+ last = start;
+ for_each_object(p, s, start) {
+ setup_object(s, page, p);
+ set_freepointer(s, last, p);
+ last = p;
+ }
+ set_freepointer(s, last, NULL);
+
+ page->freelist = start;
+ page->inuse = 0;
+out:
+ return page;
+}
+
+/*
+ * Free a slab page back to the page allocator
+ */
+static void __free_slab(struct kmem_cache *s, struct slqb_page *page)
+{
+ int pages = 1 << s->order;
+
+ if (unlikely(slab_debug(s))) {
+ void *p;
+
+ slab_pad_check(s, page);
+ for_each_free_object(p, s, page->freelist)
+ check_object(s, page, p, 0);
+ }
+
+ mod_zone_page_state(slqb_page_zone(page),
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+ -pages);
+
+ __free_slqb_pages(page, s->order, pages);
+}
+
+static void rcu_free_slab(struct rcu_head *h)
+{
+ struct slqb_page *page;
+
+ page = container_of(h, struct slqb_page, rcu_head);
+ __free_slab(page->list->cache, page);
+}
+
+static void free_slab(struct kmem_cache *s, struct slqb_page *page)
+{
+ VM_BUG_ON(page->inuse);
+ if (unlikely(s->flags & SLAB_DESTROY_BY_RCU))
+ call_rcu(&page->rcu_head, rcu_free_slab);
+ else
+ __free_slab(s, page);
+}
+
+/*
+ * Return an object to its slab.
+ *
+ * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
+ * list_lock in the case of per-node list.
+ */
+static int free_object_to_page(struct kmem_cache *s,
+ struct kmem_cache_list *l, struct slqb_page *page,
+ void *object)
+{
+ VM_BUG_ON(page->list != l);
+
+ set_freepointer(s, object, page->freelist);
+ page->freelist = object;
+ page->inuse--;
+
+ if (!page->inuse) {
+ if (likely(s->objects > 1)) {
+ l->nr_partial--;
+ list_del(&page->lru);
+ }
+ l->nr_slabs--;
+ free_slab(s, page);
+ slqb_stat_inc(l, FLUSH_SLAB_FREE);
+ return 1;
+
+ } else if (page->inuse + 1 == s->objects) {
+ l->nr_partial++;
+ list_add(&page->lru, &l->partial);
+ slqb_stat_inc(l, FLUSH_SLAB_PARTIAL);
+ return 0;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_SMP
+static void slab_free_to_remote(struct kmem_cache *s, struct slqb_page *page,
+ void *object, struct kmem_cache_cpu *c);
+#endif
+
+/*
+ * Flush the LIFO list of objects on a list. They are sent back to their pages
+ * in case the pages also belong to the list, or to our CPU's remote-free list
+ * in the case they do not.
+ *
+ * Doesn't flush the entire list. flush_free_list_all does.
+ *
+ * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
+ * list_lock in the case of per-node list.
+ */
+static void flush_free_list(struct kmem_cache *s, struct kmem_cache_list *l)
+{
+ void **head;
+ int nr;
+ int locked = 0;
+
+ nr = l->freelist.nr;
+ if (unlikely(!nr))
+ return;
+
+ nr = min(slab_freebatch(s), nr);
+
+ slqb_stat_inc(l, FLUSH_FREE_LIST);
+ slqb_stat_add(l, FLUSH_FREE_LIST_OBJECTS, nr);
+
+ l->freelist.nr -= nr;
+ head = l->freelist.head;
+
+ do {
+ struct slqb_page *page;
+ void **object;
+
+ object = head;
+ VM_BUG_ON(!object);
+ head = get_freepointer(s, object);
+ page = virt_to_head_slqb_page(object);
+
+#ifdef CONFIG_SMP
+ if (page->list != l) {
+ struct kmem_cache_cpu *c;
+
+ if (locked) {
+ spin_unlock(&l->page_lock);
+ locked = 0;
+ }
+
+ c = get_cpu_slab(s, smp_processor_id());
+
+ slab_free_to_remote(s, page, object, c);
+ slqb_stat_inc(l, FLUSH_FREE_LIST_REMOTE);
+ } else
+#endif
+ {
+ if (!locked) {
+ spin_lock(&l->page_lock);
+ locked = 1;
+ }
+ free_object_to_page(s, l, page, object);
+ }
+
+ nr--;
+ } while (nr);
+
+ if (locked)
+ spin_unlock(&l->page_lock);
+
+ l->freelist.head = head;
+ if (!l->freelist.nr)
+ l->freelist.tail = NULL;
+}
+
+static void flush_free_list_all(struct kmem_cache *s, struct kmem_cache_list *l)
+{
+ while (l->freelist.nr)
+ flush_free_list(s, l);
+}
+
+#ifdef CONFIG_SMP
+/*
+ * If enough objects have been remotely freed back to this list,
+ * remote_free_check will be set. In which case, we'll eventually come here
+ * to take those objects off our remote_free list and onto our LIFO freelist.
+ *
+ * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
+ * list_lock in the case of per-node list.
+ */
+static void claim_remote_free_list(struct kmem_cache *s,
+ struct kmem_cache_list *l)
+{
+ void **head, **tail;
+ int nr;
+
+ if (!l->remote_free.list.nr)
+ return;
+
+ spin_lock(&l->remote_free.lock);
+
+ l->remote_free_check = 0;
+ head = l->remote_free.list.head;
+ l->remote_free.list.head = NULL;
+ tail = l->remote_free.list.tail;
+ l->remote_free.list.tail = NULL;
+ nr = l->remote_free.list.nr;
+ l->remote_free.list.nr = 0;
+
+ spin_unlock(&l->remote_free.lock);
+
+ VM_BUG_ON(!nr);
+
+ if (!l->freelist.nr) {
+ /* Get head hot for likely subsequent allocation or flush */
+ prefetchw(head);
+ l->freelist.head = head;
+ } else
+ set_freepointer(s, l->freelist.tail, head);
+ l->freelist.tail = tail;
+
+ l->freelist.nr += nr;
+
+ slqb_stat_inc(l, CLAIM_REMOTE_LIST);
+ slqb_stat_add(l, CLAIM_REMOTE_LIST_OBJECTS, nr);
+}
+#else
+static inline void claim_remote_free_list(struct kmem_cache *s,
+ struct kmem_cache_list *l)
+{
+}
+#endif
+
+/*
+ * Allocation fastpath. Get an object from the list's LIFO freelist, or
+ * return NULL if it is empty.
+ *
+ * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
+ * list_lock in the case of per-node list.
+ */
+static __always_inline void *__cache_list_get_object(struct kmem_cache *s,
+ struct kmem_cache_list *l)
+{
+ void *object;
+
+ object = l->freelist.head;
+ if (likely(object)) {
+ void *next = get_freepointer(s, object);
+
+ VM_BUG_ON(!l->freelist.nr);
+ l->freelist.nr--;
+ l->freelist.head = next;
+
+ return object;
+ }
+ VM_BUG_ON(l->freelist.nr);
+
+#ifdef CONFIG_SMP
+ if (unlikely(l->remote_free_check)) {
+ claim_remote_free_list(s, l);
+
+ if (l->freelist.nr > slab_hiwater(s))
+ flush_free_list(s, l);
+
+ /* repetition here helps gcc :( */
+ object = l->freelist.head;
+ if (likely(object)) {
+ void *next = get_freepointer(s, object);
+
+ VM_BUG_ON(!l->freelist.nr);
+ l->freelist.nr--;
+ l->freelist.head = next;
+
+ return object;
+ }
+ VM_BUG_ON(l->freelist.nr);
+ }
+#endif
+
+ return NULL;
+}
+
+/*
+ * Slow(er) path. Get a page from this list's existing pages. Will be a
+ * new empty page in the case that __slab_alloc_page has just been called
+ * (empty pages otherwise never get queued up on the lists), or a partial page
+ * already on the list.
+ *
+ * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
+ * list_lock in the case of per-node list.
+ */
+static noinline void *__cache_list_get_page(struct kmem_cache *s,
+ struct kmem_cache_list *l)
+{
+ struct slqb_page *page;
+ void *object;
+
+ if (unlikely(!l->nr_partial))
+ return NULL;
+
+ page = list_first_entry(&l->partial, struct slqb_page, lru);
+ VM_BUG_ON(page->inuse == s->objects);
+ if (page->inuse + 1 == s->objects) {
+ l->nr_partial--;
+ list_del(&page->lru);
+ }
+
+ VM_BUG_ON(!page->freelist);
+
+ page->inuse++;
+
+ object = page->freelist;
+ page->freelist = get_freepointer(s, object);
+ if (page->freelist)
+ prefetchw(page->freelist);
+ VM_BUG_ON((page->inuse == s->objects) != (page->freelist == NULL));
+ slqb_stat_inc(l, ALLOC_SLAB_FILL);
+
+ return object;
+}
+
+static void *cache_list_get_page(struct kmem_cache *s,
+ struct kmem_cache_list *l)
+{
+ void *object;
+
+ if (unlikely(!l->nr_partial))
+ return NULL;
+
+ spin_lock(&l->page_lock);
+ object = __cache_list_get_page(s, l);
+ spin_unlock(&l->page_lock);
+
+ return object;
+}
+
+/*
+ * Allocation slowpath. Allocate a new slab page from the page allocator, and
+ * put it on the list's partial list. Must be followed by an allocation so
+ * that we don't have dangling empty pages on the partial list.
+ *
+ * Returns 0 on allocation failure.
+ *
+ * Must be called with interrupts disabled.
+ */
+static noinline void *__slab_alloc_page(struct kmem_cache *s,
+ gfp_t gfpflags, int node)
+{
+ struct slqb_page *page;
+ struct kmem_cache_list *l;
+ struct kmem_cache_cpu *c;
+ unsigned int colour;
+ void *object;
+
+ c = get_cpu_slab(s, smp_processor_id());
+ colour = c->colour_next;
+ c->colour_next += s->colour_off;
+ if (c->colour_next >= s->colour_range)
+ c->colour_next = 0;
+
+ /* Caller handles __GFP_ZERO */
+ gfpflags &= ~__GFP_ZERO;
+
+ if (gfpflags & __GFP_WAIT)
+ local_irq_enable();
+ page = new_slab_page(s, gfpflags, node, colour);
+ if (gfpflags & __GFP_WAIT)
+ local_irq_disable();
+ if (unlikely(!page))
+ return page;
+
+ if (!NUMA_BUILD || likely(slqb_page_to_nid(page) == numa_node_id())) {
+ struct kmem_cache_cpu *c;
+ int cpu = smp_processor_id();
+
+ c = get_cpu_slab(s, cpu);
+ l = &c->list;
+ page->list = l;
+
+ spin_lock(&l->page_lock);
+ l->nr_slabs++;
+ l->nr_partial++;
+ list_add(&page->lru, &l->partial);
+ slqb_stat_inc(l, ALLOC);
+ slqb_stat_inc(l, ALLOC_SLAB_NEW);
+ object = __cache_list_get_page(s, l);
+ spin_unlock(&l->page_lock);
+ } else {
+#ifdef CONFIG_NUMA
+ struct kmem_cache_node *n;
+
+ n = s->node_slab[slqb_page_to_nid(page)];
+ l = &n->list;
+ page->list = l;
+
+ spin_lock(&n->list_lock);
+ spin_lock(&l->page_lock);
+ l->nr_slabs++;
+ l->nr_partial++;
+ list_add(&page->lru, &l->partial);
+ slqb_stat_inc(l, ALLOC);
+ slqb_stat_inc(l, ALLOC_SLAB_NEW);
+ object = __cache_list_get_page(s, l);
+ spin_unlock(&l->page_lock);
+ spin_unlock(&n->list_lock);
+#endif
+ }
+ VM_BUG_ON(!object);
+ return object;
+}
+
+#ifdef CONFIG_NUMA
+static noinline int alternate_nid(struct kmem_cache *s,
+ gfp_t gfpflags, int node)
+{
+ if (in_interrupt() || (gfpflags & __GFP_THISNODE))
+ return node;
+ if (cpuset_do_slab_mem_spread() && (s->flags & SLAB_MEM_SPREAD))
+ return cpuset_mem_spread_node();
+ else if (current->mempolicy)
+ return slab_node(current->mempolicy);
+ return node;
+}
+
+/*
+ * Allocate an object from a remote node. Return NULL if none could be found
+ * (in which case, caller should allocate a new slab)
+ *
+ * Must be called with interrupts disabled.
+ */
+static void *__remote_slab_alloc_node(struct kmem_cache *s,
+ gfp_t gfpflags, int node)
+{
+ struct kmem_cache_node *n;
+ struct kmem_cache_list *l;
+ void *object;
+
+ n = s->node_slab[node];
+ if (unlikely(!n)) /* node has no memory */
+ return NULL;
+ l = &n->list;
+
+ spin_lock(&n->list_lock);
+
+ object = __cache_list_get_object(s, l);
+ if (unlikely(!object)) {
+ object = cache_list_get_page(s, l);
+ if (unlikely(!object)) {
+ spin_unlock(&n->list_lock);
+ return __slab_alloc_page(s, gfpflags, node);
+ }
+ }
+ if (likely(object))
+ slqb_stat_inc(l, ALLOC);
+ spin_unlock(&n->list_lock);
+ return object;
+}
+
+static noinline void *__remote_slab_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, int node)
+{
+ void *object;
+ struct zonelist *zonelist;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type high_zoneidx = gfp_zone(gfpflags);
+
+ object = __remote_slab_alloc_node(s, gfpflags, node);
+ if (likely(object || (gfpflags & __GFP_THISNODE)))
+ return object;
+
+ zonelist = node_zonelist(slab_node(current->mempolicy), gfpflags);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ if (!cpuset_zone_allowed_hardwall(zone, gfpflags))
+ continue;
+
+ node = zone_to_nid(zone);
+ object = __remote_slab_alloc_node(s, gfpflags, node);
+ if (likely(object))
+ return object;
+ }
+ return NULL;
+}
+#endif
+
+/*
+ * Main allocation path. Return an object, or NULL on allocation failure.
+ *
+ * Must be called with interrupts disabled.
+ */
+static __always_inline void *__slab_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, int node)
+{
+ void *object;
+ struct kmem_cache_cpu *c;
+ struct kmem_cache_list *l;
+
+#ifdef CONFIG_NUMA
+ if (unlikely(node != -1) && unlikely(node != numa_node_id())) {
+try_remote:
+ return __remote_slab_alloc(s, gfpflags, node);
+ }
+#endif
+
+ c = get_cpu_slab(s, smp_processor_id());
+ VM_BUG_ON(!c);
+ l = &c->list;
+ object = __cache_list_get_object(s, l);
+ if (unlikely(!object)) {
+ object = cache_list_get_page(s, l);
+ if (unlikely(!object)) {
+ object = __slab_alloc_page(s, gfpflags, node);
+#ifdef CONFIG_NUMA
+ if (unlikely(!object)) {
+ node = numa_node_id();
+ goto try_remote;
+ }
+#endif
+ return object;
+ }
+ }
+ if (likely(object))
+ slqb_stat_inc(l, ALLOC);
+ return object;
+}
+
+/*
+ * Perform some interrupts-on processing around the main allocation path
+ * (debug checking and memset()ing).
+ */
+static __always_inline void *slab_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, int node, unsigned long addr)
+{
+ void *object;
+ unsigned long flags;
+
+ gfpflags &= gfp_allowed_mask;
+
+again:
+ local_irq_save(flags);
+ object = __slab_alloc(s, gfpflags, node);
+ local_irq_restore(flags);
+
+ if (unlikely(slab_debug(s)) && likely(object)) {
+ if (unlikely(!alloc_debug_processing(s, object, addr)))
+ goto again;
+ }
+
+ if (unlikely(gfpflags & __GFP_ZERO) && likely(object))
+ memset(object, 0, s->objsize);
+
+ return object;
+}
+
+static __always_inline void *__kmem_cache_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, unsigned long caller)
+{
+ int node = -1;
+
+#ifdef CONFIG_NUMA
+ if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
+ node = alternate_nid(s, gfpflags, node);
+#endif
+ return slab_alloc(s, gfpflags, node, caller);
+}
+
+void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+{
+ return __kmem_cache_alloc(s, gfpflags, _RET_IP_);
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+#ifdef CONFIG_NUMA
+void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
+{
+ return slab_alloc(s, gfpflags, node, _RET_IP_);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+#endif
+
+#ifdef CONFIG_SMP
+/*
+ * Flush this CPU's remote free list of objects back to the list from where
+ * they originate. They end up on that list's remotely freed list, and
+ * eventually we set it's remote_free_check if there are enough objects on it.
+ *
+ * This seems convoluted, but it keeps is from stomping on the target CPU's
+ * fastpath cachelines.
+ *
+ * Must be called with interrupts disabled.
+ */
+static void flush_remote_free_cache(struct kmem_cache *s,
+ struct kmem_cache_cpu *c)
+{
+ struct kmlist *src;
+ struct kmem_cache_list *dst;
+ unsigned int nr;
+ int set;
+
+ src = &c->rlist;
+ nr = src->nr;
+ if (unlikely(!nr))
+ return;
+
+#ifdef CONFIG_SLQB_STATS
+ {
+ struct kmem_cache_list *l = &c->list;
+
+ slqb_stat_inc(l, FLUSH_RFREE_LIST);
+ slqb_stat_add(l, FLUSH_RFREE_LIST_OBJECTS, nr);
+ }
+#endif
+
+ dst = c->remote_cache_list;
+
+ /*
+ * Less common case, dst is filling up so free synchronously.
+ * No point in having remote CPU free thse as it will just
+ * free them back to the page list anyway.
+ */
+ if (unlikely(dst->remote_free.list.nr > (slab_hiwater(s) >> 1))) {
+ void **head;
+
+ head = src->head;
+ spin_lock(&dst->page_lock);
+ do {
+ struct slqb_page *page;
+ void **object;
+
+ object = head;
+ VM_BUG_ON(!object);
+ head = get_freepointer(s, object);
+ page = virt_to_head_slqb_page(object);
+
+ free_object_to_page(s, dst, page, object);
+ nr--;
+ } while (nr);
+ spin_unlock(&dst->page_lock);
+
+ src->head = NULL;
+ src->tail = NULL;
+ src->nr = 0;
+
+ return;
+ }
+
+ spin_lock(&dst->remote_free.lock);
+
+ if (!dst->remote_free.list.head)
+ dst->remote_free.list.head = src->head;
+ else
+ set_freepointer(s, dst->remote_free.list.tail, src->head);
+ dst->remote_free.list.tail = src->tail;
+
+ src->head = NULL;
+ src->tail = NULL;
+ src->nr = 0;
+
+ if (dst->remote_free.list.nr < slab_freebatch(s))
+ set = 1;
+ else
+ set = 0;
+
+ dst->remote_free.list.nr += nr;
+
+ if (unlikely(dst->remote_free.list.nr >= slab_freebatch(s) && set))
+ dst->remote_free_check = 1;
+
+ spin_unlock(&dst->remote_free.lock);
+}
+
+/*
+ * Free an object to this CPU's remote free list.
+ *
+ * Must be called with interrupts disabled.
+ */
+static noinline void slab_free_to_remote(struct kmem_cache *s,
+ struct slqb_page *page, void *object,
+ struct kmem_cache_cpu *c)
+{
+ struct kmlist *r;
+
+ /*
+ * Our remote free list corresponds to a different list. Must
+ * flush it and switch.
+ */
+ if (page->list != c->remote_cache_list) {
+ flush_remote_free_cache(s, c);
+ c->remote_cache_list = page->list;
+ }
+
+ r = &c->rlist;
+ if (!r->head)
+ r->head = object;
+ else
+ set_freepointer(s, r->tail, object);
+ set_freepointer(s, object, NULL);
+ r->tail = object;
+ r->nr++;
+
+ if (unlikely(r->nr >= slab_freebatch(s)))
+ flush_remote_free_cache(s, c);
+}
+#endif
+
+/*
+ * Main freeing path. Return an object, or NULL on allocation failure.
+ *
+ * Must be called with interrupts disabled.
+ */
+static __always_inline void __slab_free(struct kmem_cache *s,
+ struct slqb_page *page, void *object)
+{
+ struct kmem_cache_cpu *c;
+ struct kmem_cache_list *l;
+ int thiscpu = smp_processor_id();
+
+ c = get_cpu_slab(s, thiscpu);
+ l = &c->list;
+
+ slqb_stat_inc(l, FREE);
+
+ if (!NUMA_BUILD || !slab_numa(s) ||
+ likely(slqb_page_to_nid(page) == numa_node_id())) {
+ /*
+ * Freeing fastpath. Collects all local-node objects, not
+ * just those allocated from our per-CPU list. This allows
+ * fast transfer of objects from one CPU to another within
+ * a given node.
+ */
+ set_freepointer(s, object, l->freelist.head);
+ l->freelist.head = object;
+ if (!l->freelist.nr)
+ l->freelist.tail = object;
+ l->freelist.nr++;
+
+ if (unlikely(l->freelist.nr > slab_hiwater(s)))
+ flush_free_list(s, l);
+
+ } else {
+#ifdef CONFIG_SMP
+ /*
+ * Freeing an object that was allocated on a remote node.
+ */
+ slab_free_to_remote(s, page, object, c);
+ slqb_stat_inc(l, FREE_REMOTE);
+#endif
+ }
+}
+
+/*
+ * Perform some interrupts-on processing around the main freeing path
+ * (debug checking).
+ */
+static __always_inline void slab_free(struct kmem_cache *s,
+ struct slqb_page *page, void *object)
+{
+ unsigned long flags;
+
+ prefetchw(object);
+
+ debug_check_no_locks_freed(object, s->objsize);
+ if (likely(object) && unlikely(slab_debug(s))) {
+ if (unlikely(!free_debug_processing(s, object, _RET_IP_)))
+ return;
+ }
+
+ local_irq_save(flags);
+ __slab_free(s, page, object);
+ local_irq_restore(flags);
+}
+
+void kmem_cache_free(struct kmem_cache *s, void *object)
+{
+ struct slqb_page *page = NULL;
+
+ if (slab_numa(s))
+ page = virt_to_head_slqb_page(object);
+ slab_free(s, page, object);
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+/*
+ * Calculate the order of allocation given an slab object size.
+ *
+ * Order 0 allocations are preferred since order 0 does not cause fragmentation
+ * in the page allocator, and they have fastpaths in the page allocator. But
+ * also minimise external fragmentation with large objects.
+ */
+static int slab_order(int size, int max_order, int frac)
+{
+ int order;
+
+ if (fls(size - 1) <= PAGE_SHIFT)
+ order = 0;
+ else
+ order = fls(size - 1) - PAGE_SHIFT;
+ if (order < slqb_min_order)
+ order = slqb_min_order;
+
+ while (order <= max_order) {
+ unsigned long slab_size = PAGE_SIZE << order;
+ unsigned long objects;
+ unsigned long waste;
+
+ objects = slab_size / size;
+ if (!objects)
+ goto next;
+
+ if (order < MAX_ORDER && objects < slqb_min_objects) {
+ /*
+ * if we don't have enough objects for min_objects,
+ * then try the next size up. Unless we have reached
+ * our maximum possible page size.
+ */
+ goto next;
+ }
+
+ waste = slab_size - (objects * size);
+
+ if (waste * frac <= slab_size)
+ break;
+
+next:
+ order++;
+ }
+
+ return order;
+}
+
+static int calculate_order(int size)
+{
+ int order;
+
+ /*
+ * Attempt to find best configuration for a slab. This
+ * works by first attempting to generate a layout with
+ * the best configuration and backing off gradually.
+ */
+ order = slab_order(size, 1, 4);
+ if (order <= 1)
+ return order;
+
+ /*
+ * This size cannot fit in order-1. Allow bigger orders, but
+ * forget about trying to save space.
+ */
+ order = slab_order(size, MAX_ORDER - 1, 0);
+ if (order < MAX_ORDER)
+ return order;
+
+ return -ENOSYS;
+}
+
+/*
+ * Figure out what the alignment of the objects will be.
+ */
+static unsigned long calculate_alignment(unsigned long flags,
+ unsigned long align, unsigned long size)
+{
+ /*
+ * If the user wants hardware cache aligned objects then follow that
+ * suggestion if the object is sufficiently large.
+ *
+ * The hardware cache alignment cannot override the specified
+ * alignment though. If that is greater then use it.
+ */
+ if (flags & SLAB_HWCACHE_ALIGN) {
+ unsigned long ralign = cache_line_size();
+
+ while (size <= ralign / 2)
+ ralign /= 2;
+ align = max(align, ralign);
+ }
+
+ if (align < ARCH_SLAB_MINALIGN)
+ align = ARCH_SLAB_MINALIGN;
+
+ return ALIGN(align, sizeof(void *));
+}
+
+static void init_kmem_cache_list(struct kmem_cache *s,
+ struct kmem_cache_list *l)
+{
+ l->cache = s;
+ l->freelist.nr = 0;
+ l->freelist.head = NULL;
+ l->freelist.tail = NULL;
+ l->nr_partial = 0;
+ l->nr_slabs = 0;
+ INIT_LIST_HEAD(&l->partial);
+ spin_lock_init(&l->page_lock);
+
+#ifdef CONFIG_SMP
+ l->remote_free_check = 0;
+ spin_lock_init(&l->remote_free.lock);
+ l->remote_free.list.nr = 0;
+ l->remote_free.list.head = NULL;
+ l->remote_free.list.tail = NULL;
+#endif
+
+#ifdef CONFIG_SLQB_STATS
+ memset(l->stats, 0, sizeof(l->stats));
+#endif
+}
+
+static void init_kmem_cache_cpu(struct kmem_cache *s,
+ struct kmem_cache_cpu *c)
+{
+ init_kmem_cache_list(s, &c->list);
+
+ c->colour_next = 0;
+#ifdef CONFIG_SMP
+ c->rlist.nr = 0;
+ c->rlist.head = NULL;
+ c->rlist.tail = NULL;
+ c->remote_cache_list = NULL;
+#endif
+}
+
+#ifdef CONFIG_NUMA
+static void init_kmem_cache_node(struct kmem_cache *s,
+ struct kmem_cache_node *n)
+{
+ spin_lock_init(&n->list_lock);
+ init_kmem_cache_list(s, &n->list);
+}
+#endif
+
+/* Initial slabs. */
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cache_cpus);
+#endif
+#ifdef CONFIG_NUMA
+/* XXX: really need a DEFINE_PER_NODE for per-node data, but this is better than
+ * a static array */
+static DEFINE_PER_CPU(struct kmem_cache_node, kmem_cache_nodes);
+#endif
+
+#ifdef CONFIG_SMP
+static struct kmem_cache kmem_cpu_cache;
+static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cpu_cpus);
+#ifdef CONFIG_NUMA
+static DEFINE_PER_CPU(struct kmem_cache_node, kmem_cpu_nodes); /* XXX per-nid */
+#endif
+#endif
+
+#ifdef CONFIG_NUMA
+static struct kmem_cache kmem_node_cache;
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_node_cpus);
+#endif
+static DEFINE_PER_CPU(struct kmem_cache_node, kmem_node_nodes); /*XXX per-nid */
+#endif
+
+#ifdef CONFIG_SMP
+static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
+ int cpu)
+{
+ struct kmem_cache_cpu *c;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ c = kmem_cache_alloc_node(&kmem_cpu_cache, GFP_KERNEL, node);
+ if (!c)
+ return NULL;
+
+ init_kmem_cache_cpu(s, c);
+ return c;
+}
+
+static void free_kmem_cache_cpus(struct kmem_cache *s)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ struct kmem_cache_cpu *c;
+
+ c = s->cpu_slab[cpu];
+ if (c) {
+ kmem_cache_free(&kmem_cpu_cache, c);
+ s->cpu_slab[cpu] = NULL;
+ }
+ }
+}
+
+static int alloc_kmem_cache_cpus(struct kmem_cache *s)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ struct kmem_cache_cpu *c;
+
+ c = s->cpu_slab[cpu];
+ if (c)
+ continue;
+
+ c = alloc_kmem_cache_cpu(s, cpu);
+ if (!c) {
+ free_kmem_cache_cpus(s);
+ return 0;
+ }
+ s->cpu_slab[cpu] = c;
+ }
+ return 1;
+}
+
+#else
+static inline void free_kmem_cache_cpus(struct kmem_cache *s)
+{
+}
+
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
+{
+ init_kmem_cache_cpu(s, &s->cpu_slab);
+ return 1;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+static void free_kmem_cache_nodes(struct kmem_cache *s)
+{
+ int node;
+
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n;
+
+ n = s->node_slab[node];
+ if (n) {
+ kmem_cache_free(&kmem_node_cache, n);
+ s->node_slab[node] = NULL;
+ }
+ }
+}
+
+static int alloc_kmem_cache_nodes(struct kmem_cache *s)
+{
+ int node;
+
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n;
+
+ n = kmem_cache_alloc_node(&kmem_node_cache, GFP_KERNEL, node);
+ if (!n) {
+ free_kmem_cache_nodes(s);
+ return 0;
+ }
+ init_kmem_cache_node(s, n);
+ s->node_slab[node] = n;
+ }
+ return 1;
+}
+#else
+static void free_kmem_cache_nodes(struct kmem_cache *s)
+{
+}
+
+static int alloc_kmem_cache_nodes(struct kmem_cache *s)
+{
+ return 1;
+}
+#endif
+
+/*
+ * calculate_sizes() determines the order and the distribution of data within
+ * a slab object.
+ */
+static int calculate_sizes(struct kmem_cache *s)
+{
+ unsigned long flags = s->flags;
+ unsigned long size = s->objsize;
+ unsigned long align = s->align;
+
+ /*
+ * Determine if we can poison the object itself. If the user of
+ * the slab may touch the object after free or before allocation
+ * then we should never poison the object itself.
+ */
+ if (slab_poison(s) && !(flags & SLAB_DESTROY_BY_RCU) && !s->ctor)
+ s->flags |= __OBJECT_POISON;
+ else
+ s->flags &= ~__OBJECT_POISON;
+
+ /*
+ * Round up object size to the next word boundary. We can only
+ * place the free pointer at word boundaries and this determines
+ * the possible location of the free pointer.
+ */
+ size = ALIGN(size, sizeof(void *));
+
+#ifdef CONFIG_SLQB_DEBUG
+ /*
+ * If we are Redzoning then check if there is some space between the
+ * end of the object and the free pointer. If not then add an
+ * additional word to have some bytes to store Redzone information.
+ */
+ if ((flags & SLAB_RED_ZONE) && size == s->objsize)
+ size += sizeof(void *);
+#endif
+
+ /*
+ * With that we have determined the number of bytes in actual use
+ * by the object. This is the potential offset to the free pointer.
+ */
+ s->inuse = size;
+
+ if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || s->ctor)) {
+ /*
+ * Relocate free pointer after the object if it is not
+ * permitted to overwrite the first word of the object on
+ * kmem_cache_free.
+ *
+ * This is the case if we do RCU, have a constructor or
+ * destructor or are poisoning the objects.
+ */
+ s->offset = size;
+ size += sizeof(void *);
+ }
+
+#ifdef CONFIG_SLQB_DEBUG
+ if (flags & SLAB_STORE_USER) {
+ /*
+ * Need to store information about allocs and frees after
+ * the object.
+ */
+ size += 2 * sizeof(struct track);
+ }
+
+ if (flags & SLAB_RED_ZONE) {
+ /*
+ * Add some empty padding so that we can catch
+ * overwrites from earlier objects rather than let
+ * tracking information or the free pointer be
+ * corrupted if an user writes before the start
+ * of the object.
+ */
+ size += sizeof(void *);
+ }
+#endif
+
+ /*
+ * Determine the alignment based on various parameters that the
+ * user specified and the dynamic determination of cache line size
+ * on bootup.
+ */
+ align = calculate_alignment(flags, align, s->objsize);
+
+ /*
+ * SLQB stores one object immediately after another beginning from
+ * offset 0. In order to align the objects we have to simply size
+ * each object to conform to the alignment.
+ */
+ size = ALIGN(size, align);
+ s->size = size;
+ s->order = calculate_order(size);
+
+ if (s->order < 0)
+ return 0;
+
+ s->allocflags = 0;
+ if (s->order)
+ s->allocflags |= __GFP_COMP;
+
+ if (s->flags & SLAB_CACHE_DMA)
+ s->allocflags |= SLQB_DMA;
+
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
+ s->allocflags |= __GFP_RECLAIMABLE;
+
+ /*
+ * Determine the number of objects per slab
+ */
+ s->objects = (PAGE_SIZE << s->order) / size;
+
+ s->freebatch = max(4UL*PAGE_SIZE / size,
+ min(256UL, 64*PAGE_SIZE / size));
+ if (!s->freebatch)
+ s->freebatch = 1;
+ s->hiwater = s->freebatch << 2;
+
+ return !!s->objects;
+
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Per-cpu allocator can't be used because it always uses slab allocator,
+ * and it can't do per-node allocations.
+ */
+static void *kmem_cache_dyn_array_alloc(int ids)
+{
+ size_t size = sizeof(void *) * ids;
+
+ BUG_ON(!size);
+
+ if (unlikely(!slab_is_available())) {
+ static void *nextmem;
+ static size_t nextleft;
+ void *ret;
+
+ /*
+ * Special case for setting up initial caches. These will
+ * never get freed by definition so we can do it rather
+ * simply.
+ */
+ if (size > nextleft) {
+ nextmem = alloc_pages_exact(size, GFP_KERNEL);
+ if (!nextmem)
+ return NULL;
+ nextleft = roundup(size, PAGE_SIZE);
+ }
+
+ ret = nextmem;
+ nextleft -= size;
+ nextmem += size;
+ memset(ret, 0, size);
+ return ret;
+ } else {
+ return kzalloc(size, GFP_KERNEL);
+ }
+}
+
+static void kmem_cache_dyn_array_free(void *array)
+{
+ if (unlikely(!slab_is_available()))
+ return; /* error case without crashing here (will panic soon) */
+ kfree(array);
+}
+#endif
+
+static int kmem_cache_open(struct kmem_cache *s,
+ const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *), int alloc)
+{
+ unsigned int left_over;
+
+ memset(s, 0, sizeof(struct kmem_cache));
+ s->name = name;
+ s->ctor = ctor;
+ s->objsize = size;
+ s->align = align;
+ s->flags = kmem_cache_flags(size, flags, name, ctor);
+
+ if (!calculate_sizes(s))
+ goto error;
+
+ if (!slab_debug(s)) {
+ left_over = (PAGE_SIZE << s->order) - (s->objects * s->size);
+ s->colour_off = max(cache_line_size(), s->align);
+ s->colour_range = left_over;
+ } else {
+ s->colour_off = 0;
+ s->colour_range = 0;
+ }
+
+ /*
+ * Protect all alloc_kmem_cache_cpus/nodes allocations with slqb_lock
+ * to lock out hotplug, just in case (probably not strictly needed
+ * here).
+ */
+ down_write(&slqb_lock);
+#ifdef CONFIG_SMP
+ s->cpu_slab = kmem_cache_dyn_array_alloc(nr_cpu_ids);
+ if (!s->cpu_slab)
+ goto error_lock;
+# ifdef CONFIG_NUMA
+ s->node_slab = kmem_cache_dyn_array_alloc(nr_node_ids);
+ if (!s->node_slab)
+ goto error_cpu_array;
+# endif
+#endif
+
+ if (likely(alloc)) {
+ if (!alloc_kmem_cache_nodes(s))
+ goto error_node_array;
+
+ if (!alloc_kmem_cache_cpus(s))
+ goto error_nodes;
+ }
+
+ sysfs_slab_add(s);
+ list_add(&s->list, &slab_caches);
+ up_write(&slqb_lock);
+
+ return 1;
+
+error_nodes:
+ free_kmem_cache_nodes(s);
+error_node_array:
+#if defined(CONFIG_NUMA) && defined(CONFIG_SMP)
+ kmem_cache_dyn_array_free(s->node_slab);
+error_cpu_array:
+#endif
+#ifdef CONFIG_SMP
+ kmem_cache_dyn_array_free(s->cpu_slab);
+error_lock:
+#endif
+ up_write(&slqb_lock);
+error:
+ if (flags & SLAB_PANIC)
+ panic("%s: failed to create slab `%s'\n", __func__, name);
+ return 0;
+}
+
+/**
+ * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
+ * @s: the cache we're checking against
+ * @ptr: pointer to validate
+ *
+ * This verifies that the untrusted pointer looks sane;
+ * it is _not_ a guarantee that the pointer is actually
+ * part of the slab cache in question, but it at least
+ * validates that the pointer can be dereferenced and
+ * looks half-way sane.
+ *
+ * Currently only used for dentry validation.
+ */
+int kmem_ptr_validate(struct kmem_cache *s, const void *ptr)
+{
+ unsigned long addr = (unsigned long)ptr;
+ struct slqb_page *page;
+
+ if (unlikely(addr < PAGE_OFFSET))
+ goto out;
+ if (unlikely(addr > (unsigned long)high_memory - s->size))
+ goto out;
+ if (unlikely(!IS_ALIGNED(addr, s->align)))
+ goto out;
+ if (unlikely(!kern_addr_valid(addr)))
+ goto out;
+ if (unlikely(!kern_addr_valid(addr + s->size - 1)))
+ goto out;
+ if (unlikely(!pfn_valid(addr >> PAGE_SHIFT)))
+ goto out;
+ page = virt_to_head_slqb_page(ptr);
+ if (unlikely(!(page->flags & PG_SLQB_BIT)))
+ goto out;
+ if (unlikely(page->list->cache != s)) /* XXX: ouch, racy */
+ goto out;
+ return 1;
+out:
+ return 0;
+}
+EXPORT_SYMBOL(kmem_ptr_validate);
+
+/*
+ * Determine the size of a slab object
+ */
+unsigned int kmem_cache_size(struct kmem_cache *s)
+{
+ return s->objsize;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
+const char *kmem_cache_name(struct kmem_cache *s)
+{
+ return s->name;
+}
+EXPORT_SYMBOL(kmem_cache_name);
+
+/*
+ * Release all resources used by a slab cache. No more concurrency on the
+ * slab, so we can touch remote kmem_cache_cpu structures.
+ */
+void kmem_cache_destroy(struct kmem_cache *s)
+{
+#ifdef CONFIG_NUMA
+ int node;
+#endif
+ int cpu;
+
+ down_write(&slqb_lock);
+ list_del(&s->list);
+
+ local_irq_disable();
+#ifdef CONFIG_SMP
+ for_each_online_cpu(cpu) {
+ struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ struct kmem_cache_list *l = &c->list;
+
+ flush_free_list_all(s, l);
+ flush_remote_free_cache(s, c);
+ }
+#endif
+
+ for_each_online_cpu(cpu) {
+ struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ struct kmem_cache_list *l = &c->list;
+
+ claim_remote_free_list(s, l);
+ flush_free_list_all(s, l);
+
+ WARN_ON(l->freelist.nr);
+ WARN_ON(l->nr_slabs);
+ WARN_ON(l->nr_partial);
+ }
+
+ free_kmem_cache_cpus(s);
+
+#ifdef CONFIG_NUMA
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n;
+ struct kmem_cache_list *l;
+
+ n = s->node_slab[node];
+ if (!n)
+ continue;
+ l = &n->list;
+
+ claim_remote_free_list(s, l);
+ flush_free_list_all(s, l);
+
+ WARN_ON(l->freelist.nr);
+ WARN_ON(l->nr_slabs);
+ WARN_ON(l->nr_partial);
+ }
+
+ free_kmem_cache_nodes(s);
+#endif
+ local_irq_enable();
+
+ sysfs_slab_remove(s);
+ up_write(&slqb_lock);
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+/********************************************************************
+ * Kmalloc subsystem
+ *******************************************************************/
+
+struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
+EXPORT_SYMBOL(kmalloc_caches);
+
+#ifdef CONFIG_ZONE_DMA
+struct kmem_cache kmalloc_caches_dma[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
+EXPORT_SYMBOL(kmalloc_caches_dma);
+#endif
+
+#ifndef ARCH_KMALLOC_FLAGS
+#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
+#endif
+
+static struct kmem_cache *open_kmalloc_cache(struct kmem_cache *s,
+ const char *name, int size, gfp_t gfp_flags)
+{
+ unsigned int flags = ARCH_KMALLOC_FLAGS | SLAB_PANIC;
+
+ if (gfp_flags & SLQB_DMA)
+ flags |= SLAB_CACHE_DMA;
+
+ kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL, 1);
+
+ return s;
+}
+
+/*
+ * Conversion table for small slabs sizes / 8 to the index in the
+ * kmalloc array. This is necessary for slabs < 192 since we have non power
+ * of two cache sizes there. The size of larger slabs can be determined using
+ * fls.
+ */
+static s8 size_index[24] __cacheline_aligned = {
+ 3, /* 8 */
+ 4, /* 16 */
+ 5, /* 24 */
+ 5, /* 32 */
+ 6, /* 40 */
+ 6, /* 48 */
+ 6, /* 56 */
+ 6, /* 64 */
+#if L1_CACHE_BYTES < 64
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+#else
+ 7,
+ 7,
+ 7,
+ 7,
+#endif
+ 7, /* 104 */
+ 7, /* 112 */
+ 7, /* 120 */
+ 7, /* 128 */
+#if L1_CACHE_BYTES < 128
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+#else
+ -1,
+ -1,
+ -1,
+ -1,
+ -1,
+ -1,
+ -1,
+ -1
+#endif
+};
+
+static struct kmem_cache *get_slab(size_t size, gfp_t flags)
+{
+ int index;
+
+ if (unlikely(size <= KMALLOC_MIN_SIZE)) {
+ if (unlikely(!size))
+ return ZERO_SIZE_PTR;
+
+ index = KMALLOC_SHIFT_LOW;
+ goto got_index;
+ }
+
+#if L1_CACHE_BYTES >= 128
+ if (size <= 128) {
+#else
+ if (size <= 192) {
+#endif
+ index = size_index[(size - 1) / 8];
+ } else {
+ if (unlikely(size > 1UL << KMALLOC_SHIFT_SLQB_HIGH))
+ return NULL;
+
+ index = fls(size - 1);
+ }
+
+got_index:
+ if (unlikely((flags & SLQB_DMA)))
+ return &kmalloc_caches_dma[index];
+ else
+ return &kmalloc_caches[index];
+}
+
+void *__kmalloc(size_t size, gfp_t flags)
+{
+ struct kmem_cache *s;
+
+ s = get_slab(size, flags);
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ return __kmem_cache_alloc(s, flags, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ struct kmem_cache *s;
+
+ s = get_slab(size, flags);
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ return kmem_cache_alloc_node(s, flags, node);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif
+
+size_t ksize(const void *object)
+{
+ struct slqb_page *page;
+ struct kmem_cache *s;
+
+ BUG_ON(!object);
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return 0;
+
+ page = virt_to_head_slqb_page(object);
+ BUG_ON(!(page->flags & PG_SLQB_BIT));
+
+ s = page->list->cache;
+
+ /*
+ * Debugging requires use of the padding between object
+ * and whatever may come after it.
+ */
+ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+ return s->objsize;
+
+ /*
+ * If we have the need to store the freelist pointer
+ * back there or track user information then we can
+ * only use the space before that information.
+ */
+ if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+ return s->inuse;
+
+ /*
+ * Else we can use all the padding etc for the allocation
+ */
+ return s->size;
+}
+EXPORT_SYMBOL(ksize);
+
+void kfree(const void *object)
+{
+ struct kmem_cache *s;
+ struct slqb_page *page;
+
+ if (unlikely(ZERO_OR_NULL_PTR(object)))
+ return;
+
+ page = virt_to_head_slqb_page(object);
+ s = page->list->cache;
+
+ slab_free(s, page, (void *)object);
+}
+EXPORT_SYMBOL(kfree);
+
+static void kmem_cache_trim_percpu(void *arg)
+{
+ int cpu = smp_processor_id();
+ struct kmem_cache *s = arg;
+ struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ struct kmem_cache_list *l = &c->list;
+
+ claim_remote_free_list(s, l);
+ flush_free_list(s, l);
+#ifdef CONFIG_SMP
+ flush_remote_free_cache(s, c);
+#endif
+}
+
+int kmem_cache_shrink(struct kmem_cache *s)
+{
+#ifdef CONFIG_NUMA
+ int node;
+#endif
+
+ on_each_cpu(kmem_cache_trim_percpu, s, 1);
+
+#ifdef CONFIG_NUMA
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n;
+ struct kmem_cache_list *l;
+
+ n = s->node_slab[node];
+ if (!n)
+ continue;
+ l = &n->list;
+
+ spin_lock_irq(&n->list_lock);
+ claim_remote_free_list(s, l);
+ flush_free_list(s, l);
+ spin_unlock_irq(&n->list_lock);
+ }
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+static void kmem_cache_reap_percpu(void *arg)
+{
+ int cpu = smp_processor_id();
+ struct kmem_cache *s;
+ long phase = (long)arg;
+
+ list_for_each_entry(s, &slab_caches, list) {
+ struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ struct kmem_cache_list *l = &c->list;
+
+ if (phase == 0) {
+ flush_free_list_all(s, l);
+ flush_remote_free_cache(s, c);
+ }
+
+ if (phase == 1) {
+ claim_remote_free_list(s, l);
+ flush_free_list_all(s, l);
+ }
+ }
+}
+
+static void kmem_cache_reap(void)
+{
+ struct kmem_cache *s;
+ int node;
+
+ down_read(&slqb_lock);
+ on_each_cpu(kmem_cache_reap_percpu, (void *)0, 1);
+ on_each_cpu(kmem_cache_reap_percpu, (void *)1, 1);
+
+ list_for_each_entry(s, &slab_caches, list) {
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n;
+ struct kmem_cache_list *l;
+
+ n = s->node_slab[node];
+ if (!n)
+ continue;
+ l = &n->list;
+
+ spin_lock_irq(&n->list_lock);
+ claim_remote_free_list(s, l);
+ flush_free_list_all(s, l);
+ spin_unlock_irq(&n->list_lock);
+ }
+ }
+ up_read(&slqb_lock);
+}
+#endif
+
+static void cache_trim_worker(struct work_struct *w)
+{
+ struct delayed_work *work =
+ container_of(w, struct delayed_work, work);
+ struct kmem_cache *s;
+
+ if (!down_read_trylock(&slqb_lock))
+ goto out;
+
+ list_for_each_entry(s, &slab_caches, list) {
+#ifdef CONFIG_NUMA
+ int node = numa_node_id();
+ struct kmem_cache_node *n = s->node_slab[node];
+
+ if (n) {
+ struct kmem_cache_list *l = &n->list;
+
+ spin_lock_irq(&n->list_lock);
+ claim_remote_free_list(s, l);
+ flush_free_list(s, l);
+ spin_unlock_irq(&n->list_lock);
+ }
+#endif
+
+ local_irq_disable();
+ kmem_cache_trim_percpu(s);
+ local_irq_enable();
+ }
+
+ up_read(&slqb_lock);
+out:
+ schedule_delayed_work(work, round_jiffies_relative(3*HZ));
+}
+
+static DEFINE_PER_CPU(struct delayed_work, cache_trim_work);
+
+static void __cpuinit start_cpu_timer(int cpu)
+{
+ struct delayed_work *cache_trim_work = &per_cpu(cache_trim_work, cpu);
+
+ /*
+ * When this gets called from do_initcalls via cpucache_init(),
+ * init_workqueues() has already run, so keventd will be setup
+ * at that time.
+ */
+ if (keventd_up() && cache_trim_work->work.func == NULL) {
+ INIT_DELAYED_WORK(cache_trim_work, cache_trim_worker);
+ schedule_delayed_work_on(cpu, cache_trim_work,
+ __round_jiffies_relative(HZ, cpu));
+ }
+}
+
+static int __init cpucache_init(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ start_cpu_timer(cpu);
+
+ return 0;
+}
+device_initcall(cpucache_init);
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+static void slab_mem_going_offline_callback(void *arg)
+{
+ kmem_cache_reap();
+}
+
+static void slab_mem_offline_callback(void *arg)
+{
+ /* XXX: should release structures, see CPU offline comment */
+}
+
+static int slab_mem_going_online_callback(void *arg)
+{
+ struct kmem_cache *s;
+ struct kmem_cache_node *n;
+ struct memory_notify *marg = arg;
+ int nid = marg->status_change_nid;
+ int ret = 0;
+
+ /*
+ * If the node's memory is already available, then kmem_cache_node is
+ * already created. Nothing to do.
+ */
+ if (nid < 0)
+ return 0;
+
+ /*
+ * We are bringing a node online. No memory is availabe yet. We must
+ * allocate a kmem_cache_node structure in order to bring the node
+ * online.
+ */
+ down_write(&slqb_lock);
+ list_for_each_entry(s, &slab_caches, list) {
+ /*
+ * XXX: kmem_cache_alloc_node will fallback to other nodes
+ * since memory is not yet available from the node that
+ * is brought up.
+ */
+ if (s->node_slab[nid]) /* could be lefover from last online */
+ continue;
+ n = kmem_cache_alloc(&kmem_node_cache, GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ init_kmem_cache_node(s, n);
+ s->node_slab[nid] = n;
+ }
+out:
+ up_write(&slqb_lock);
+ return ret;
+}
+
+static int slab_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ ret = slab_mem_going_online_callback(arg);
+ break;
+ case MEM_GOING_OFFLINE:
+ slab_mem_going_offline_callback(arg);
+ break;
+ case MEM_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ slab_mem_offline_callback(arg);
+ break;
+ case MEM_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+
+ ret = notifier_from_errno(ret);
+ return ret;
+}
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+/********************************************************************
+ * Basic setup of slabs
+ *******************************************************************/
+
+void __init kmem_cache_init(void)
+{
+ int i;
+ unsigned int flags = SLAB_HWCACHE_ALIGN|SLAB_PANIC;
+
+ /*
+ * All the ifdefs are rather ugly here, but it's just the setup code,
+ * so it doesn't have to be too readable :)
+ */
+ kmem_cache_open(&kmem_cache_cache, "kmem_cache",
+ sizeof(struct kmem_cache), 0, flags, NULL, 0);
+#ifdef CONFIG_SMP
+ kmem_cache_open(&kmem_cpu_cache, "kmem_cache_cpu",
+ sizeof(struct kmem_cache_cpu), 0, flags, NULL, 0);
+#endif
+#ifdef CONFIG_NUMA
+ kmem_cache_open(&kmem_node_cache, "kmem_cache_node",
+ sizeof(struct kmem_cache_node), 0, flags, NULL, 0);
+#endif
+
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(i) {
+ struct kmem_cache_cpu *c;
+
+ c = &per_cpu(kmem_cache_cpus, i);
+ init_kmem_cache_cpu(&kmem_cache_cache, c);
+ kmem_cache_cache.cpu_slab[i] = c;
+
+ c = &per_cpu(kmem_cpu_cpus, i);
+ init_kmem_cache_cpu(&kmem_cpu_cache, c);
+ kmem_cpu_cache.cpu_slab[i] = c;
+
+#ifdef CONFIG_NUMA
+ c = &per_cpu(kmem_node_cpus, i);
+ init_kmem_cache_cpu(&kmem_node_cache, c);
+ kmem_node_cache.cpu_slab[i] = c;
+#endif
+ }
+#else
+ init_kmem_cache_cpu(&kmem_cache_cache, &kmem_cache_cache.cpu_slab);
+#endif
+
+#ifdef CONFIG_NUMA
+ for_each_node_state(i, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n;
+
+ n = &per_cpu(kmem_cache_nodes, i);
+ init_kmem_cache_node(&kmem_cache_cache, n);
+ kmem_cache_cache.node_slab[i] = n;
+#ifdef CONFIG_SMP
+ n = &per_cpu(kmem_cpu_nodes, i);
+ init_kmem_cache_node(&kmem_cpu_cache, n);
+ kmem_cpu_cache.node_slab[i] = n;
+#endif
+ n = &per_cpu(kmem_node_nodes, i);
+ init_kmem_cache_node(&kmem_node_cache, n);
+ kmem_node_cache.node_slab[i] = n;
+ }
+#endif
+
+ /* Caches that are not of the two-to-the-power-of size */
+ if (L1_CACHE_BYTES < 64 && KMALLOC_MIN_SIZE <= 64) {
+ open_kmalloc_cache(&kmalloc_caches[1],
+ "kmalloc-96", 96, GFP_KERNEL);
+#ifdef CONFIG_ZONE_DMA
+ open_kmalloc_cache(&kmalloc_caches_dma[1],
+ "kmalloc_dma-96", 96, GFP_KERNEL|SLQB_DMA);
+#endif
+ }
+ if (L1_CACHE_BYTES < 128 && KMALLOC_MIN_SIZE <= 128) {
+ open_kmalloc_cache(&kmalloc_caches[2],
+ "kmalloc-192", 192, GFP_KERNEL);
+#ifdef CONFIG_ZONE_DMA
+ open_kmalloc_cache(&kmalloc_caches_dma[2],
+ "kmalloc_dma-192", 192, GFP_KERNEL|SLQB_DMA);
+#endif
+ }
+
+ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
+ open_kmalloc_cache(&kmalloc_caches[i],
+ "kmalloc", 1 << i, GFP_KERNEL);
+#ifdef CONFIG_ZONE_DMA
+ open_kmalloc_cache(&kmalloc_caches_dma[i],
+ "kmalloc_dma", 1 << i, GFP_KERNEL|SLQB_DMA);
+#endif
+ }
+
+ /*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * mips it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
+ (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+
+ for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
+ size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
+
+ /* Provide the correct kmalloc names now that the caches are up */
+ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
+ kmalloc_caches[i].name =
+ kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
+#ifdef CONFIG_ZONE_DMA
+ kmalloc_caches_dma[i].name =
+ kasprintf(GFP_KERNEL, "kmalloc_dma-%d", 1 << i);
+#endif
+ }
+
+#ifdef CONFIG_SMP
+ register_cpu_notifier(&slab_notifier);
+#endif
+#ifdef CONFIG_NUMA
+ hotplug_memory_notifier(slab_memory_callback, 1);
+#endif
+ /*
+ * smp_init() has not yet been called, so no worries about memory
+ * ordering with __slab_is_available.
+ */
+ __slab_is_available = 1;
+}
+
+void __init kmem_cache_init_late(void)
+{
+}
+
+/*
+ * Some basic slab creation sanity checks
+ */
+static int kmem_cache_create_ok(const char *name, size_t size,
+ size_t align, unsigned long flags)
+{
+ struct kmem_cache *tmp;
+
+ /*
+ * Sanity checks... these are all serious usage bugs.
+ */
+ if (!name || in_interrupt() || (size < sizeof(void *))) {
+ printk(KERN_ERR "kmem_cache_create(): early error in slab %s\n",
+ name);
+ dump_stack();
+
+ return 0;
+ }
+
+ down_read(&slqb_lock);
+
+ list_for_each_entry(tmp, &slab_caches, list) {
+ char x;
+ int res;
+
+ /*
+ * This happens when the module gets unloaded and doesn't
+ * destroy its slab cache and no-one else reuses the vmalloc
+ * area of the module. Print a warning.
+ */
+ res = probe_kernel_address(tmp->name, x);
+ if (res) {
+ printk(KERN_ERR
+ "SLAB: cache with size %d has lost its name\n",
+ tmp->size);
+ continue;
+ }
+
+ if (!strcmp(tmp->name, name)) {
+ printk(KERN_ERR
+ "SLAB: duplicate cache %s\n", name);
+ dump_stack();
+ up_read(&slqb_lock);
+
+ return 0;
+ }
+ }
+
+ up_read(&slqb_lock);
+
+ WARN_ON(strchr(name, ' ')); /* It confuses parsers */
+ if (flags & SLAB_DESTROY_BY_RCU)
+ WARN_ON(flags & SLAB_POISON);
+
+ return 1;
+}
+
+struct kmem_cache *kmem_cache_create(const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *))
+{
+ struct kmem_cache *s;
+
+ if (!kmem_cache_create_ok(name, size, align, flags))
+ goto err;
+
+ s = kmem_cache_alloc(&kmem_cache_cache, GFP_KERNEL);
+ if (!s)
+ goto err;
+
+ if (kmem_cache_open(s, name, size, align, flags, ctor, 1))
+ return s;
+
+ kmem_cache_free(&kmem_cache_cache, s);
+
+err:
+ if (flags & SLAB_PANIC)
+ panic("%s: failed to create slab `%s'\n", __func__, name);
+
+ return NULL;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+#ifdef CONFIG_SMP
+/*
+ * Use the cpu notifier to insure that the cpu slabs are flushed when
+ * necessary.
+ */
+static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ struct kmem_cache *s;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ down_write(&slqb_lock);
+ list_for_each_entry(s, &slab_caches, list) {
+ if (s->cpu_slab[cpu]) /* could be lefover last online */
+ continue;
+ s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu);
+ if (!s->cpu_slab[cpu]) {
+ up_read(&slqb_lock);
+ return NOTIFY_BAD;
+ }
+ }
+ up_write(&slqb_lock);
+ break;
+
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ start_cpu_timer(cpu);
+ break;
+
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ cancel_rearming_delayed_work(&per_cpu(cache_trim_work, cpu));
+ per_cpu(cache_trim_work, cpu).work.func = NULL;
+ break;
+
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ /*
+ * XXX: Freeing here doesn't work because objects can still be
+ * on this CPU's list. periodic timer needs to check if a CPU
+ * is offline and then try to cleanup from there. Same for node
+ * offline.
+ */
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata slab_notifier = {
+ .notifier_call = slab_cpuup_callback
+};
+
+#endif
+
+#ifdef CONFIG_SLQB_DEBUG
+void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
+{
+ struct kmem_cache *s;
+ int node = -1;
+
+ s = get_slab(size, flags);
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+#ifdef CONFIG_NUMA
+ if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
+ node = alternate_nid(s, flags, node);
+#endif
+ return slab_alloc(s, flags, node, caller);
+}
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
+ unsigned long caller)
+{
+ struct kmem_cache *s;
+
+ s = get_slab(size, flags);
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ return slab_alloc(s, flags, node, caller);
+}
+#endif
+
+#if defined(CONFIG_SLQB_SYSFS) || defined(CONFIG_SLABINFO)
+struct stats_gather {
+ struct kmem_cache *s;
+ spinlock_t lock;
+ unsigned long nr_slabs;
+ unsigned long nr_partial;
+ unsigned long nr_inuse;
+ unsigned long nr_objects;
+
+#ifdef CONFIG_SLQB_STATS
+ unsigned long stats[NR_SLQB_STAT_ITEMS];
+#endif
+};
+
+static void __gather_stats(void *arg)
+{
+ unsigned long nr_slabs;
+ unsigned long nr_partial;
+ unsigned long nr_inuse;
+ struct stats_gather *gather = arg;
+ int cpu = smp_processor_id();
+ struct kmem_cache *s = gather->s;
+ struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ struct kmem_cache_list *l = &c->list;
+ struct slqb_page *page;
+#ifdef CONFIG_SLQB_STATS
+ int i;
+#endif
+
+ spin_lock(&l->page_lock);
+ nr_slabs = l->nr_slabs;
+ nr_partial = l->nr_partial;
+ nr_inuse = (nr_slabs - nr_partial) * s->objects;
+
+ list_for_each_entry(page, &l->partial, lru) {
+ nr_inuse += page->inuse;
+ }
+ spin_unlock(&l->page_lock);
+
+ spin_lock(&gather->lock);
+ gather->nr_slabs += nr_slabs;
+ gather->nr_partial += nr_partial;
+ gather->nr_inuse += nr_inuse;
+#ifdef CONFIG_SLQB_STATS
+ for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
+ gather->stats[i] += l->stats[i];
+#endif
+ spin_unlock(&gather->lock);
+}
+
+/* must be called with slqb_lock held */
+static void gather_stats_locked(struct kmem_cache *s,
+ struct stats_gather *stats)
+{
+#ifdef CONFIG_NUMA
+ int node;
+#endif
+
+ memset(stats, 0, sizeof(struct stats_gather));
+ stats->s = s;
+ spin_lock_init(&stats->lock);
+
+ on_each_cpu(__gather_stats, stats, 1);
+
+#ifdef CONFIG_NUMA
+ for_each_online_node(node) {
+ struct kmem_cache_node *n = s->node_slab[node];
+ struct kmem_cache_list *l = &n->list;
+ struct slqb_page *page;
+ unsigned long flags;
+#ifdef CONFIG_SLQB_STATS
+ int i;
+#endif
+
+ spin_lock_irqsave(&n->list_lock, flags);
+#ifdef CONFIG_SLQB_STATS
+ for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
+ stats->stats[i] += l->stats[i];
+#endif
+ stats->nr_slabs += l->nr_slabs;
+ stats->nr_partial += l->nr_partial;
+ stats->nr_inuse += (l->nr_slabs - l->nr_partial) * s->objects;
+
+ list_for_each_entry(page, &l->partial, lru) {
+ stats->nr_inuse += page->inuse;
+ }
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
+#endif
+
+ stats->nr_objects = stats->nr_slabs * s->objects;
+}
+
+#ifdef CONFIG_SLQB_SYSFS
+static void gather_stats(struct kmem_cache *s, struct stats_gather *stats)
+{
+ down_read(&slqb_lock); /* hold off hotplug */
+ gather_stats_locked(s, stats);
+ up_read(&slqb_lock);
+}
+#endif
+#endif
+
+/*
+ * The /proc/slabinfo ABI
+ */
+#ifdef CONFIG_SLABINFO
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+ssize_t slabinfo_write(struct file *file, const char __user * buffer,
+ size_t count, loff_t *ppos)
+{
+ return -EINVAL;
+}
+
+static void print_slabinfo_header(struct seq_file *m)
+{
+ seq_puts(m, "slabinfo - version: 2.1\n");
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
+ "<objperslab> <pagesperslab>");
+ seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+ seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+ seq_putc(m, '\n');
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t n = *pos;
+
+ down_read(&slqb_lock);
+ if (!n)
+ print_slabinfo_header(m);
+
+ return seq_list_start(&slab_caches, *pos);
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &slab_caches, pos);
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ up_read(&slqb_lock);
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct stats_gather stats;
+ struct kmem_cache *s;
+
+ s = list_entry(p, struct kmem_cache, list);
+
+ gather_stats_locked(s, &stats);
+
+ seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, stats.nr_inuse,
+ stats.nr_objects, s->size, s->objects, (1 << s->order));
+ seq_printf(m, " : tunables %4u %4u %4u", slab_hiwater(s),
+ slab_freebatch(s), 0);
+ seq_printf(m, " : slabdata %6lu %6lu %6lu", stats.nr_slabs,
+ stats.nr_slabs, 0UL);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static const struct seq_operations slabinfo_op = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &slabinfo_op);
+}
+
+static const struct file_operations proc_slabinfo_operations = {
+ .open = slabinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init slab_proc_init(void)
+{
+ proc_create("slabinfo", S_IWUSR|S_IRUGO, NULL,
+ &proc_slabinfo_operations);
+ return 0;
+}
+module_init(slab_proc_init);
+#endif /* CONFIG_SLABINFO */
+
+#ifdef CONFIG_SLQB_SYSFS
+/*
+ * sysfs API
+ */
+#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
+#define to_slab(n) container_of(n, struct kmem_cache, kobj);
+
+struct slab_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct kmem_cache *s, char *buf);
+ ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
+};
+
+#define SLAB_ATTR_RO(_name) \
+ static struct slab_attribute _name##_attr = __ATTR_RO(_name)
+
+#define SLAB_ATTR(_name) \
+ static struct slab_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->size);
+}
+SLAB_ATTR_RO(slab_size);
+
+static ssize_t align_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->align);
+}
+SLAB_ATTR_RO(align);
+
+static ssize_t object_size_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->objsize);
+}
+SLAB_ATTR_RO(object_size);
+
+static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->objects);
+}
+SLAB_ATTR_RO(objs_per_slab);
+
+static ssize_t order_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->order);
+}
+SLAB_ATTR_RO(order);
+
+static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+{
+ if (s->ctor) {
+ int n = sprint_symbol(buf, (unsigned long)s->ctor);
+
+ return n + sprintf(buf + n, "\n");
+ }
+ return 0;
+}
+SLAB_ATTR_RO(ctor);
+
+static ssize_t slabs_show(struct kmem_cache *s, char *buf)
+{
+ struct stats_gather stats;
+
+ gather_stats(s, &stats);
+
+ return sprintf(buf, "%lu\n", stats.nr_slabs);
+}
+SLAB_ATTR_RO(slabs);
+
+static ssize_t objects_show(struct kmem_cache *s, char *buf)
+{
+ struct stats_gather stats;
+
+ gather_stats(s, &stats);
+
+ return sprintf(buf, "%lu\n", stats.nr_inuse);
+}
+SLAB_ATTR_RO(objects);
+
+static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
+{
+ struct stats_gather stats;
+
+ gather_stats(s, &stats);
+
+ return sprintf(buf, "%lu\n", stats.nr_objects);
+}
+SLAB_ATTR_RO(total_objects);
+
+static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
+}
+SLAB_ATTR_RO(reclaim_account);
+
+static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
+}
+SLAB_ATTR_RO(hwcache_align);
+
+#ifdef CONFIG_ZONE_DMA
+static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
+}
+SLAB_ATTR_RO(cache_dma);
+#endif
+
+static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
+}
+SLAB_ATTR_RO(destroy_by_rcu);
+
+static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
+}
+SLAB_ATTR_RO(red_zone);
+
+static ssize_t poison_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
+}
+SLAB_ATTR_RO(poison);
+
+static ssize_t store_user_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
+}
+SLAB_ATTR_RO(store_user);
+
+static ssize_t hiwater_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ long hiwater;
+ int err;
+
+ err = strict_strtol(buf, 10, &hiwater);
+ if (err)
+ return err;
+
+ if (hiwater < 0)
+ return -EINVAL;
+
+ s->hiwater = hiwater;
+
+ return length;
+}
+
+static ssize_t hiwater_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", slab_hiwater(s));
+}
+SLAB_ATTR(hiwater);
+
+static ssize_t freebatch_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ long freebatch;
+ int err;
+
+ err = strict_strtol(buf, 10, &freebatch);
+ if (err)
+ return err;
+
+ if (freebatch <= 0 || freebatch - 1 > s->hiwater)
+ return -EINVAL;
+
+ s->freebatch = freebatch;
+
+ return length;
+}
+
+static ssize_t freebatch_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", slab_freebatch(s));
+}
+SLAB_ATTR(freebatch);
+
+#ifdef CONFIG_SLQB_STATS
+static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
+{
+ struct stats_gather stats;
+ int len;
+#ifdef CONFIG_SMP
+ int cpu;
+#endif
+
+ gather_stats(s, &stats);
+
+ len = sprintf(buf, "%lu", stats.stats[si]);
+
+#ifdef CONFIG_SMP
+ for_each_online_cpu(cpu) {
+ struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ struct kmem_cache_list *l = &c->list;
+
+ if (len < PAGE_SIZE - 20)
+ len += sprintf(buf+len, " C%d=%lu", cpu, l->stats[si]);
+ }
+#endif
+ return len + sprintf(buf + len, "\n");
+}
+
+#define STAT_ATTR(si, text) \
+static ssize_t text##_show(struct kmem_cache *s, char *buf) \
+{ \
+ return show_stat(s, buf, si); \
+} \
+SLAB_ATTR_RO(text); \
+
+STAT_ATTR(ALLOC, alloc);
+STAT_ATTR(ALLOC_SLAB_FILL, alloc_slab_fill);
+STAT_ATTR(ALLOC_SLAB_NEW, alloc_slab_new);
+STAT_ATTR(FREE, free);
+STAT_ATTR(FREE_REMOTE, free_remote);
+STAT_ATTR(FLUSH_FREE_LIST, flush_free_list);
+STAT_ATTR(FLUSH_FREE_LIST_OBJECTS, flush_free_list_objects);
+STAT_ATTR(FLUSH_FREE_LIST_REMOTE, flush_free_list_remote);
+STAT_ATTR(FLUSH_SLAB_PARTIAL, flush_slab_partial);
+STAT_ATTR(FLUSH_SLAB_FREE, flush_slab_free);
+STAT_ATTR(FLUSH_RFREE_LIST, flush_rfree_list);
+STAT_ATTR(FLUSH_RFREE_LIST_OBJECTS, flush_rfree_list_objects);
+STAT_ATTR(CLAIM_REMOTE_LIST, claim_remote_list);
+STAT_ATTR(CLAIM_REMOTE_LIST_OBJECTS, claim_remote_list_objects);
+#endif
+
+static struct attribute *slab_attrs[] = {
+ &slab_size_attr.attr,
+ &object_size_attr.attr,
+ &objs_per_slab_attr.attr,
+ &order_attr.attr,
+ &objects_attr.attr,
+ &total_objects_attr.attr,
+ &slabs_attr.attr,
+ &ctor_attr.attr,
+ &align_attr.attr,
+ &hwcache_align_attr.attr,
+ &reclaim_account_attr.attr,
+ &destroy_by_rcu_attr.attr,
+ &red_zone_attr.attr,
+ &poison_attr.attr,
+ &store_user_attr.attr,
+ &hiwater_attr.attr,
+ &freebatch_attr.attr,
+#ifdef CONFIG_ZONE_DMA
+ &cache_dma_attr.attr,
+#endif
+#ifdef CONFIG_SLQB_STATS
+ &alloc_attr.attr,
+ &alloc_slab_fill_attr.attr,
+ &alloc_slab_new_attr.attr,
+ &free_attr.attr,
+ &free_remote_attr.attr,
+ &flush_free_list_attr.attr,
+ &flush_free_list_objects_attr.attr,
+ &flush_free_list_remote_attr.attr,
+ &flush_slab_partial_attr.attr,
+ &flush_slab_free_attr.attr,
+ &flush_rfree_list_attr.attr,
+ &flush_rfree_list_objects_attr.attr,
+ &claim_remote_list_attr.attr,
+ &claim_remote_list_objects_attr.attr,
+#endif
+ NULL
+};
+
+static struct attribute_group slab_attr_group = {
+ .attrs = slab_attrs,
+};
+
+static ssize_t slab_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct slab_attribute *attribute;
+ struct kmem_cache *s;
+ int err;
+
+ attribute = to_slab_attr(attr);
+ s = to_slab(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ err = attribute->show(s, buf);
+
+ return err;
+}
+
+static ssize_t slab_attr_store(struct kobject *kobj,
+ struct attribute *attr, const char *buf, size_t len)
+{
+ struct slab_attribute *attribute;
+ struct kmem_cache *s;
+ int err;
+
+ attribute = to_slab_attr(attr);
+ s = to_slab(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ err = attribute->store(s, buf, len);
+
+ return err;
+}
+
+static void kmem_cache_release(struct kobject *kobj)
+{
+ struct kmem_cache *s = to_slab(kobj);
+
+ kmem_cache_free(&kmem_cache_cache, s);
+}
+
+static struct sysfs_ops slab_sysfs_ops = {
+ .show = slab_attr_show,
+ .store = slab_attr_store,
+};
+
+static struct kobj_type slab_ktype = {
+ .sysfs_ops = &slab_sysfs_ops,
+ .release = kmem_cache_release
+};
+
+static int uevent_filter(struct kset *kset, struct kobject *kobj)
+{
+ struct kobj_type *ktype = get_ktype(kobj);
+
+ if (ktype == &slab_ktype)
+ return 1;
+ return 0;
+}
+
+static struct kset_uevent_ops slab_uevent_ops = {
+ .filter = uevent_filter,
+};
+
+static struct kset *slab_kset;
+
+static int sysfs_available __read_mostly;
+
+static int sysfs_slab_add(struct kmem_cache *s)
+{
+ int err;
+
+ if (!sysfs_available)
+ return 0;
+
+ s->kobj.kset = slab_kset;
+ err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, s->name);
+ if (err) {
+ kobject_put(&s->kobj);
+ return err;
+ }
+
+ err = sysfs_create_group(&s->kobj, &slab_attr_group);
+ if (err)
+ return err;
+
+ kobject_uevent(&s->kobj, KOBJ_ADD);
+
+ return 0;
+}
+
+static void sysfs_slab_remove(struct kmem_cache *s)
+{
+ kobject_uevent(&s->kobj, KOBJ_REMOVE);
+ kobject_del(&s->kobj);
+ kobject_put(&s->kobj);
+}
+
+static int __init slab_sysfs_init(void)
+{
+ struct kmem_cache *s;
+ int err;
+
+ slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
+ if (!slab_kset) {
+ printk(KERN_ERR "Cannot register slab subsystem.\n");
+ return -ENOSYS;
+ }
+
+ down_write(&slqb_lock);
+
+ sysfs_available = 1;
+
+ list_for_each_entry(s, &slab_caches, list) {
+ err = sysfs_slab_add(s);
+ if (err)
+ printk(KERN_ERR "SLQB: Unable to add boot slab %s"
+ " to sysfs\n", s->name);
+ }
+
+ up_write(&slqb_lock);
+
+ return 0;
+}
+device_initcall(slab_sysfs_init);
+
+#endif
diff --git a/mm/slub.c b/mm/slub.c
index b9f1491a58a1..6cdae26a82fd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -141,6 +141,13 @@
SLAB_POISON | SLAB_STORE_USER)
/*
+ * Debugging flags that require metadata to be stored in the slab, up to
+ * DEBUG_SIZE in size.
+ */
+#define DEBUG_SIZE_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
+#define DEBUG_SIZE (3 * sizeof(void *) + 2 * sizeof(struct track))
+
+/*
* Set of flags that will prevent slab merging
*/
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
@@ -325,6 +332,7 @@ static int slub_debug;
#endif
static char *slub_debug_slabs;
+static int disable_higher_order_debug;
/*
* Object debugging
@@ -976,6 +984,15 @@ static int __init setup_slub_debug(char *str)
*/
goto check_slabs;
+ if (tolower(*str) == 'o') {
+ /*
+ * Avoid enabling debugging on caches if its minimum order
+ * would increase as a result.
+ */
+ disable_higher_order_debug = 1;
+ goto out;
+ }
+
slub_debug = 0;
if (*str == '-')
/*
@@ -1022,13 +1039,27 @@ static unsigned long kmem_cache_flags(unsigned long objsize,
unsigned long flags, const char *name,
void (*ctor)(void *))
{
+ int debug_flags = slub_debug;
+
/*
* Enable debugging if selected on the kernel commandline.
*/
- if (slub_debug && (!slub_debug_slabs ||
- strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0))
- flags |= slub_debug;
+ if (debug_flags) {
+ if (slub_debug_slabs &&
+ strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))
+ goto out;
+
+ /*
+ * Disable debugging that increases slab size if the minimum
+ * slab order would have increased as a result.
+ */
+ if (disable_higher_order_debug &&
+ get_order(objsize + DEBUG_SIZE) > get_order(objsize))
+ debug_flags &= ~DEBUG_SIZE_FLAGS;
+ flags |= debug_flags;
+ }
+out:
return flags;
}
#else
@@ -1560,6 +1591,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
"default order: %d, min order: %d\n", s->name, s->objsize,
s->size, oo_order(s->oo), oo_order(s->min));
+ if (oo_order(s->min) > get_order(s->objsize))
+ printk(KERN_WARNING " %s debugging increased min order, use "
+ "slub_debug=O to disable.\n", s->name);
+
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
unsigned long nr_slabs;
@@ -2091,8 +2126,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
*/
#define NR_KMEM_CACHE_CPU 100
-static DEFINE_PER_CPU(struct kmem_cache_cpu,
- kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
+static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
+ kmem_cache_cpu);
static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..5ae6b8b78c80 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
};
static struct backing_dev_info swap_backing_dev_info = {
+ .name = "swap",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
.unplug_io_fn = swap_unplug_io_fn,
};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dea7abd31098..fe8e986bb6a6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1715,7 +1715,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
if (total_scanned > sc->swap_cluster_max +
sc->swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+ wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
sc->may_writepage = 1;
}