author    Kent Overstreet <koverstreet@google.com>  2013-06-12 19:12:36 -0700
committer Kent Overstreet <koverstreet@google.com>  2013-06-14 15:58:30 -0700
commit    37afdfd7ecbb4d07b176b61f7f80502ae475f3ad
tree      666ea3ab9c44f84cdfdf4a1308499ba05dd155b7
parent    9db900062844e23377ee76b69554ade1ae41ae3b
Percpu tag allocator
Allocates integers out of a predefined range [0..nr_tags) - for use by e.g. a
driver to allocate tags for communicating with the device; typically these
integers will be used as indices into a preallocated array of tag structs.

Allocation is percpu. With percpu allocation (that works), it's impossible to
guarantee it will always be possible to allocate all nr_tags - typically, some
will be stuck on a remote percpu freelist where the current job can't get to
them.

We do guarantee that it will always be possible to allocate at least
(nr_tags / 2) tags - this is done by keeping track of which and how many cpus
have tags on their percpu freelists. On allocation failure, if enough cpus have
tags that there could potentially be (nr_tags / 2) tags stuck on remote percpu
freelists, we pick a remote cpu at random to steal from.

Note that the synchronization is _definitely_ tricky - we're using
xchg()/cmpxchg() on the percpu freelists to synchronize with steal_tags(). The
alternative would've been adding a spinlock to protect the percpu freelists,
but that would've required some different tricky code to avoid deadlock because
of the lock ordering.

Note that there's no cpu hotplug notifier - we don't care, because steal_tags()
will eventually get the down cpu's tags. We _could_ satisfy more allocations if
we had a notifier - but we'll still meet our guarantees and it's absolutely not
a correctness issue, so I don't think it's worth the extra code.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org>
 include/linux/percpu-tags.h |  92
 lib/Makefile                |   2
 lib/percpu-tags.c           | 359
 3 files changed, 452 insertions(+), 1 deletion(-)
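
The intended usage pattern described above - tags as indices into a
preallocated array of per-request structs - looks roughly like this (a minimal
sketch, not part of the patch; the foo_* names and struct foo_request are
hypothetical):

	#include <linux/percpu-tags.h>
	#include <linux/slab.h>

	struct foo_request {
		unsigned long data;	/* ... per-command state ... */
	};

	struct foo_device {
		struct percpu_tag_pool tags;
		struct foo_request *requests;	/* nr_tags preallocated entries */
	};

	static int foo_setup_tags(struct foo_device *dev, unsigned nr_tags)
	{
		int ret;

		dev->requests = kcalloc(nr_tags, sizeof(*dev->requests), GFP_KERNEL);
		if (!dev->requests)
			return -ENOMEM;

		ret = percpu_tag_pool_init(&dev->tags, nr_tags);
		if (ret)
			kfree(dev->requests);
		return ret;
	}

	static struct foo_request *foo_get_request(struct foo_device *dev)
	{
		unsigned tag = percpu_tag_alloc(&dev->tags, GFP_KERNEL);

		if (tag == TAG_FAIL)	/* cannot happen with __GFP_WAIT; shown for clarity */
			return NULL;

		return &dev->requests[tag];	/* the tag doubles as the array index */
	}

	static void foo_put_request(struct foo_device *dev, struct foo_request *req)
	{
		percpu_tag_free(&dev->tags, req - dev->requests);
	}
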
diff --git a/include/linux/percpu-tags.h b/include/linux/percpu-tags.h
new file mode 100644
index 000000000000..0f1e7b31d421
--- /dev/null
+++ b/include/linux/percpu-tags.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2012 Google Inc. All Rights Reserved.
+ * Author: koverstreet@google.com (Kent Overstreet)
+ *
+ * Per cpu tag allocator. Allocates small integers - up to nr_tags passed to
+ * percpu_tag_pool_init() - for use with, say, driver tag structures for talking
+ * to a device.
+ *
+ * It works by caching tags on percpu freelists, and then tags are
+ * allocated/freed from the global freelist in batches.
+ *
+ * Note that it will in general be impossible to allocate all nr_tags tags,
+ * since some tags will be stranded on other cpus' freelists: but we guarantee
+ * that nr_tags / 2 can always be allocated.
+ *
+ * This is done by keeping track of which cpus have tags on their percpu
+ * freelists in a bitmap, and then on allocation failure if too many cpus have
+ * tags on their freelists - i.e. if cpus_have_tags * TAG_CPU_SIZE (64) >
+ * nr_tags / 2 - then we steal one remote cpu's freelist (effectively picked at
+ * random).
+ *
+ * This means that if a workload spans a huge number of cpus - in relation to
+ * the number of tags that can be allocated - performance will suffer somewhat;
+ * but as long as the workload is bounded to a reasonable number of cpus the
+ * percpu-ness of the allocator will not be affected.
+ */
+
+#ifndef _LINUX_TAGS_H
+#define _LINUX_TAGS_H
+
+#include <linux/spinlock_types.h>
+#include <linux/wait.h>
+
+struct percpu_tag_cpu;
+
+struct percpu_tag_pool {
+ /*
+ * number of tags available to be allocated, as passed to
+ * percpu_tag_pool_init()
+ */
+ unsigned nr_tags;
+
+ struct percpu_tag_cpu __percpu *tag_cpu;
+
+ /*
+ * Bitmap of cpus that (may) have tags on their percpu freelists:
+ * steal_tags() uses this to decide when to steal tags, and which cpus
+ * to try stealing from.
+ *
+ * It's ok for a freelist to be empty when its bit is set - steal_tags()
+ * will just keep looking - but the bitmap _must_ be set whenever a
+ * percpu freelist does have tags.
+ */
+ unsigned long *cpus_have_tags;
+
+ struct {
+ spinlock_t lock;
+ /*
+ * When we go to steal tags from another cpu (see steal_tags()),
+ * we want to pick a cpu at random. Cycling through them every
+ * time we steal is a bit easier and more or less equivalent:
+ */
+ unsigned cpu_last_stolen;
+
+ /*
+ * Global freelist - it's a stack where nr_free points to the
+ * top
+ */
+ unsigned nr_free;
+ unsigned *freelist;
+
+ /* For sleeping on allocation failure */
+ wait_queue_head_t wait;
+ } ____cacheline_aligned_in_smp;
+};
+
+unsigned percpu_tag_alloc(struct percpu_tag_pool *pool, gfp_t gfp);
+void percpu_tag_free(struct percpu_tag_pool *pool, unsigned tag);
+
+void percpu_tag_pool_free(struct percpu_tag_pool *pool);
+int percpu_tag_pool_init(struct percpu_tag_pool *pool, unsigned long nr_tags);
+
+enum {
+ /*
+ * TAG_FAIL is returned on allocation failure, TAG_MAX is the max
+ * nr_tags you can pass to percpu_tag_pool_init()
+ */
+ TAG_FAIL = -1U,
+ TAG_MAX = TAG_FAIL - 1,
+};
+
+#endif
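
To make the stealing threshold described in the header comment concrete:
steal_tags() only engages while the cpus that may be caching tags could,
between them, be stranding more than half the pool. A worked illustration
(the helper below is hypothetical, not part of the patch):

	static inline bool percpu_tags_worth_stealing(unsigned cpus_have_tags,
						      unsigned nr_tags)
	{
		/* TAG_CPU_SIZE is 64 in this patch (TAG_CPU_BATCH_MOVE * 2) */
		return cpus_have_tags * 64 > nr_tags / 2;
	}

	/*
	 * e.g. nr_tags = 256: with 2 cpus caching tags, 2 * 64 = 128 is not
	 * > 128, so no stealing is attempted; with 3 cpus, 3 * 64 = 192 > 128
	 * and steal_tags() starts raiding remote freelists.
	 */
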
diff --git a/lib/Makefile b/lib/Makefile
index c55a037a354e..e54a65094dec 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
- earlycpio.o
+ earlycpio.o percpu-tags.o
obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/percpu-tags.c b/lib/percpu-tags.c
new file mode 100644
index 000000000000..2c5d55e3c69e
--- /dev/null
+++ b/lib/percpu-tags.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright 2012 Google Inc. All Rights Reserved.
+ * Author: koverstreet@google.com (Kent Overstreet)
+ *
+ * Per cpu tag allocator.
+ */
+
+#include <asm/cmpxchg.h>
+#include <linux/bitmap.h>
+#include <linux/freezer.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/percpu-tags.h>
+
+/*
+ * Number of tags we move between the percpu freelist and the global freelist at
+ * a time
+ */
+#define TAG_CPU_BATCH_MOVE 32U
+
+/* Max size of percpu freelist */
+#define TAG_CPU_SIZE (TAG_CPU_BATCH_MOVE * 2)
+
+/*
+ * When we're stealing tags from a remote cpu's freelist, we need to avoid
+ * racing with alloc/free on that cpu - steal_tags() will set nr_free on its
+ * freelist to TAG_CPU_STEALING while it copies tags away.
+ */
+#define TAG_CPU_STEALING UINT_MAX
+
+struct percpu_tag_cpu {
+ unsigned nr_free;
+ unsigned freelist[];
+};
+
+static inline void move_tags(unsigned *dst, unsigned *dst_nr,
+ unsigned *src, unsigned *src_nr,
+ unsigned nr)
+{
+ *src_nr -= nr;
+ memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
+ *dst_nr += nr;
+}
+
+/*
+ * Try to steal tags from a remote cpu's percpu freelist.
+ *
+ * We first check how many percpu freelists have tags - we don't steal tags
+ * unless enough percpu freelists have tags on them that it's possible more than
+ * half the total tags could be stuck on remote percpu freelists.
+ *
+ * Then we iterate through the cpus until we find some tags - we don't attempt
+ * to find the "best" cpu to steal from, to keep cacheline bouncing to a
+ * minimum.
+ *
+ * Returns true on success (our percpu freelist is no longer empty), false on
+ * failure.
+ */
+static inline bool steal_tags(struct percpu_tag_pool *pool,
+ struct percpu_tag_cpu *tags)
+{
+ unsigned nr_free, cpus_have_tags, cpu = pool->cpu_last_stolen;
+ struct percpu_tag_cpu *remote;
+
+ for (cpus_have_tags = bitmap_weight(pool->cpus_have_tags, nr_cpu_ids);
+ cpus_have_tags * TAG_CPU_SIZE > pool->nr_tags / 2;
+ cpus_have_tags--) {
+ cpu = find_next_bit(pool->cpus_have_tags, nr_cpu_ids, cpu);
+
+ if (cpu == nr_cpu_ids)
+ cpu = find_first_bit(pool->cpus_have_tags, nr_cpu_ids);
+
+ if (cpu == nr_cpu_ids)
+ BUG();
+
+ pool->cpu_last_stolen = cpu;
+ remote = per_cpu_ptr(pool->tag_cpu, cpu);
+
+ if (remote == tags)
+ continue;
+
+ clear_bit(cpu, pool->cpus_have_tags);
+
+ nr_free = xchg(&remote->nr_free, TAG_CPU_STEALING);
+
+ if (nr_free) {
+ memcpy(tags->freelist,
+ remote->freelist,
+ sizeof(unsigned) * nr_free);
+
+ /*
+ * Setting remote->nr_free is effectively the unlock - so we need a
+ * barrier between the memcpy() and it, pairing with the barrier in
+ * percpu_tag_free()
+ */
+ smp_mb();
+ remote->nr_free = 0;
+
+ tags->nr_free = nr_free;
+ return true;
+ } else {
+ remote->nr_free = 0;
+ }
+ }
+
+ return false;
+}
+
+static inline bool alloc_global_tags(struct percpu_tag_pool *pool,
+ struct percpu_tag_cpu *tags)
+{
+ if (pool->nr_free) {
+ move_tags(tags->freelist, &tags->nr_free,
+ pool->freelist, &pool->nr_free,
+ min(pool->nr_free, TAG_CPU_BATCH_MOVE));
+ return true;
+ }
+
+ return false;
+}
+
+static inline unsigned alloc_local_tag(struct percpu_tag_pool *pool,
+ struct percpu_tag_cpu *tags)
+{
+ unsigned nr_free, old, new, tag;
+
+ /*
+ * Try per cpu freelist
+ * Since we don't hold the global lock, we need to use cmpxchg()
+ * to guard against a different thread using steal_tags() on us:
+ */
+ nr_free = tags->nr_free;
+
+ do {
+ if (unlikely(!nr_free || nr_free == TAG_CPU_STEALING))
+ return TAG_FAIL;
+
+ old = nr_free;
+ new = old - 1;
+ tag = tags->freelist[new];
+
+ nr_free = cmpxchg(&tags->nr_free, old, new);
+ } while (unlikely(nr_free != old));
+
+ return tag;
+}
+
+/**
+ * percpu_tag_alloc - allocate a tag
+ * @pool: pool to allocate from
+ * @gfp: gfp flags
+ *
+ * Returns a tag - an integer in the range [0..nr_tags) (passed to
+ * percpu_tag_pool_init()), or TAG_FAIL on allocation failure.
+ *
+ * Safe to be called from interrupt context (assuming it isn't passed
+ * __GFP_WAIT, of course).
+ *
+ * Will not fail if passed __GFP_WAIT.
+ */
+unsigned percpu_tag_alloc(struct percpu_tag_pool *pool, gfp_t gfp)
+{
+ DEFINE_WAIT(wait);
+ struct percpu_tag_cpu *tags;
+ unsigned long flags;
+ unsigned tag, this_cpu;
+
+ local_irq_save(flags);
+ this_cpu = smp_processor_id();
+ tags = per_cpu_ptr(pool->tag_cpu, this_cpu);
+
+ /* Fastpath */
+ tag = alloc_local_tag(pool, tags);
+ if (likely(tag != TAG_FAIL)) {
+ local_irq_restore(flags);
+ return tag;
+ }
+
+ while (1) {
+ spin_lock(&pool->lock);
+
+ /*
+ * prepare_to_wait() must come before steal_tags(), in case
+ * percpu_tag_free() on another cpu flips a bit in
+ * cpus_have_tags
+ */
+ prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ /*
+ * alloc_global_tags(), steal_tags() return true iff we now have
+ * tags on our percpu freelist
+ */
+ if (tags->nr_free ||
+ alloc_global_tags(pool, tags) ||
+ steal_tags(pool, tags)) {
+ /* Global lock held, don't need cmpxchg */
+ tag = tags->freelist[--tags->nr_free];
+ if (tags->nr_free)
+ set_bit(this_cpu, pool->cpus_have_tags);
+ }
+
+ spin_unlock(&pool->lock);
+ local_irq_restore(flags);
+
+ if (tag != TAG_FAIL || !(gfp & __GFP_WAIT))
+ break;
+
+ schedule();
+ try_to_freeze();
+
+ local_irq_save(flags);
+ this_cpu = smp_processor_id();
+ tags = per_cpu_ptr(pool->tag_cpu, this_cpu);
+ }
+
+ finish_wait(&pool->wait, &wait);
+ return tag;
+}
+EXPORT_SYMBOL_GPL(percpu_tag_alloc);
+
+/**
+ * percpu_tag_free - free a tag
+ * @pool: pool @tag was allocated from
+ * @tag: a tag previously allocated with percpu_tag_alloc()
+ *
+ * Safe to be called from interrupt context.
+ */
+void percpu_tag_free(struct percpu_tag_pool *pool, unsigned tag)
+{
+ struct percpu_tag_cpu *tags;
+ unsigned long flags;
+ unsigned nr_free, old, new, this_cpu;
+
+ BUG_ON(tag >= pool->nr_tags);
+
+ local_irq_save(flags);
+ this_cpu = smp_processor_id();
+ tags = per_cpu_ptr(pool->tag_cpu, this_cpu);
+
+ /*
+ * Need to guard against racing with steal_tags() on another cpu - we
+ * can manage with just cmpxchg because we can only race with tags being
+ * pulled off our freelist, not other threads pushing tags back onto our
+ * freelist
+ */
+ nr_free = tags->nr_free;
+
+ do {
+ while (unlikely(nr_free == TAG_CPU_STEALING)) {
+ cpu_relax();
+ nr_free = tags->nr_free;
+ }
+
+ smp_mb();
+
+ old = nr_free;
+ new = old + 1;
+ tags->freelist[old] = tag;
+
+ nr_free = cmpxchg(&tags->nr_free, old, new);
+ } while (unlikely(nr_free != old));
+
+ if (!nr_free) {
+ set_bit(this_cpu, pool->cpus_have_tags);
+ wake_up(&pool->wait);
+ }
+
+ if (new == TAG_CPU_SIZE) {
+ spin_lock(&pool->lock);
+ /* Might have had our tags stolen before we took global lock */
+ if (tags->nr_free == TAG_CPU_SIZE) {
+ move_tags(pool->freelist, &pool->nr_free,
+ tags->freelist, &tags->nr_free,
+ TAG_CPU_BATCH_MOVE);
+
+ wake_up(&pool->wait);
+ }
+ spin_unlock(&pool->lock);
+ }
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(percpu_tag_free);
+
+/**
+ * percpu_tag_pool_free - release a tag pool's resources
+ * @pool: pool to free
+ *
+ * Frees the resources allocated by percpu_tag_pool_init().
+ */
+void percpu_tag_pool_free(struct percpu_tag_pool *pool)
+{
+ free_percpu(pool->tag_cpu);
+ kfree(pool->cpus_have_tags);
+ free_pages((unsigned long) pool->freelist,
+ get_order(pool->nr_tags * sizeof(unsigned)));
+}
+EXPORT_SYMBOL_GPL(percpu_tag_pool_free);
+
+/**
+ * percpu_tag_pool_init - initialize a percpu tag pool
+ * @pool: pool to initialize
+ * @nr_tags: number of tags that will be available for allocation
+ *
+ * Initializes @pool so that it can be used to allocate tags - integers in the
+ * range [0, nr_tags). Typically, they'll be used by driver code to refer to a
+ * preallocated array of tag structures.
+ *
+ * Allocation is percpu, but sharding is limited by nr_tags - for best
+ * performance, the workload should not span more cpus than nr_tags / 128.
+ */
+int percpu_tag_pool_init(struct percpu_tag_pool *pool, unsigned long nr_tags)
+{
+ unsigned i, order;
+
+ memset(pool, 0, sizeof(*pool));
+
+ spin_lock_init(&pool->lock);
+ init_waitqueue_head(&pool->wait);
+ pool->nr_tags = nr_tags;
+
+ /* Guard against overflow */
+ if (nr_tags > TAG_MAX) {
+ pr_err("percpu-tags.c: nr_tags too large\n");
+ return -EINVAL;
+ }
+
+ order = get_order(nr_tags * sizeof(unsigned));
+ pool->freelist = (void *) __get_free_pages(GFP_KERNEL, order);
+ if (!pool->freelist)
+ goto err;
+
+ for (i = 0; i < nr_tags; i++)
+ pool->freelist[i] = i;
+
+ pool->nr_free = nr_tags;
+
+ pool->cpus_have_tags = kzalloc(BITS_TO_LONGS(nr_cpu_ids) *
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!pool->cpus_have_tags)
+ goto err;
+
+ pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_tag_cpu) +
+ TAG_CPU_SIZE * sizeof(unsigned),
+ sizeof(unsigned));
+ if (!pool->tag_cpu)
+ goto err;
+
+ return 0;
+err:
+ percpu_tag_pool_free(pool);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(percpu_tag_pool_init);
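
As the percpu_tag_alloc() kerneldoc above states, the gfp flags decide whether
a caller must handle TAG_FAIL or may rely on the allocation sleeping until a
tag is freed - roughly (sketch only; dev->tags is from the hypothetical
example after the diffstat):

	/* Atomic/interrupt context: no __GFP_WAIT, so TAG_FAIL must be handled */
	tag = percpu_tag_alloc(&dev->tags, GFP_NOWAIT);
	if (tag == TAG_FAIL)
		return -EBUSY;	/* e.g. tell the caller to retry later */

	/*
	 * Process context with __GFP_WAIT (GFP_KERNEL includes it): never
	 * returns TAG_FAIL, sleeps on the pool's waitqueue until a tag is freed
	 */
	tag = percpu_tag_alloc(&dev->tags, GFP_KERNEL);
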