diff options
Diffstat (limited to 'lib')
53 files changed, 3190 insertions, 622 deletions
diff --git a/lib/Kconfig b/lib/Kconfig index d79909dc01ec..260a80e313b9 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -457,9 +457,6 @@ config NLATTR config GENERIC_ATOMIC64 bool -config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - def_bool y if GENERIC_ATOMIC64 - config LRU_CACHE tristate @@ -550,4 +547,7 @@ config STACKDEPOT bool select STACKTRACE +config SBITMAP + bool + endmenu diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b9cfdbfae9aa..b01e547d4d04 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -198,6 +198,7 @@ config FRAME_WARN int "Warn for stack frames larger than (needs gcc 4.4)" range 0 8192 default 0 if KASAN + default 2048 if GCC_PLUGIN_LATENT_ENTROPY default 1024 if !64BIT default 2048 if 64BIT help @@ -244,6 +245,7 @@ config PAGE_OWNER depends on DEBUG_KERNEL && STACKTRACE_SUPPORT select DEBUG_FS select STACKTRACE + select STACKDEPOT select PAGE_EXTENSION help This keeps track of what call chain is the owner of a page, may @@ -304,7 +306,7 @@ config DEBUG_SECTION_MISMATCH a larger kernel). - Run the section mismatch analysis for each module/built-in.o file. When we run the section mismatch analysis on vmlinux.o, we - lose valueble information about where the mismatch was + lose valuable information about where the mismatch was introduced. Running the analysis for each module/built-in.o file tells where the mismatch happens much closer to the @@ -708,6 +710,8 @@ config KCOV bool "Code coverage for fuzzing" depends on ARCH_HAS_KCOV select DEBUG_FS + select GCC_PLUGINS if !COMPILE_TEST + select GCC_PLUGIN_SANCOV if !COMPILE_TEST help KCOV exposes kernel code coverage information in a form suitable for coverage-guided fuzzing (randomized testing). @@ -718,6 +722,17 @@ config KCOV For more details, see Documentation/kcov.txt. +config KCOV_INSTRUMENT_ALL + bool "Instrument all code by default" + depends on KCOV + default y if KCOV + help + If you are doing generic system call fuzzing (like e.g. syzkaller), + then you will want to instrument the whole kernel and you should + say y here. If you are doing more targeted fuzzing (like e.g. + filesystem fuzzing with AFL) then you will want to enable coverage + for more specific subsets of files, and should say n here. + config DEBUG_SHIRQ bool "Debug shared IRQ handlers" depends on DEBUG_KERNEL @@ -807,7 +822,7 @@ config DETECT_HUNG_TASK help Say Y here to enable the kernel to detect "hung tasks", which are bugs that cause the task to be stuck in - uninterruptible "D" state indefinitiley. + uninterruptible "D" state indefinitely. When a hung task is detected, the kernel will print the current stack trace (which you should report), but the @@ -1307,22 +1322,6 @@ config RCU_PERF_TEST Say M if you want the RCU performance tests to build as a module. Say N if you are unsure. -config RCU_PERF_TEST_RUNNABLE - bool "performance tests for RCU runnable by default" - depends on RCU_PERF_TEST = y - default n - help - This option provides a way to build the RCU performance tests - directly into the kernel without them starting up at boot time. - You can use /sys/module to manually override this setting. - This /proc file is available only when the RCU performance - tests have been built into the kernel. - - Say Y here if you want the RCU performance tests to start during - boot (you probably don't). - Say N here if you want the RCU performance tests to start only - after being manually enabled via /sys/module. - config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL @@ -1340,23 +1339,6 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. -config RCU_TORTURE_TEST_RUNNABLE - bool "torture tests for RCU runnable by default" - depends on RCU_TORTURE_TEST = y - default n - help - This option provides a way to build the RCU torture tests - directly into the kernel without them starting up at boot - time. You can use /proc/sys/kernel/rcutorture_runnable - to manually override this setting. This /proc file is - available only when the RCU torture tests have been built - into the kernel. - - Say Y here if you want the RCU torture tests to start during - boot (you probably don't). - Say N here if you want the RCU torture tests to start only - after being manually enabled via /proc. - config RCU_TORTURE_TEST_SLOW_PREINIT bool "Slow down RCU grace-period pre-initialization to expose races" depends on RCU_TORTURE_TEST @@ -1705,24 +1687,6 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. -config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS - bool - -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict user copy size checks" - depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - help - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, say N. - source kernel/trace/Kconfig menu "Runtime Testing" @@ -1894,15 +1858,6 @@ config PROVIDE_OHCI1394_DMA_INIT See Documentation/debugging-via-ohci1394.txt for more information. -config BUILD_DOCSRC - bool "Build targets in Documentation/ tree" - depends on HEADERS_CHECK - help - This option attempts to build objects from the source files in the - kernel Documentation/ tree. - - Say N if you are unsure. - config DMA_API_DEBUG bool "Enable debugging of DMA-API usage" depends on HAVE_DMA_API_DEBUG diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 67d8c6838ba9..bd38aab05929 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -5,9 +5,9 @@ if HAVE_ARCH_KASAN config KASAN bool "KASan: runtime memory debugger" - depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) + depends on SLUB || (SLAB && !DEBUG_SLAB) select CONSTRUCTORS - select STACKDEPOT if SLAB + select STACKDEPOT help Enables kernel address sanitizer - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 39494af9a84a..bc6e651df68c 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -1,6 +1,9 @@ config ARCH_HAS_UBSAN_SANITIZE_ALL bool +config ARCH_WANTS_UBSAN_NO_NULL + def_bool n + config UBSAN bool "Undefined behaviour sanity checker" help @@ -34,3 +37,11 @@ config UBSAN_ALIGNMENT This option enables detection of unaligned memory accesses. Enabling this option on architectures that support unaligned accesses may produce a lot of false positives. + +config UBSAN_NULL + bool "Enable checking of null pointers" + depends on UBSAN + default y if !ARCH_WANTS_UBSAN_NO_NULL + help + This option enables detection of memory accesses via a + null pointer. diff --git a/lib/Makefile b/lib/Makefile index ff6a7a6c6395..50144a3aeebd 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -15,19 +15,15 @@ KCOV_INSTRUMENT_rbtree.o := n KCOV_INSTRUMENT_list_debug.o := n KCOV_INSTRUMENT_debugobjects.o := n KCOV_INSTRUMENT_dynamic_debug.o := n -# Kernel does not boot if we instrument this file as it uses custom calling -# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS). -KCOV_INSTRUMENT_hweight.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ - sha1.o md5.o irq_regs.o argv_split.o \ + sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o -obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o lib-$(CONFIG_HAS_DMA) += dma-noop.o @@ -74,8 +70,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -GCOV_PROFILE_hweight.o := n -CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o @@ -186,6 +180,7 @@ obj-$(CONFIG_IRQ_POLL) += irq_poll.o obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n +KCOV_INSTRUMENT_stackdepot.o := n libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ fdt_empty_tree.o @@ -233,3 +228,5 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o obj-$(CONFIG_UBSAN) += ubsan.o UBSAN_SANITIZE_ubsan.o := n + +obj-$(CONFIG_SBITMAP) += sbitmap.o diff --git a/lib/atomic64.c b/lib/atomic64.c index 2886ebac6567..53c2d5edc826 100644 --- a/lib/atomic64.c +++ b/lib/atomic64.c @@ -96,17 +96,41 @@ long long atomic64_##op##_return(long long a, atomic64_t *v) \ } \ EXPORT_SYMBOL(atomic64_##op##_return); +#define ATOMIC64_FETCH_OP(op, c_op) \ +long long atomic64_fetch_##op(long long a, atomic64_t *v) \ +{ \ + unsigned long flags; \ + raw_spinlock_t *lock = lock_addr(v); \ + long long val; \ + \ + raw_spin_lock_irqsave(lock, flags); \ + val = v->counter; \ + v->counter c_op a; \ + raw_spin_unlock_irqrestore(lock, flags); \ + return val; \ +} \ +EXPORT_SYMBOL(atomic64_fetch_##op); + #define ATOMIC64_OPS(op, c_op) \ ATOMIC64_OP(op, c_op) \ - ATOMIC64_OP_RETURN(op, c_op) + ATOMIC64_OP_RETURN(op, c_op) \ + ATOMIC64_FETCH_OP(op, c_op) ATOMIC64_OPS(add, +=) ATOMIC64_OPS(sub, -=) -ATOMIC64_OP(and, &=) -ATOMIC64_OP(or, |=) -ATOMIC64_OP(xor, ^=) #undef ATOMIC64_OPS +#define ATOMIC64_OPS(op, c_op) \ + ATOMIC64_OP(op, c_op) \ + ATOMIC64_OP_RETURN(op, c_op) \ + ATOMIC64_FETCH_OP(op, c_op) + +ATOMIC64_OPS(and, &=) +ATOMIC64_OPS(or, |=) +ATOMIC64_OPS(xor, ^=) + +#undef ATOMIC64_OPS +#undef ATOMIC64_FETCH_OP #undef ATOMIC64_OP_RETURN #undef ATOMIC64_OP diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 123481814320..46042901130f 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -53,11 +53,25 @@ do { \ BUG_ON(atomic##bit##_read(&v) != r); \ } while (0) +#define TEST_FETCH(bit, op, c_op, val) \ +do { \ + atomic##bit##_set(&v, v0); \ + r = v0; \ + r c_op val; \ + BUG_ON(atomic##bit##_##op(val, &v) != v0); \ + BUG_ON(atomic##bit##_read(&v) != r); \ +} while (0) + #define RETURN_FAMILY_TEST(bit, op, c_op, val) \ do { \ FAMILY_TEST(TEST_RETURN, bit, op, c_op, val); \ } while (0) +#define FETCH_FAMILY_TEST(bit, op, c_op, val) \ +do { \ + FAMILY_TEST(TEST_FETCH, bit, op, c_op, val); \ +} while (0) + #define TEST_ARGS(bit, op, init, ret, expect, args...) \ do { \ atomic##bit##_set(&v, init); \ @@ -114,6 +128,16 @@ static __init void test_atomic(void) RETURN_FAMILY_TEST(, sub_return, -=, onestwos); RETURN_FAMILY_TEST(, sub_return, -=, -one); + FETCH_FAMILY_TEST(, fetch_add, +=, onestwos); + FETCH_FAMILY_TEST(, fetch_add, +=, -one); + FETCH_FAMILY_TEST(, fetch_sub, -=, onestwos); + FETCH_FAMILY_TEST(, fetch_sub, -=, -one); + + FETCH_FAMILY_TEST(, fetch_or, |=, v1); + FETCH_FAMILY_TEST(, fetch_and, &=, v1); + FETCH_FAMILY_TEST(, fetch_andnot, &= ~, v1); + FETCH_FAMILY_TEST(, fetch_xor, ^=, v1); + INC_RETURN_FAMILY_TEST(, v0); DEC_RETURN_FAMILY_TEST(, v0); @@ -154,6 +178,16 @@ static __init void test_atomic64(void) RETURN_FAMILY_TEST(64, sub_return, -=, onestwos); RETURN_FAMILY_TEST(64, sub_return, -=, -one); + FETCH_FAMILY_TEST(64, fetch_add, +=, onestwos); + FETCH_FAMILY_TEST(64, fetch_add, +=, -one); + FETCH_FAMILY_TEST(64, fetch_sub, -=, onestwos); + FETCH_FAMILY_TEST(64, fetch_sub, -=, -one); + + FETCH_FAMILY_TEST(64, fetch_or, |=, v1); + FETCH_FAMILY_TEST(64, fetch_and, &=, v1); + FETCH_FAMILY_TEST(64, fetch_andnot, &= ~, v1); + FETCH_FAMILY_TEST(64, fetch_xor, ^=, v1); + INIT(v0); atomic64_inc(&v); r += one; @@ -179,7 +213,6 @@ static __init void test_atomic64(void) r += one; BUG_ON(v.counter != r); -#ifdef CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE INIT(onestwos); BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1)); r -= one; @@ -192,9 +225,6 @@ static __init void test_atomic64(void) INIT(-one); BUG_ON(atomic64_dec_if_positive(&v) != (-one - one)); BUG_ON(v.counter != r); -#else -#warning Please implement atomic64_dec_if_positive for your architecture and select the above Kconfig symbol -#endif INIT(onestwos); BUG_ON(!atomic64_inc_not_zero(&v)); diff --git a/lib/bitmap.c b/lib/bitmap.c index c66da508cbf7..0b66f0e5eb6b 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -14,9 +14,9 @@ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/string.h> +#include <linux/uaccess.h> #include <asm/page.h> -#include <asm/uaccess.h> /* * bitmaps provide an array of bits, implemented using an an @@ -496,6 +496,11 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf); * ranges. Consecutively set bits are shown as two hyphen-separated * decimal numbers, the smallest and largest bit numbers set in * the range. + * Optionally each range can be postfixed to denote that only parts of it + * should be set. The range will divided to groups of specific size. + * From each group will be used only defined amount of bits. + * Syntax: range:used_size/group_size + * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 * * Returns 0 on success, -errno on invalid input strings. * Error values: @@ -507,16 +512,20 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, int is_user, unsigned long *maskp, int nmaskbits) { - unsigned a, b; + unsigned int a, b, old_a, old_b; + unsigned int group_size, used_size; int c, old_c, totaldigits, ndigits; const char __user __force *ubuf = (const char __user __force *)buf; - int at_start, in_range; + int at_start, in_range, in_partial_range; totaldigits = c = 0; + old_a = old_b = 0; + group_size = used_size = 0; bitmap_zero(maskp, nmaskbits); do { at_start = 1; in_range = 0; + in_partial_range = 0; a = b = 0; ndigits = totaldigits; @@ -547,6 +556,24 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, if ((totaldigits != ndigits) && isspace(old_c)) return -EINVAL; + if (c == '/') { + used_size = a; + at_start = 1; + in_range = 0; + a = b = 0; + continue; + } + + if (c == ':') { + old_a = a; + old_b = b; + at_start = 1; + in_range = 0; + in_partial_range = 1; + a = b = 0; + continue; + } + if (c == '-') { if (at_start || in_range) return -EINVAL; @@ -567,15 +594,30 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, } if (ndigits == totaldigits) continue; + if (in_partial_range) { + group_size = a; + a = old_a; + b = old_b; + old_a = old_b = 0; + } /* if no digit is after '-', it's wrong*/ if (at_start && in_range) return -EINVAL; - if (!(a <= b)) + if (!(a <= b) || !(used_size <= group_size)) return -EINVAL; if (b >= nmaskbits) return -ERANGE; while (a <= b) { - set_bit(a, maskp); + if (in_partial_range) { + static int pos_in_group = 1; + + if (pos_in_group <= used_size) + set_bit(a, maskp); + + if (a == b || ++pos_in_group > group_size) + pos_in_group = 1; + } else + set_bit(a, maskp); a++; } } while (buflen && c == ','); diff --git a/lib/chacha20.c b/lib/chacha20.c new file mode 100644 index 000000000000..250ceed9ec9a --- /dev/null +++ b/lib/chacha20.c @@ -0,0 +1,79 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539 + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/bitops.h> +#include <linux/cryptohash.h> +#include <asm/unaligned.h> +#include <crypto/chacha20.h> + +static inline u32 rotl32(u32 v, u8 n) +{ + return (v << n) | (v >> (sizeof(v) * 8 - n)); +} + +extern void chacha20_block(u32 *state, void *stream) +{ + u32 x[16], *out = stream; + int i; + + for (i = 0; i < ARRAY_SIZE(x); i++) + x[i] = state[i]; + + for (i = 0; i < 20; i += 2) { + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16); + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16); + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16); + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16); + + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12); + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12); + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12); + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12); + + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8); + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8); + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8); + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8); + + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7); + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7); + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7); + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7); + + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16); + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16); + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16); + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16); + + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12); + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12); + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12); + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12); + + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8); + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8); + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8); + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8); + + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7); + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7); + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7); + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7); + } + + for (i = 0; i < ARRAY_SIZE(x); i++) + out[i] = cpu_to_le32(x[i] + state[i]); + + state[12]++; +} +EXPORT_SYMBOL(chacha20_block); diff --git a/lib/cpu-notifier-error-inject.c b/lib/cpu-notifier-error-inject.c index 707ca24f7b18..0e2c9a1e958a 100644 --- a/lib/cpu-notifier-error-inject.c +++ b/lib/cpu-notifier-error-inject.c @@ -8,16 +8,47 @@ static int priority; module_param(priority, int, 0); MODULE_PARM_DESC(priority, "specify cpu notifier priority"); +#define UP_PREPARE 0 +#define UP_PREPARE_FROZEN 0 +#define DOWN_PREPARE 0 +#define DOWN_PREPARE_FROZEN 0 + static struct notifier_err_inject cpu_notifier_err_inject = { .actions = { - { NOTIFIER_ERR_INJECT_ACTION(CPU_UP_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_UP_PREPARE_FROZEN) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_DOWN_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_DOWN_PREPARE_FROZEN) }, + { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE) }, + { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE_FROZEN) }, + { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE) }, + { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE_FROZEN) }, {} } }; +static int notf_err_handle(struct notifier_err_inject_action *action) +{ + int ret; + + ret = action->error; + if (ret) + pr_info("Injecting error (%d) to %s\n", ret, action->name); + return ret; +} + +static int notf_err_inj_up_prepare(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) + return notf_err_handle(&cpu_notifier_err_inject.actions[0]); + else + return notf_err_handle(&cpu_notifier_err_inject.actions[1]); +} + +static int notf_err_inj_dead(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) + return notf_err_handle(&cpu_notifier_err_inject.actions[2]); + else + return notf_err_handle(&cpu_notifier_err_inject.actions[3]); +} + static struct dentry *dir; static int err_inject_init(void) @@ -29,7 +60,10 @@ static int err_inject_init(void) if (IS_ERR(dir)) return PTR_ERR(dir); - err = register_hotcpu_notifier(&cpu_notifier_err_inject.nb); + err = cpuhp_setup_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE, + "cpu-err-notif:prepare", + notf_err_inj_up_prepare, + notf_err_inj_dead); if (err) debugfs_remove_recursive(dir); @@ -38,7 +72,7 @@ static int err_inject_init(void) static void err_inject_exit(void) { - unregister_hotcpu_notifier(&cpu_notifier_err_inject.nb); + cpuhp_remove_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE); debugfs_remove_recursive(dir); } diff --git a/lib/crc32.c b/lib/crc32.c index 9a907d489d95..7fbd1a112b9d 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -979,7 +979,6 @@ static int __init crc32c_test(void) int i; int errors = 0; int bytes = 0; - struct timespec start, stop; u64 nsec; unsigned long flags; @@ -999,20 +998,17 @@ static int __init crc32c_test(void) local_irq_save(flags); local_irq_disable(); - getnstimeofday(&start); + nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { if (test[i].crc32c_le != __crc32c_le(test[i].crc, test_buf + test[i].start, test[i].length)) errors++; } - getnstimeofday(&stop); + nsec = ktime_get_ns() - nsec; local_irq_restore(flags); local_irq_enable(); - nsec = stop.tv_nsec - start.tv_nsec + - 1000000000 * (stop.tv_sec - start.tv_sec); - pr_info("crc32c: CRC_LE_BITS = %d\n", CRC_LE_BITS); if (errors) @@ -1065,7 +1061,6 @@ static int __init crc32_test(void) int i; int errors = 0; int bytes = 0; - struct timespec start, stop; u64 nsec; unsigned long flags; @@ -1088,7 +1083,7 @@ static int __init crc32_test(void) local_irq_save(flags); local_irq_disable(); - getnstimeofday(&start); + nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { if (test[i].crc_le != crc32_le(test[i].crc, test_buf + test[i].start, test[i].length)) @@ -1098,14 +1093,11 @@ static int __init crc32_test(void) test[i].start, test[i].length)) errors++; } - getnstimeofday(&stop); + nsec = ktime_get_ns() - nsec; local_irq_restore(flags); local_irq_enable(); - nsec = stop.tv_nsec - start.tv_nsec + - 1000000000 * (stop.tv_sec - start.tv_sec); - pr_info("crc32: CRC_LE_BITS = %d, CRC_BE BITS = %d\n", CRC_LE_BITS, CRC_BE_BITS); diff --git a/lib/digsig.c b/lib/digsig.c index 07be6c1ef4e2..55b8b2f41a9e 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -104,21 +104,25 @@ static int digsig_verify_rsa(struct key *key, datap = pkh->mpi; endp = ukp->data + ukp->datalen; - err = -ENOMEM; - for (i = 0; i < pkh->nmpi; i++) { unsigned int remaining = endp - datap; pkey[i] = mpi_read_from_buffer(datap, &remaining); - if (!pkey[i]) + if (IS_ERR(pkey[i])) { + err = PTR_ERR(pkey[i]); goto err; + } datap += remaining; } mblen = mpi_get_nbits(pkey[0]); mlen = DIV_ROUND_UP(mblen, 8); - if (mlen == 0) + if (mlen == 0) { + err = -EINVAL; goto err; + } + + err = -ENOMEM; out1 = kzalloc(mlen, GFP_KERNEL); if (!out1) @@ -126,8 +130,10 @@ static int digsig_verify_rsa(struct key *key, nret = siglen; in = mpi_read_from_buffer(sig, &nret); - if (!in) + if (IS_ERR(in)) { + err = PTR_ERR(in); goto err; + } res = mpi_alloc(mpi_get_nlimbs(in) * 2); if (!res) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 51a76af25c66..8971370bfb16 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -22,6 +22,7 @@ #include <linux/stacktrace.h> #include <linux/dma-debug.h> #include <linux/spinlock.h> +#include <linux/vmalloc.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -43,6 +44,7 @@ enum { dma_debug_page, dma_debug_sg, dma_debug_coherent, + dma_debug_resource, }; enum map_err_types { @@ -150,8 +152,9 @@ static const char *const maperr2str[] = { [MAP_ERR_CHECKED] = "dma map error checked", }; -static const char *type2name[4] = { "single", "page", - "scather-gather", "coherent" }; +static const char *type2name[5] = { "single", "page", + "scather-gather", "coherent", + "resource" }; static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", "DMA_FROM_DEVICE", "DMA_NONE" }; @@ -253,6 +256,7 @@ static int hash_fn(struct dma_debug_entry *entry) */ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, unsigned long *flags) + __acquires(&dma_entry_hash[idx].lock) { int idx = hash_fn(entry); unsigned long __flags; @@ -267,6 +271,7 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, */ static void put_hash_bucket(struct hash_bucket *bucket, unsigned long *flags) + __releases(&bucket->lock) { unsigned long __flags = *flags; @@ -397,6 +402,9 @@ static void hash_bucket_del(struct dma_debug_entry *entry) static unsigned long long phys_addr(struct dma_debug_entry *entry) { + if (entry->type == dma_debug_resource) + return __pfn_to_phys(entry->pfn) + entry->offset; + return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; } @@ -1162,11 +1170,32 @@ static void check_unmap(struct dma_debug_entry *ref) put_hash_bucket(bucket, &flags); } -static void check_for_stack(struct device *dev, void *addr) +static void check_for_stack(struct device *dev, + struct page *page, size_t offset) { - if (object_is_on_stack(addr)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from " - "stack [addr=%p]\n", addr); + void *addr; + struct vm_struct *stack_vm_area = task_stack_vm_area(current); + + if (!stack_vm_area) { + /* Stack is direct-mapped. */ + if (PageHighMem(page)) + return; + addr = page_address(page) + offset; + if (object_is_on_stack(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); + } else { + /* Stack is vmalloced. */ + int i; + + for (i = 0; i < stack_vm_area->nr_pages; i++) { + if (page != stack_vm_area->pages[i]) + continue; + + addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); + break; + } + } } static inline bool overlap(void *addr, unsigned long len, void *start, void *end) @@ -1289,10 +1318,11 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, if (map_single) entry->type = dma_debug_single; + check_for_stack(dev, page, offset); + if (!PageHighMem(page)) { void *addr = page_address(page) + offset; - check_for_stack(dev, addr); check_for_illegal_area(dev, addr, size); } @@ -1384,8 +1414,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->sg_call_ents = nents; entry->sg_mapped_ents = mapped_ents; + check_for_stack(dev, sg_page(s), s->offset); + if (!PageHighMem(sg_page(s))) { - check_for_stack(dev, sg_virt(s)); check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); } @@ -1493,6 +1524,49 @@ void debug_dma_free_coherent(struct device *dev, size_t size, } EXPORT_SYMBOL(debug_dma_free_coherent); +void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, + int direction, dma_addr_t dma_addr) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_resource; + entry->dev = dev; + entry->pfn = PHYS_PFN(addr); + entry->offset = offset_in_page(addr); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + entry->map_err_type = MAP_ERR_NOT_CHECKED; + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_resource); + +void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + struct dma_debug_entry ref = { + .type = dma_debug_resource, + .dev = dev, + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_resource); + void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { diff --git a/lib/dma-noop.c b/lib/dma-noop.c index 72145646857e..3d766e78fbe2 100644 --- a/lib/dma-noop.c +++ b/lib/dma-noop.c @@ -10,7 +10,7 @@ static void *dma_noop_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, - struct dma_attrs *attrs) + unsigned long attrs) { void *ret; @@ -22,7 +22,7 @@ static void *dma_noop_alloc(struct device *dev, size_t size, static void dma_noop_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, - struct dma_attrs *attrs) + unsigned long attrs) { free_pages((unsigned long)cpu_addr, get_order(size)); } @@ -30,13 +30,14 @@ static void dma_noop_free(struct device *dev, size_t size, static dma_addr_t dma_noop_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { return page_to_phys(page) + offset; } static int dma_noop_map_sg(struct device *dev, struct scatterlist *sgl, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, + unsigned long attrs) { int i; struct scatterlist *sg; diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index fe42b6ec3f0c..da796e2dc4f5 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -188,6 +188,13 @@ static int ddebug_change(const struct ddebug_query *query, newflags = (dp->flags & mask) | flags; if (newflags == dp->flags) continue; +#ifdef HAVE_JUMP_LABEL + if (dp->flags & _DPRINTK_FLAGS_PRINT) { + if (!(flags & _DPRINTK_FLAGS_PRINT)) + static_branch_disable(&dp->key.dd_key_true); + } else if (flags & _DPRINTK_FLAGS_PRINT) + static_branch_enable(&dp->key.dd_key_true); +#endif dp->flags = newflags; vpr_info("changed %s:%d [%s]%s =%s\n", trim_prefix(dp->filename), dp->lineno, diff --git a/lib/earlycpio.c b/lib/earlycpio.c index 3eb3e4722b8e..db283ba4d2c1 100644 --- a/lib/earlycpio.c +++ b/lib/earlycpio.c @@ -125,7 +125,10 @@ struct cpio_data find_cpio_data(const char *path, void *data, if ((ch[C_MODE] & 0170000) == 0100000 && ch[C_NAMESIZE] >= mypathsize && !memcmp(p, path, mypathsize)) { - *nextoff = (long)nptr - (long)data; + + if (nextoff) + *nextoff = (long)nptr - (long)data; + if (ch[C_NAMESIZE] - mypathsize >= MAX_CPIO_FILE_NAME) { pr_warn( "File %s exceeding MAX_CPIO_FILE_NAME [%d]\n", diff --git a/lib/genalloc.c b/lib/genalloc.c index 0a1139644d32..144fe6b1a03e 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -292,7 +292,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, struct gen_pool_chunk *chunk; unsigned long addr = 0; int order = pool->min_alloc_order; - int nbits, start_bit = 0, end_bit, remain; + int nbits, start_bit, end_bit, remain; #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG BUG_ON(in_nmi()); @@ -307,6 +307,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, if (size > atomic_read(&chunk->avail)) continue; + start_bit = 0; end_bit = chunk_size(chunk) >> order; retry: start_bit = algo(chunk->bits, end_bit, start_bit, diff --git a/lib/hweight.c b/lib/hweight.c index 9a5c1f221558..43273a7d83cf 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,6 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. */ +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned int __sw_hweight32(unsigned int w) { #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER @@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w) #endif } EXPORT_SYMBOL(__sw_hweight32); +#endif unsigned int __sw_hweight16(unsigned int w) { @@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w) } EXPORT_SYMBOL(__sw_hweight8); +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 @@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w) #endif } EXPORT_SYMBOL(__sw_hweight64); +#endif diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index c27e269210c4..a816f3a80625 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -29,8 +29,7 @@ again: index = bitmap_find_next_zero_area(map, size, start, nr, align_mask); if (index < size) { if (iommu_is_span_boundary(index, nr, shift, boundary_size)) { - /* we could do more effectively */ - start = index + 1; + start = ALIGN(shift + index, boundary_size) - shift; goto again; } bitmap_set(map, index, nr); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 0cd522753ff5..f2bd21b93dfc 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -3,8 +3,11 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/splice.h> #include <net/checksum.h> +#define PIPE_PARANOIA /* for now */ + #define iterate_iovec(i, n, __v, __p, skip, STEP) { \ size_t left; \ size_t wanted = n; \ @@ -56,37 +59,24 @@ n = wanted; \ } -#define iterate_bvec(i, n, __v, __p, skip, STEP) { \ - size_t wanted = n; \ - __p = i->bvec; \ - __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ - if (likely(__v.bv_len)) { \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset + skip; \ - (void)(STEP); \ - skip += __v.bv_len; \ - n -= __v.bv_len; \ - } \ - while (unlikely(n)) { \ - __p++; \ - __v.bv_len = min_t(size_t, n, __p->bv_len); \ - if (unlikely(!__v.bv_len)) \ +#define iterate_bvec(i, n, __v, __bi, skip, STEP) { \ + struct bvec_iter __start; \ + __start.bi_size = n; \ + __start.bi_bvec_done = skip; \ + __start.bi_idx = 0; \ + for_each_bvec(__v, i->bvec, __bi, __start) { \ + if (!__v.bv_len) \ continue; \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset; \ (void)(STEP); \ - skip = __v.bv_len; \ - n -= __v.bv_len; \ } \ - n = wanted; \ } #define iterate_all_kinds(i, n, v, I, B, K) { \ size_t skip = i->iov_offset; \ if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ + struct bvec_iter __bi; \ + iterate_bvec(i, n, v, __bi, skip, (B)) \ } else if (unlikely(i->type & ITER_KVEC)) { \ const struct kvec *kvec; \ struct kvec v; \ @@ -104,15 +94,13 @@ if (i->count) { \ size_t skip = i->iov_offset; \ if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ + const struct bio_vec *bvec = i->bvec; \ struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ - if (skip == bvec->bv_len) { \ - bvec++; \ - skip = 0; \ - } \ - i->nr_segs -= bvec - i->bvec; \ - i->bvec = bvec; \ + struct bvec_iter __bi; \ + iterate_bvec(i, n, v, __bi, skip, (B)) \ + i->bvec = __bvec_iter_bvec(i->bvec, __bi); \ + i->nr_segs -= i->bvec - bvec; \ + skip = __bi.bi_bvec_done; \ } else if (unlikely(i->type & ITER_KVEC)) { \ const struct kvec *kvec; \ struct kvec v; \ @@ -159,7 +147,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b buf = iov->iov_base + skip; copy = min(bytes, iov->iov_len - skip); - if (!fault_in_pages_writeable(buf, copy)) { + if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) { kaddr = kmap_atomic(page); from = kaddr + offset; @@ -190,6 +178,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b copy = min(bytes, iov->iov_len - skip); } /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); from = kaddr + offset; left = __copy_to_user(buf, from, copy); @@ -208,6 +197,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b bytes -= copy; } kunmap(page); + done: if (skip == iov->iov_len) { iov++; @@ -240,7 +230,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t buf = iov->iov_base + skip; copy = min(bytes, iov->iov_len - skip); - if (!fault_in_pages_readable(buf, copy)) { + if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) { kaddr = kmap_atomic(page); to = kaddr + offset; @@ -271,6 +261,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t copy = min(bytes, iov->iov_len - skip); } /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); to = kaddr + offset; left = __copy_from_user(to, buf, copy); @@ -289,6 +280,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes -= copy; } kunmap(page); + done: if (skip == iov->iov_len) { iov++; @@ -301,25 +293,92 @@ done: return wanted - bytes; } -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). - */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +#ifdef PIPE_PARANOIA +static bool sanity(const struct iov_iter *i) { - if (!(i->type & (ITER_BVEC|ITER_KVEC))) { - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); + struct pipe_inode_info *pipe = i->pipe; + int idx = i->idx; + int next = pipe->curbuf + pipe->nrbufs; + if (i->iov_offset) { + struct pipe_buffer *p; + if (unlikely(!pipe->nrbufs)) + goto Bad; // pipe must be non-empty + if (unlikely(idx != ((next - 1) & (pipe->buffers - 1)))) + goto Bad; // must be at the last buffer... + + p = &pipe->bufs[idx]; + if (unlikely(p->offset + p->len != i->iov_offset)) + goto Bad; // ... at the end of segment + } else { + if (idx != (next & (pipe->buffers - 1))) + goto Bad; // must be right after the last buffer } - return 0; + return true; +Bad: + printk(KERN_ERR "idx = %d, offset = %zd\n", i->idx, i->iov_offset); + printk(KERN_ERR "curbuf = %d, nrbufs = %d, buffers = %d\n", + pipe->curbuf, pipe->nrbufs, pipe->buffers); + for (idx = 0; idx < pipe->buffers; idx++) + printk(KERN_ERR "[%p %p %d %d]\n", + pipe->bufs[idx].ops, + pipe->bufs[idx].page, + pipe->bufs[idx].offset, + pipe->bufs[idx].len); + WARN_ON(1); + return false; +} +#else +#define sanity(i) true +#endif + +static inline int next_idx(int idx, struct pipe_inode_info *pipe) +{ + return (idx + 1) & (pipe->buffers - 1); +} + +static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + struct pipe_buffer *buf; + size_t off; + int idx; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + if (!sanity(i)) + return 0; + + off = i->iov_offset; + idx = i->idx; + buf = &pipe->bufs[idx]; + if (off) { + if (offset == off && buf->page == page) { + /* merge with the last one */ + buf->len += bytes; + i->iov_offset += bytes; + goto out; + } + idx = next_idx(idx, pipe); + buf = &pipe->bufs[idx]; + } + if (idx == pipe->curbuf && pipe->nrbufs) + return 0; + pipe->nrbufs++; + buf->ops = &page_cache_pipe_buf_ops; + get_page(buf->page = page); + buf->offset = offset; + buf->len = bytes; + i->iov_offset = offset + bytes; + i->idx = idx; +out: + i->count -= bytes; + return bytes; } -EXPORT_SYMBOL(iov_iter_fault_in_readable); /* * Fault in one or more iovecs of the given iov_iter, to a maximum length of @@ -328,7 +387,7 @@ EXPORT_SYMBOL(iov_iter_fault_in_readable); * Return 0 on success, or non-zero if the memory could not be accessed (i.e. * because it is an invalid address). */ -int iov_iter_fault_in_multipages_readable(struct iov_iter *i, size_t bytes) +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) { size_t skip = i->iov_offset; const struct iovec *iov; @@ -337,15 +396,14 @@ int iov_iter_fault_in_multipages_readable(struct iov_iter *i, size_t bytes) if (!(i->type & (ITER_BVEC|ITER_KVEC))) { iterate_iovec(i, bytes, v, iov, skip, ({ - err = fault_in_multipages_readable(v.iov_base, - v.iov_len); + err = fault_in_pages_readable(v.iov_base, v.iov_len); if (unlikely(err)) return err; 0;})) } return 0; } -EXPORT_SYMBOL(iov_iter_fault_in_multipages_readable); +EXPORT_SYMBOL(iov_iter_fault_in_readable); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, @@ -387,9 +445,98 @@ static void memzero_page(struct page *page, size_t offset, size_t len) kunmap_atomic(addr); } +static inline bool allocated(struct pipe_buffer *buf) +{ + return buf->ops == &default_pipe_buf_ops; +} + +static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp) +{ + size_t off = i->iov_offset; + int idx = i->idx; + if (off && (!allocated(&i->pipe->bufs[idx]) || off == PAGE_SIZE)) { + idx = next_idx(idx, i->pipe); + off = 0; + } + *idxp = idx; + *offp = off; +} + +static size_t push_pipe(struct iov_iter *i, size_t size, + int *idxp, size_t *offp) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t off; + int idx; + ssize_t left; + + if (unlikely(size > i->count)) + size = i->count; + if (unlikely(!size)) + return 0; + + left = size; + data_start(i, &idx, &off); + *idxp = idx; + *offp = off; + if (off) { + left -= PAGE_SIZE - off; + if (left <= 0) { + pipe->bufs[idx].len += size; + return size; + } + pipe->bufs[idx].len = PAGE_SIZE; + idx = next_idx(idx, pipe); + } + while (idx != pipe->curbuf || !pipe->nrbufs) { + struct page *page = alloc_page(GFP_USER); + if (!page) + break; + pipe->nrbufs++; + pipe->bufs[idx].ops = &default_pipe_buf_ops; + pipe->bufs[idx].page = page; + pipe->bufs[idx].offset = 0; + if (left <= PAGE_SIZE) { + pipe->bufs[idx].len = left; + return size; + } + pipe->bufs[idx].len = PAGE_SIZE; + left -= PAGE_SIZE; + idx = next_idx(idx, pipe); + } + return size - left; +} + +static size_t copy_pipe_to_iter(const void *addr, size_t bytes, + struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t n, off; + int idx; + + if (!sanity(i)) + return 0; + + bytes = n = push_pipe(i, bytes, &idx, &off); + if (unlikely(!n)) + return 0; + for ( ; n; idx = next_idx(idx, pipe), off = 0) { + size_t chunk = min_t(size_t, n, PAGE_SIZE - off); + memcpy_to_page(pipe->bufs[idx].page, off, addr, chunk); + i->idx = idx; + i->iov_offset = off + chunk; + n -= chunk; + addr += chunk; + } + i->count -= bytes; + return bytes; +} + size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; + if (unlikely(i->type & ITER_PIPE)) + return copy_pipe_to_iter(addr, bytes, i); iterate_and_advance(i, bytes, v, __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), @@ -405,6 +552,10 @@ EXPORT_SYMBOL(copy_to_iter); size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -420,6 +571,10 @@ EXPORT_SYMBOL(copy_from_iter); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, __copy_from_user_nocache((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -440,14 +595,20 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, size_t wanted = copy_to_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); return wanted; - } else + } else if (likely(!(i->type & ITER_PIPE))) return copy_page_to_iter_iovec(page, offset, bytes, i); + else + return copy_page_to_iter_pipe(page, offset, bytes, i); } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } if (i->type & (ITER_BVEC|ITER_KVEC)) { void *kaddr = kmap_atomic(page); size_t wanted = copy_from_iter(kaddr + offset, bytes, i); @@ -458,8 +619,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_from_iter); +static size_t pipe_zero(size_t bytes, struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t n, off; + int idx; + + if (!sanity(i)) + return 0; + + bytes = n = push_pipe(i, bytes, &idx, &off); + if (unlikely(!n)) + return 0; + + for ( ; n; idx = next_idx(idx, pipe), off = 0) { + size_t chunk = min_t(size_t, n, PAGE_SIZE - off); + memzero_page(pipe->bufs[idx].page, off, chunk); + i->idx = idx; + i->iov_offset = off + chunk; + n -= chunk; + } + i->count -= bytes; + return bytes; +} + size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) + return pipe_zero(bytes, i); iterate_and_advance(i, bytes, v, __clear_user(v.iov_base, v.iov_len), memzero_page(v.bv_page, v.bv_offset, v.bv_len), @@ -474,6 +661,11 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { char *kaddr = kmap_atomic(page), *p = kaddr + offset; + if (unlikely(i->type & ITER_PIPE)) { + kunmap_atomic(kaddr); + WARN_ON(1); + return 0; + } iterate_all_kinds(i, bytes, v, __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -486,8 +678,51 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, } EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); +static void pipe_advance(struct iov_iter *i, size_t size) +{ + struct pipe_inode_info *pipe = i->pipe; + struct pipe_buffer *buf; + int idx = i->idx; + size_t off = i->iov_offset, orig_sz; + + if (unlikely(i->count < size)) + size = i->count; + orig_sz = size; + + if (size) { + if (off) /* make it relative to the beginning of buffer */ + size += off - pipe->bufs[idx].offset; + while (1) { + buf = &pipe->bufs[idx]; + if (size <= buf->len) + break; + size -= buf->len; + idx = next_idx(idx, pipe); + } + buf->len = size; + i->idx = idx; + off = i->iov_offset = buf->offset + size; + } + if (off) + idx = next_idx(idx, pipe); + if (pipe->nrbufs) { + int unused = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + /* [curbuf,unused) is in use. Free [idx,unused) */ + while (idx != unused) { + pipe_buf_release(pipe, &pipe->bufs[idx]); + idx = next_idx(idx, pipe); + pipe->nrbufs--; + } + } + i->count -= orig_sz; +} + void iov_iter_advance(struct iov_iter *i, size_t size) { + if (unlikely(i->type & ITER_PIPE)) { + pipe_advance(i, size); + return; + } iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); @@ -497,6 +732,8 @@ EXPORT_SYMBOL(iov_iter_advance); */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) + return i->count; // it is a silly place, anyway if (i->nr_segs == 1) return i->count; else if (i->type & ITER_BVEC) @@ -532,6 +769,19 @@ void iov_iter_bvec(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_bvec); +void iov_iter_pipe(struct iov_iter *i, int direction, + struct pipe_inode_info *pipe, + size_t count) +{ + BUG_ON(direction != ITER_PIPE); + i->type = direction; + i->pipe = pipe; + i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_pipe); + unsigned long iov_iter_alignment(const struct iov_iter *i) { unsigned long res = 0; @@ -540,6 +790,11 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) if (!size) return 0; + if (unlikely(i->type & ITER_PIPE)) { + if (i->iov_offset && allocated(&i->pipe->bufs[i->idx])) + return size | i->iov_offset; + return size; + } iterate_all_kinds(i, size, v, (res |= (unsigned long)v.iov_base | v.iov_len, 0), res |= v.bv_offset | v.bv_len, @@ -556,6 +811,11 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) if (!size) return 0; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return ~0U; + } + iterate_all_kinds(i, size, v, (res |= (!res ? 0 : (unsigned long)v.iov_base) | (size != v.iov_len ? size : 0), 0), @@ -568,6 +828,47 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_gap_alignment); +static inline size_t __pipe_get_pages(struct iov_iter *i, + size_t maxsize, + struct page **pages, + int idx, + size_t *start) +{ + struct pipe_inode_info *pipe = i->pipe; + ssize_t n = push_pipe(i, maxsize, &idx, start); + if (!n) + return -EFAULT; + + maxsize = n; + n += *start; + while (n > 0) { + get_page(*pages++ = pipe->bufs[idx].page); + idx = next_idx(idx, pipe); + n -= PAGE_SIZE; + } + + return maxsize; +} + +static ssize_t pipe_get_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, unsigned maxpages, + size_t *start) +{ + unsigned npages; + size_t capacity; + int idx; + + if (!sanity(i)) + return -EFAULT; + + data_start(i, &idx, start); + /* some of this one + all after this one */ + npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1; + capacity = min(npages,maxpages) * PAGE_SIZE - *start; + + return __pipe_get_pages(i, min(maxsize, capacity), pages, idx, start); +} + ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) @@ -578,6 +879,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (!maxsize) return 0; + if (unlikely(i->type & ITER_PIPE)) + return pipe_get_pages(i, pages, maxsize, maxpages, start); iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -613,6 +916,37 @@ static struct page **get_pages_array(size_t n) return p; } +static ssize_t pipe_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + struct page **p; + size_t n; + int idx; + int npages; + + if (!sanity(i)) + return -EFAULT; + + data_start(i, &idx, start); + /* some of this one + all after this one */ + npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1; + n = npages * PAGE_SIZE - *start; + if (maxsize > n) + maxsize = n; + else + npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); + p = get_pages_array(npages); + if (!p) + return -ENOMEM; + n = __pipe_get_pages(i, maxsize, p, idx, start); + if (n > 0) + *pages = p; + else + kvfree(p); + return n; +} + ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) @@ -625,6 +959,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (!maxsize) return 0; + if (unlikely(i->type & ITER_PIPE)) + return pipe_get_pages_alloc(i, pages, maxsize, start); iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -666,6 +1002,10 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, ({ int err = 0; next = csum_and_copy_from_user(v.iov_base, @@ -704,6 +1044,10 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); /* for now */ + return 0; + } iterate_and_advance(i, bytes, v, ({ int err = 0; next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, @@ -743,7 +1087,20 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) if (!size) return 0; - iterate_all_kinds(i, size, v, ({ + if (unlikely(i->type & ITER_PIPE)) { + struct pipe_inode_info *pipe = i->pipe; + size_t off; + int idx; + + if (!sanity(i)) + return 0; + + data_start(i, &idx, &off); + /* some of this one + all after this one */ + npages = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1; + if (npages >= maxpages) + return maxpages; + } else iterate_all_kinds(i, size, v, ({ unsigned long p = (unsigned long)v.iov_base; npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) - p / PAGE_SIZE; @@ -768,6 +1125,10 @@ EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; + if (unlikely(new->type & ITER_PIPE)) { + WARN_ON(1); + return NULL; + } if (new->type & ITER_BVEC) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), @@ -780,6 +1141,28 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) } EXPORT_SYMBOL(dup_iter); +/** + * import_iovec() - Copy an array of &struct iovec from userspace + * into the kernel, check that it is valid, and initialize a new + * &struct iov_iter iterator to access it. + * + * @type: One of %READ or %WRITE. + * @uvector: Pointer to the userspace array. + * @nr_segs: Number of elements in userspace array. + * @fast_segs: Number of elements in @iov. + * @iov: (input and output parameter) Pointer to pointer to (usually small + * on-stack) kernel array. + * @i: Pointer to iterator that will be initialized on success. + * + * If the array pointed to by *@iov is large enough to hold all @nr_segs, + * then this function places %NULL in *@iov on return. Otherwise, a new + * array will be allocated and the result placed in *@iov. This means that + * the caller may call kfree() on *@iov regardless of whether the small + * on-stack array was used or not (and regardless of whether this function + * returns an error or not). + * + * Return: 0 on success or negative error code on error. + */ int import_iovec(int type, const struct iovec __user * uvector, unsigned nr_segs, unsigned fast_segs, struct iovec **iov, struct iov_iter *i) diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 836f7db4e548..1d6565e81030 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -74,7 +74,7 @@ void irq_poll_complete(struct irq_poll *iop) } EXPORT_SYMBOL(irq_poll_complete); -static void irq_poll_softirq(struct softirq_action *h) +static void __latent_entropy irq_poll_softirq(struct softirq_action *h) { struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = irq_poll_budget; @@ -184,30 +184,21 @@ void irq_poll_init(struct irq_poll *iop, int weight, irq_poll_fn *poll_fn) } EXPORT_SYMBOL(irq_poll_init); -static int irq_poll_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int irq_poll_cpu_dead(unsigned int cpu) { /* * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); - local_irq_enable(); - } + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), + this_cpu_ptr(&blk_cpu_iopoll)); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + local_irq_enable(); - return NOTIFY_OK; + return 0; } -static struct notifier_block irq_poll_cpu_notifier = { - .notifier_call = irq_poll_cpu_notify, -}; - static __init int irq_poll_setup(void) { int i; @@ -216,7 +207,8 @@ static __init int irq_poll_setup(void) INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq); - register_hotcpu_notifier(&irq_poll_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_IRQ_POLL_DEAD, "irq_poll:dead", NULL, + irq_poll_cpu_dead); return 0; } subsys_initcall(irq_poll_setup); diff --git a/lib/kstrtox.c b/lib/kstrtox.c index d8a5cf66c316..b8e2080c1a47 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -48,11 +48,9 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long { unsigned long long res; unsigned int rv; - int overflow; res = 0; rv = 0; - overflow = 0; while (*s) { unsigned int val; @@ -71,15 +69,13 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long */ if (unlikely(res & (~0ull << 60))) { if (res > div_u64(ULLONG_MAX - val, base)) - overflow = 1; + rv |= KSTRTOX_OVERFLOW; } res = res * base + val; rv++; s++; } *p = res; - if (overflow) - rv |= KSTRTOX_OVERFLOW; return rv; } diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index 747606f9e4a3..5a0f75a3bf01 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -21,6 +21,7 @@ #include <linux/bitops.h> #include <linux/count_zeros.h> #include <linux/byteorder/generic.h> +#include <linux/scatterlist.h> #include <linux/string.h> #include "mpi-internal.h" @@ -50,9 +51,7 @@ MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes) return NULL; } if (nbytes > 0) - nbits -= count_leading_zeros(buffer[0]); - else - nbits = 0; + nbits -= count_leading_zeros(buffer[0]) - (BITS_PER_LONG - 8); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); @@ -82,50 +81,30 @@ EXPORT_SYMBOL_GPL(mpi_read_raw_data); MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) { const uint8_t *buffer = xbuffer; - int i, j; - unsigned nbits, nbytes, nlimbs, nread = 0; - mpi_limb_t a; - MPI val = NULL; + unsigned int nbits, nbytes; + MPI val; if (*ret_nread < 2) - goto leave; + return ERR_PTR(-EINVAL); nbits = buffer[0] << 8 | buffer[1]; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); - goto leave; + return ERR_PTR(-EINVAL); } - buffer += 2; - nread = 2; nbytes = DIV_ROUND_UP(nbits, 8); - nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); - val = mpi_alloc(nlimbs); - if (!val) - return NULL; - i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; - i %= BYTES_PER_MPI_LIMB; - val->nbits = nbits; - j = val->nlimbs = nlimbs; - val->sign = 0; - for (; j > 0; j--) { - a = 0; - for (; i < BYTES_PER_MPI_LIMB; i++) { - if (++nread > *ret_nread) { - printk - ("MPI: mpi larger than buffer nread=%d ret_nread=%d\n", - nread, *ret_nread); - goto leave; - } - a <<= 8; - a |= *buffer++; - } - i = 0; - val->d[j - 1] = a; + if (nbytes + 2 > *ret_nread) { + pr_info("MPI: mpi larger than buffer nbytes=%u ret_nread=%u\n", + nbytes, *ret_nread); + return ERR_PTR(-EINVAL); } -leave: - *ret_nread = nread; + val = mpi_read_raw_data(buffer + 2, nbytes); + if (!val) + return ERR_PTR(-ENOMEM); + + *ret_nread = nbytes + 2; return val; } EXPORT_SYMBOL_GPL(mpi_read_from_buffer); @@ -250,82 +229,6 @@ void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign) } EXPORT_SYMBOL_GPL(mpi_get_buffer); -/**************** - * Use BUFFER to update MPI. - */ -int mpi_set_buffer(MPI a, const void *xbuffer, unsigned nbytes, int sign) -{ - const uint8_t *buffer = xbuffer, *p; - mpi_limb_t alimb; - int nlimbs; - int i; - - nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); - if (RESIZE_IF_NEEDED(a, nlimbs) < 0) - return -ENOMEM; - a->sign = sign; - - for (i = 0, p = buffer + nbytes - 1; p >= buffer + BYTES_PER_MPI_LIMB;) { -#if BYTES_PER_MPI_LIMB == 4 - alimb = (mpi_limb_t) *p--; - alimb |= (mpi_limb_t) *p-- << 8; - alimb |= (mpi_limb_t) *p-- << 16; - alimb |= (mpi_limb_t) *p-- << 24; -#elif BYTES_PER_MPI_LIMB == 8 - alimb = (mpi_limb_t) *p--; - alimb |= (mpi_limb_t) *p-- << 8; - alimb |= (mpi_limb_t) *p-- << 16; - alimb |= (mpi_limb_t) *p-- << 24; - alimb |= (mpi_limb_t) *p-- << 32; - alimb |= (mpi_limb_t) *p-- << 40; - alimb |= (mpi_limb_t) *p-- << 48; - alimb |= (mpi_limb_t) *p-- << 56; -#else -#error please implement for this limb size. -#endif - a->d[i++] = alimb; - } - if (p >= buffer) { -#if BYTES_PER_MPI_LIMB == 4 - alimb = *p--; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 8; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 16; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 24; -#elif BYTES_PER_MPI_LIMB == 8 - alimb = (mpi_limb_t) *p--; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 8; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 16; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 24; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 32; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 40; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 48; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 56; -#else -#error please implement for this limb size. -#endif - a->d[i++] = alimb; - } - a->nlimbs = i; - - if (i != nlimbs) { - pr_emerg("MPI: mpi_set_buffer: Assertion failed (%d != %d)", i, - nlimbs); - BUG(); - } - return 0; -} -EXPORT_SYMBOL_GPL(mpi_set_buffer); - /** * mpi_write_to_sgl() - Funnction exports MPI to an sgl (msb first) * @@ -335,16 +238,13 @@ EXPORT_SYMBOL_GPL(mpi_set_buffer); * @a: a multi precision integer * @sgl: scatterlist to write to. Needs to be at least * mpi_get_size(a) long. - * @nbytes: in/out param - it has the be set to the maximum number of - * bytes that can be written to sgl. This has to be at least - * the size of the integer a. On return it receives the actual - * length of the data written on success or the data that would - * be written if buffer was too small. + * @nbytes: the number of bytes to write. Leading bytes will be + * filled with zero. * @sign: if not NULL, it will be set to the sign of a. * * Return: 0 on success or error code in case of error */ -int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes, +int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned nbytes, int *sign) { u8 *p, *p2; @@ -356,55 +256,60 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes, #error please implement for this limb size. #endif unsigned int n = mpi_get_size(a); - int i, x, y = 0, lzeros, buf_len; - - if (!nbytes) - return -EINVAL; + struct sg_mapping_iter miter; + int i, x, buf_len; + int nents; if (sign) *sign = a->sign; - lzeros = count_lzeros(a); - - if (*nbytes < n - lzeros) { - *nbytes = n - lzeros; + if (nbytes < n) return -EOVERFLOW; - } - *nbytes = n - lzeros; - buf_len = sgl->length; - p2 = sg_virt(sgl); + nents = sg_nents_for_len(sgl, nbytes); + if (nents < 0) + return -EINVAL; - for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB, - lzeros %= BYTES_PER_MPI_LIMB; - i >= 0; i--) { + sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC | SG_MITER_TO_SG); + sg_miter_next(&miter); + buf_len = miter.length; + p2 = miter.addr; + + while (nbytes > n) { + i = min_t(unsigned, nbytes - n, buf_len); + memset(p2, 0, i); + p2 += i; + nbytes -= i; + + buf_len -= i; + if (!buf_len) { + sg_miter_next(&miter); + buf_len = miter.length; + p2 = miter.addr; + } + } + + for (i = a->nlimbs - 1; i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 - alimb = cpu_to_be32(a->d[i]); + alimb = a->d[i] ? cpu_to_be32(a->d[i]) : 0; #elif BYTES_PER_MPI_LIMB == 8 - alimb = cpu_to_be64(a->d[i]); + alimb = a->d[i] ? cpu_to_be64(a->d[i]) : 0; #else #error please implement for this limb size. #endif - if (lzeros) { - y = lzeros; - lzeros = 0; - } - - p = (u8 *)&alimb + y; + p = (u8 *)&alimb; - for (x = 0; x < sizeof(alimb) - y; x++) { - if (!buf_len) { - sgl = sg_next(sgl); - if (!sgl) - return -EINVAL; - buf_len = sgl->length; - p2 = sg_virt(sgl); - } + for (x = 0; x < sizeof(alimb); x++) { *p2++ = *p++; - buf_len--; + if (!--buf_len) { + sg_miter_next(&miter); + buf_len = miter.length; + p2 = miter.addr; + } } - y = 0; } + + sg_miter_stop(&miter); return 0; } EXPORT_SYMBOL_GPL(mpi_write_to_sgl); @@ -424,19 +329,23 @@ EXPORT_SYMBOL_GPL(mpi_write_to_sgl); */ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) { - struct scatterlist *sg; - int x, i, j, z, lzeros, ents; + struct sg_mapping_iter miter; unsigned int nbits, nlimbs; + int x, j, z, lzeros, ents; + unsigned int len; + const u8 *buff; mpi_limb_t a; MPI val = NULL; - lzeros = 0; - ents = sg_nents(sgl); + ents = sg_nents_for_len(sgl, nbytes); + if (ents < 0) + return NULL; - for_each_sg(sgl, sg, ents, i) { - const u8 *buff = sg_virt(sg); - int len = sg->length; + sg_miter_start(&miter, sgl, ents, SG_MITER_ATOMIC | SG_MITER_FROM_SG); + lzeros = 0; + len = 0; + while (nbytes > 0) { while (len && !*buff) { lzeros++; len--; @@ -446,12 +355,17 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) if (len && *buff) break; - ents--; + sg_miter_next(&miter); + buff = miter.addr; + len = miter.length; + nbytes -= lzeros; lzeros = 0; } - sgl = sg; + miter.consumed = lzeros; + sg_miter_stop(&miter); + nbytes -= lzeros; nbits = nbytes * 8; if (nbits > MAX_EXTERN_MPI_BITS) { @@ -460,8 +374,7 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) } if (nbytes > 0) - nbits -= count_leading_zeros(*(u8 *)(sg_virt(sgl) + lzeros)) - - (BITS_PER_LONG - 8); + nbits -= count_leading_zeros(*buff) - (BITS_PER_LONG - 8); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); @@ -480,21 +393,21 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; z %= BYTES_PER_MPI_LIMB; - for_each_sg(sgl, sg, ents, i) { - const u8 *buffer = sg_virt(sg) + lzeros; - int len = sg->length - lzeros; + while (sg_miter_next(&miter)) { + buff = miter.addr; + len = miter.length; for (x = 0; x < len; x++) { a <<= 8; - a |= *buffer++; + a |= *buff++; if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) { val->d[j--] = a; a = 0; } } z += x; - lzeros = 0; } + return val; } EXPORT_SYMBOL_GPL(mpi_read_raw_from_sgl); diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 26caf51cc238..75554754eadf 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,21 +16,23 @@ #include <linux/delay.h> #include <linux/kprobes.h> #include <linux/nmi.h> +#include <linux/cpu.h> -#ifdef arch_trigger_all_cpu_backtrace +#ifdef arch_trigger_cpumask_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +/* "in progress" flag of arch_trigger_cpumask_backtrace */ static unsigned long backtrace_flag; /* - * When raise() is called it will be is passed a pointer to the + * When raise() is called it will be passed a pointer to the * backtrace_mask. Architectures that call nmi_cpu_backtrace() * directly from their raise() functions may rely on the mask * they are passed being updated as a side effect of this call. */ -void nmi_trigger_all_cpu_backtrace(bool include_self, +void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, + bool exclude_self, void (*raise)(cpumask_t *mask)) { int i, this_cpu = get_cpu(); @@ -44,13 +46,22 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, return; } - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - if (!include_self) + cpumask_copy(to_cpumask(backtrace_mask), mask); + if (exclude_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); + /* + * Don't try to send an NMI to this cpu; it may work on some + * architectures, but on others it may not, and we'll get + * information at least as useful just by doing a dump_stack() here. + * Note that nmi_cpu_backtrace(NULL) will clear the cpu bit. + */ + if (cpumask_test_cpu(this_cpu, to_cpumask(backtrace_mask))) + nmi_cpu_backtrace(NULL); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { - pr_info("Sending NMI to %s CPUs:\n", - (include_self ? "all" : "other")); + pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n", + this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask)); raise(to_cpumask(backtrace_mask)); } @@ -77,11 +88,16 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - pr_warn("NMI backtrace for cpu %d\n", cpu); - if (regs) - show_regs(regs); - else - dump_stack(); + if (regs && cpu_in_idle(instruction_pointer(regs))) { + pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n", + cpu, instruction_pointer(regs)); + } else { + pr_warn("NMI backtrace for cpu %d\n", cpu); + if (regs) + show_regs(regs); + else + dump_stack(); + } cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 27fe74948882..9ac959ef4cae 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -33,6 +33,7 @@ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) +static DEFINE_SPINLOCK(percpu_ref_switch_lock); static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) @@ -82,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, atomic_long_set(&ref->count, start_count); ref->release = release; + ref->confirm_switch = NULL; return 0; } EXPORT_SYMBOL_GPL(percpu_ref_init); @@ -101,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref) unsigned long __percpu *percpu_count = percpu_count_ptr(ref); if (percpu_count) { + /* non-NULL confirm_switch indicates switching in progress */ + WARN_ON_ONCE(ref->confirm_switch); free_percpu(percpu_count); ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } @@ -161,66 +165,23 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { - if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { - /* switching from percpu to atomic */ - ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; - - /* - * Non-NULL ->confirm_switch is used to indicate that - * switching is in progress. Use noop one if unspecified. - */ - WARN_ON_ONCE(ref->confirm_switch); - ref->confirm_switch = - confirm_switch ?: percpu_ref_noop_confirm_switch; - - percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); - } else if (confirm_switch) { - /* - * Somebody already set ATOMIC. Switching may still be in - * progress. @confirm_switch must be invoked after the - * switching is complete and a full sched RCU grace period - * has passed. Wait synchronously for the previous - * switching and schedule @confirm_switch invocation. - */ - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - ref->confirm_switch = confirm_switch; - - percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); + if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { + if (confirm_switch) + confirm_switch(ref); + return; } -} -/** - * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode - * @ref: percpu_ref to switch to atomic mode - * @confirm_switch: optional confirmation callback - * - * There's no reason to use this function for the usual reference counting. - * Use percpu_ref_kill[_and_confirm](). - * - * Schedule switching of @ref to atomic mode. All its percpu counts will - * be collected to the main atomic counter. On completion, when all CPUs - * are guaraneed to be in atomic mode, @confirm_switch, which may not - * block, is invoked. This function may be invoked concurrently with all - * the get/put operations and can safely be mixed with kill and reinit - * operations. Note that @ref will stay in atomic mode across kill/reinit - * cycles until percpu_ref_switch_to_percpu() is called. - * - * This function normally doesn't block and can be called from any context - * but it may block if @confirm_kill is specified and @ref is already in - * the process of switching to atomic mode. In such cases, @confirm_switch - * will be invoked after the switching is complete. - * - * Due to the way percpu_ref is implemented, @confirm_switch will be called - * after at least one full sched RCU grace period has passed but this is an - * implementation detail and must not be depended upon. - */ -void percpu_ref_switch_to_atomic(struct percpu_ref *ref, - percpu_ref_func_t *confirm_switch) -{ - ref->force_atomic = true; - __percpu_ref_switch_to_atomic(ref, confirm_switch); + /* switching from percpu to atomic */ + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + + /* + * Non-NULL ->confirm_switch is used to indicate that switching is + * in progress. Use noop one if unspecified. + */ + ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; + + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) @@ -233,8 +194,6 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* @@ -250,6 +209,58 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } +static void __percpu_ref_switch_mode(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + lockdep_assert_held(&percpu_ref_switch_lock); + + /* + * If the previous ATOMIC switching hasn't finished yet, wait for + * its completion. If the caller ensures that ATOMIC switching + * isn't in progress, this function can be called from any context. + */ + wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch, + percpu_ref_switch_lock); + + if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD)) + __percpu_ref_switch_to_atomic(ref, confirm_switch); + else + __percpu_ref_switch_to_percpu(ref); +} + +/** + * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * @confirm_switch: optional confirmation callback + * + * There's no reason to use this function for the usual reference counting. + * Use percpu_ref_kill[_and_confirm](). + * + * Schedule switching of @ref to atomic mode. All its percpu counts will + * be collected to the main atomic counter. On completion, when all CPUs + * are guaraneed to be in atomic mode, @confirm_switch, which may not + * block, is invoked. This function may be invoked concurrently with all + * the get/put operations and can safely be mixed with kill and reinit + * operations. Note that @ref will stay in atomic mode across kill/reinit + * cycles until percpu_ref_switch_to_percpu() is called. + * + * This function may block if @ref is in the process of switching to atomic + * mode. If the caller ensures that @ref is not in the process of + * switching to atomic mode, this function can be called from any context. + */ +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + + ref->force_atomic = true; + __percpu_ref_switch_mode(ref, confirm_switch); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); +} + /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode * @ref: percpu_ref to switch to percpu mode @@ -264,17 +275,20 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) * dying or dead, the actual switching takes place on the following * percpu_ref_reinit(). * - * This function normally doesn't block and can be called from any context - * but it may block if @ref is in the process of switching to atomic mode - * by percpu_ref_switch_atomic(). + * This function may block if @ref is in the process of switching to atomic + * mode. If the caller ensures that @ref is not in the process of + * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + ref->force_atomic = false; + __percpu_ref_switch_mode(ref, NULL); - /* a dying or dead ref can't be switched to percpu mode w/o reinit */ - if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) - __percpu_ref_switch_to_percpu(ref); + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } /** @@ -290,21 +304,23 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the - * process of switching to atomic mode by percpu_ref_switch_atomic(). - * - * Due to the way percpu_ref is implemented, @confirm_switch will be called - * after at least one full sched RCU grace period has passed but this is an - * implementation detail and must not be depended upon. + * process of switching to atomic mode by percpu_ref_switch_to_atomic(). */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, "%s called more than once on %pf!", __func__, ref->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; - __percpu_ref_switch_to_atomic(ref, confirm_kill); + __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); @@ -321,11 +337,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); */ void percpu_ref_reinit(struct percpu_ref *ref) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); - if (!ref->force_atomic) - __percpu_ref_switch_to_percpu(ref); + __percpu_ref_switch_mode(ref, NULL); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8b7d8459bb9d..8e6d552c40dd 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -38,6 +38,9 @@ #include <linux/preempt.h> /* in_interrupt() */ +/* Number of nodes in fully populated tree of given height */ +static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly; + /* * Radix tree node cache. */ @@ -102,10 +105,10 @@ static unsigned int radix_tree_descend(struct radix_tree_node *parent, #ifdef CONFIG_RADIX_TREE_MULTIORDER if (radix_tree_is_internal_node(entry)) { - unsigned long siboff = get_slot_offset(parent, entry); - if (siboff < RADIX_TREE_MAP_SIZE) { - offset = siboff; - entry = rcu_dereference_raw(parent->slots[offset]); + if (is_sibling_entry(parent, entry)) { + void **sibentry = (void **) entry_to_node(entry); + offset = get_slot_offset(parent, sibentry); + entry = rcu_dereference_raw(*sibentry); } } #endif @@ -274,10 +277,11 @@ radix_tree_node_alloc(struct radix_tree_root *root) /* * Even if the caller has preloaded, try to allocate from the - * cache first for the new node to get accounted. + * cache first for the new node to get accounted to the memory + * cgroup. */ ret = kmem_cache_alloc(radix_tree_node_cachep, - gfp_mask | __GFP_ACCOUNT | __GFP_NOWARN); + gfp_mask | __GFP_NOWARN); if (ret) goto out; @@ -300,8 +304,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) kmemleak_update_trace(ret); goto out; } - ret = kmem_cache_alloc(radix_tree_node_cachep, - gfp_mask | __GFP_ACCOUNT); + ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: BUG_ON(radix_tree_is_internal_node(ret)); return ret; @@ -342,22 +345,28 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ -static int __radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask, int nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; int ret = -ENOMEM; + /* + * Nodes preloaded by one cgroup can be be used by another cgroup, so + * they should never be accounted to any particular memory cgroup. + */ + gfp_mask &= ~__GFP_ACCOUNT; + preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); - while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { + while (rtp->nr < nr) { preempt_enable(); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); - if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { + if (rtp->nr < nr) { node->private_data = rtp->nodes; rtp->nodes = node; rtp->nr++; @@ -383,7 +392,7 @@ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... */ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); - return __radix_tree_preload(gfp_mask); + return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); } EXPORT_SYMBOL(radix_tree_preload); @@ -395,7 +404,7 @@ EXPORT_SYMBOL(radix_tree_preload); int radix_tree_maybe_preload(gfp_t gfp_mask) { if (gfpflags_allow_blocking(gfp_mask)) - return __radix_tree_preload(gfp_mask); + return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ preempt_disable(); return 0; @@ -403,6 +412,51 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) EXPORT_SYMBOL(radix_tree_maybe_preload); /* + * The same as function above, but preload number of nodes required to insert + * (1 << order) continuous naturally-aligned elements. + */ +int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) +{ + unsigned long nr_subtrees; + int nr_nodes, subtree_height; + + /* Preloading doesn't help anything with this gfp mask, skip it */ + if (!gfpflags_allow_blocking(gfp_mask)) { + preempt_disable(); + return 0; + } + + /* + * Calculate number and height of fully populated subtrees it takes to + * store (1 << order) elements. + */ + nr_subtrees = 1 << order; + for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE; + subtree_height++) + nr_subtrees >>= RADIX_TREE_MAP_SHIFT; + + /* + * The worst case is zero height tree with a single item at index 0 and + * then inserting items starting at ULONG_MAX - (1 << order). + * + * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to + * 0-index item. + */ + nr_nodes = RADIX_TREE_MAX_PATH; + + /* Plus branch to fully populated subtrees. */ + nr_nodes += RADIX_TREE_MAX_PATH - subtree_height; + + /* Root node is shared. */ + nr_nodes--; + + /* Plus nodes required to build subtrees. */ + nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height]; + + return __radix_tree_preload(gfp_mask, nr_nodes); +} + +/* * The maximum index which can be stored in a radix tree */ static inline unsigned long shift_maxindex(unsigned int shift) @@ -1529,15 +1583,10 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); -struct radix_tree_node *radix_tree_replace_clear_tags( - struct radix_tree_root *root, - unsigned long index, void *entry) +void radix_tree_clear_tags(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(root, index, &node, &slot); - if (node) { unsigned int tag, offset = get_slot_offset(node, slot); for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) @@ -1546,9 +1595,6 @@ struct radix_tree_node *radix_tree_replace_clear_tags( /* Clear root node tags */ root->gfp_mask &= __GFP_BITS_MASK; } - - radix_tree_replace_slot(slot, entry); - return node; } /** @@ -1571,6 +1617,31 @@ radix_tree_node_ctor(void *arg) INIT_LIST_HEAD(&node->private_list); } +static __init unsigned long __maxindex(unsigned int height) +{ + unsigned int width = height * RADIX_TREE_MAP_SHIFT; + int shift = RADIX_TREE_INDEX_BITS - width; + + if (shift < 0) + return ~0UL; + if (shift >= BITS_PER_LONG) + return 0UL; + return ~0UL >> shift; +} + +static __init void radix_tree_init_maxnodes(void) +{ + unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1]; + unsigned int i, j; + + for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) + height_to_maxindex[i] = __maxindex(i); + for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) { + for (j = i; j > 0; j--) + height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1; + } +} + static int radix_tree_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1597,5 +1668,6 @@ void __init radix_tree_init(void) sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); + radix_tree_init_maxnodes(); hotcpu_notifier(radix_tree_callback, 0); } diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore index 0a7e494b2bcd..f01b1cb04f91 100644 --- a/lib/raid6/.gitignore +++ b/lib/raid6/.gitignore @@ -3,3 +3,4 @@ altivec*.c int*.c tables.c neon?.c +s390vx?.c diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 3b10a48fa040..3057011f5599 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -3,10 +3,11 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ int8.o int16.o int32.o -raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o +raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o raid6_pq-$(CONFIG_TILEGX) += tilegx8.o +raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o hostprogs-y += mktables @@ -116,6 +117,11 @@ $(obj)/tilegx8.c: UNROLL := 8 $(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) +targets += s390vx8.c +$(obj)/s390vx8.c: UNROLL := 8 +$(obj)/s390vx8.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + quiet_cmd_mktable = TABLE $@ cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 975c6e0434bd..7857049fd7d3 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -49,6 +49,10 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_avx2x1, &raid6_avx2x2, #endif +#ifdef CONFIG_AS_AVX512 + &raid6_avx512x1, + &raid6_avx512x2, +#endif #endif #if defined(__x86_64__) && !defined(__arch_um__) &raid6_sse2x1, @@ -59,6 +63,11 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_avx2x2, &raid6_avx2x4, #endif +#ifdef CONFIG_AS_AVX512 + &raid6_avx512x1, + &raid6_avx512x2, + &raid6_avx512x4, +#endif #endif #ifdef CONFIG_ALTIVEC &raid6_altivec1, @@ -69,6 +78,9 @@ const struct raid6_calls * const raid6_algos[] = { #if defined(CONFIG_TILEGX) &raid6_tilegx8, #endif +#if defined(CONFIG_S390) + &raid6_s390vx8, +#endif &raid6_intx1, &raid6_intx2, &raid6_intx4, @@ -89,12 +101,18 @@ void (*raid6_datap_recov)(int, size_t, int, void **); EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { +#ifdef CONFIG_AS_AVX512 + &raid6_recov_avx512, +#endif #ifdef CONFIG_AS_AVX2 &raid6_recov_avx2, #endif #ifdef CONFIG_AS_SSSE3 &raid6_recov_ssse3, #endif +#ifdef CONFIG_S390 + &raid6_recov_s390xc, +#endif &raid6_recov_intx1, NULL }; diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c new file mode 100644 index 000000000000..f524a7972006 --- /dev/null +++ b/lib/raid6/avx512.c @@ -0,0 +1,569 @@ +/* -*- linux-c -*- -------------------------------------------------------- + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + * + * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved + * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- + */ + +/* + * AVX512 implementation of RAID-6 syndrome functions + * + */ + +#ifdef CONFIG_AS_AVX512 + +#include <linux/raid/pq.h> +#include "x86.h" + +static const struct raid6_avx512_constants { + u64 x1d[8]; +} raid6_avx512_constants __aligned(512) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "prefetchnta %1\n\t" + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %1,%%zmm6" + : + : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm6" + : + : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm4,%1\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4" + : + : "m" (p[d]), "m" (q[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 64) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm2\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2" + : + : "m" (dptr[z0][d]), "m" (p[d])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : "m" (dptr[z][d])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + /* Don't use movntdq for r/w memory area < cache line */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm2,%1" + : + : "m" (q[d]), "m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x1 = { + raid6_avx5121_gen_syndrome, + raid6_avx5121_xor_syndrome, + raid6_have_avx512, + "avx512x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 AVX512 implementation + */ +static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + /* We uniformly assume a single prefetch covers at least 64 bytes */ + for (d = 0; d < bytes; d += 128) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm4,%2\n\t" + "vmovntdq %%zmm6,%3" + : + : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), + "m" (q[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 128) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm2\n\t" + "vmovdqa64 %3,%%zmm3\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (p[d]), "m" (p[d+64])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + "vpxorq %1,%%zmm6,%%zmm6\n\t" + /* Don't use movntdq for r/w + * memory area < cache line + */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm6,%1\n\t" + "vmovdqa64 %%zmm2,%2\n\t" + "vmovdqa64 %%zmm3,%3" + : + : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), + "m" (p[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x2 = { + raid6_avx5122_gen_syndrome, + raid6_avx5122_xor_syndrome, + raid6_have_avx512, + "avx512x2", + 1 /* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ + "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 256) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "prefetchnta %2\n\t" + "prefetchnta %3\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" + "vmovntdq %%zmm14,%7\n\t" + "vpxorq %%zmm14,%%zmm14,%%zmm14" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + :: "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 256) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm12\n\t" + "vmovdqa64 %3,%%zmm14\n\t" + "vmovdqa64 %4,%%zmm2\n\t" + "vmovdqa64 %5,%%zmm3\n\t" + "vmovdqa64 %6,%%zmm10\n\t" + "vmovdqa64 %7,%%zmm11\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm14,%%zmm11,%%zmm11" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), + "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "prefetchnta %0\n\t" + "prefetchnta %2\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), + "m" (dptr[z][d+192])); + } + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + : + : "m" (q[d]), "m" (q[d+128])); + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : ); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %4,%%zmm4,%%zmm4\n\t" + "vpxorq %5,%%zmm6,%%zmm6\n\t" + "vpxorq %6,%%zmm12,%%zmm12\n\t" + "vpxorq %7,%%zmm14,%%zmm14\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vmovntdq %%zmm14,%7" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} +const struct raid6_calls raid6_avx512x4 = { + raid6_avx5124_gen_syndrome, + raid6_avx5124_xor_syndrome, + raid6_have_avx512, + "avx512x4", + 1 /* Has cache hints */ +}; +#endif + +#endif /* CONFIG_AS_AVX512 */ diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c new file mode 100644 index 000000000000..625aafa33b61 --- /dev/null +++ b/lib/raid6/recov_avx512.c @@ -0,0 +1,388 @@ +/* + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ + +#ifdef CONFIG_AS_AVX512 + +#include <linux/raid/pq.h> +#include "x86.h" + +static int raid6_has_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* zmm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm9\n\t" + "vmovdqa64 %2, %%zmm0\n\t" + "vmovdqa64 %3, %%zmm8\n\t" + "vpxorq %4, %%zmm1, %%zmm1\n\t" + "vpxorq %5, %%zmm9, %%zmm9\n\t" + "vpxorq %6, %%zmm0, %%zmm0\n\t" + "vpxorq %7, %%zmm8, %%zmm8" + : + : "m" (q[0]), "m" (q[64]), "m" (p[0]), + "m" (p[64]), "m" (dq[0]), "m" (dq[64]), + "m" (dp[0]), "m" (dp[64])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[64] ^ q[64] + * 0 = dp[0] ^ p[0] + * 8 = dp[64] ^ p[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpsraw $4, %%zmm9, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm9, %%zmm9\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* + * 5 = qx[0] + * 15 = qx[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpsraw $4, %%zmm8, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm14\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm12, %%zmm13, %%zmm13" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[64]] + */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm15, %%zmm13, %%zmm13" + : + : ); + + /* + * 1 = db = DQ + * 13 = db[64] = DQ[64] + */ + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm13,%1\n\t" + "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vpxorq %%zmm13, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64])); + + asm volatile("vmovdqa64 %%zmm0, %0\n\t" + "vmovdqa64 %%zmm8, %1" + : + : "m" (dp[0]), "m" (dp[64])); + + bytes -= 128; + p += 128; + q += 128; + dp += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm0\n\t" + "vpxorq %2, %%zmm1, %%zmm1\n\t" + "vpxorq %3, %%zmm0, %%zmm0" + : + : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* 5 = qx */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1" + : + : ); + + /* 1 = pbmul[px] */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + /* 1 = db = DQ */ + "vmovdqa64 %%zmm1, %0\n\t" + : + : "m" (dq[0])); + + asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovdqa64 %%zmm0, %0" + : + : "m" (dp[0])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vmovdqa64 %1, %%zmm8\n\t" + "vpxorq %2, %%zmm3, %%zmm3\n\t" + "vpxorq %3, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64]), "m" (q[0]), + "m" (q[64])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[64] ^ dq[64] + */ + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vmovapd %%zmm0, %%zmm13\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vmovapd %%zmm1, %%zmm14" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpsraw $4, %%zmm8, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm8\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm13, %%zmm14, %%zmm14" + : + : ); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[64] ^ dq[64]] + */ + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vmovdqa64 %1, %%zmm12\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t" + "vpxorq %%zmm14, %%zmm12, %%zmm12" + : + : "m" (p[0]), "m" (p[64])); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[64] ^ qmul[q[64] ^ dq[64]] + */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm14, %1\n\t" + "vmovdqa64 %%zmm2, %2\n\t" + "vmovdqa64 %%zmm12,%3" + : + : "m" (dq[0]), "m" (dq[64]), "m" (p[0]), + "m" (p[64])); + + bytes -= 128; + p += 128; + q += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vpxorq %1, %%zmm3, %%zmm3" + : + : "m" (dq[0]), "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1" + : + : ); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2" + : + : "m" (p[0])); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm2, %1" + : + : "m" (dq[0]), "m" (p[0])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx512 = { + .data2 = raid6_2data_recov_avx512, + .datap = raid6_datap_recov_avx512, + .valid = raid6_has_avx512, +#ifdef CONFIG_X86_64 + .name = "avx512x2", +#else + .name = "avx512x1", +#endif + .priority = 3, +}; + +#else +#warning "your version of binutils lacks AVX512 support" +#endif diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c new file mode 100644 index 000000000000..b042dac826cc --- /dev/null +++ b/lib/raid6/recov_s390xc.c @@ -0,0 +1,116 @@ +/* + * RAID-6 data recovery in dual failure mode based on the XC instruction. + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/export.h> +#include <linux/raid/pq.h> + +static inline void xor_block(u8 *p1, u8 *p2) +{ + typedef struct { u8 _[256]; } addrtype; + + asm volatile( + " xc 0(256,%[p1]),0(%[p2])\n" + : "+m" (*(addrtype *) p1) : "m" (*(addrtype *) p2), + [p1] "a" (p1), [p2] "a" (p2) : "cc"); +} + +/* Recover two failed data blocks. */ +static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + int i; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... */ + while (bytes) { + xor_block(dp, p); + xor_block(dq, q); + for (i = 0; i < 256; i++) + dq[i] = pbmul[dp[i]] ^ qmul[dq[i]]; + xor_block(dp, dq); + p += 256; + q += 256; + dp += 256; + dq += 256; + bytes -= 256; + } +} + +/* Recover failure of one data block plus the P block */ +static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + int i; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while (bytes) { + xor_block(dq, q); + for (i = 0; i < 256; i++) + dq[i] = qmul[dq[i]]; + xor_block(p, dq); + p += 256; + q += 256; + dq += 256; + bytes -= 256; + } +} + + +const struct raid6_recov_calls raid6_recov_s390xc = { + .data2 = raid6_2data_recov_s390xc, + .datap = raid6_datap_recov_s390xc, + .valid = NULL, + .name = "s390xc", + .priority = 1, +}; diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc new file mode 100644 index 000000000000..7b45191a655f --- /dev/null +++ b/lib/raid6/s390vx.uc @@ -0,0 +1,168 @@ +/* + * raid6_vx$#.c + * + * $#-way unrolled RAID6 gen/xor functions for s390 + * based on the vector facility + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * + * This file is postprocessed using unroll.awk. + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +asm(".include \"asm/vx-insn.h\"\n"); + +#define NSIZE 16 + +static inline void LOAD_CONST(void) +{ + asm volatile("VREPIB %v24,7"); + asm volatile("VREPIB %v25,0x1d"); +} + +/* + * The SHLBYTE() operation shifts each of the 16 bytes in + * vector register y left by 1 bit and stores the result in + * vector register x. + */ +static inline void SHLBYTE(int x, int y) +{ + asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y)); +} + +/* + * For each of the 16 bytes in the vector register y the MASK() + * operation returns 0xFF if the high bit of the byte is 1, + * or 0x00 if the high bit is 0. The result is stored in vector + * register x. + */ +static inline void MASK(int x, int y) +{ + asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y)); +} + +static inline void AND(int x, int y, int z) +{ + asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z)); +} + +static inline void XOR(int x, int y, int z) +{ + asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z)); +} + +static inline void LOAD_DATA(int x, int n, u8 *ptr) +{ + typedef struct { u8 _[16*n]; } addrtype; + register addrtype *__ptr asm("1") = (addrtype *) ptr; + + asm volatile ("VLM %2,%3,0,%r1" + : : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1)); +} + +static inline void STORE_DATA(int x, int n, u8 *ptr) +{ + typedef struct { u8 _[16*n]; } addrtype; + register addrtype *__ptr asm("1") = (addrtype *) ptr; + + asm volatile ("VSTM %2,%3,0,1" + : "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1)); +} + +static inline void COPY_VEC(int x, int y) +{ + asm volatile ("VLR %0,%1" : : "i" (x), "i" (y)); +} + +static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + struct kernel_fpu vxstate; + u8 **dptr, *p, *q; + int d, z, z0; + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + LOAD_CONST(); + + dptr = (u8 **) ptrs; + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + for (d = 0; d < bytes; d += $#*NSIZE) { + LOAD_DATA(0,$#,&dptr[z0][d]); + COPY_VEC(8+$$,0+$$); + for (z = z0 - 1; z >= 0; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + LOAD_DATA(16,$#,&dptr[z][d]); + XOR(0+$$,0+$$,16+$$); + XOR(8+$$,8+$$,16+$$); + } + STORE_DATA(0,$#,&p[d]); + STORE_DATA(8,$#,&q[d]); + } + kernel_fpu_end(&vxstate, KERNEL_VXR); +} + +static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + struct kernel_fpu vxstate; + u8 **dptr, *p, *q; + int d, z, z0; + + dptr = (u8 **) ptrs; + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + LOAD_CONST(); + + for (d = 0; d < bytes; d += $#*NSIZE) { + /* P/Q data pages */ + LOAD_DATA(0,$#,&dptr[z0][d]); + COPY_VEC(8+$$,0+$$); + for (z = z0 - 1; z >= start; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + LOAD_DATA(16,$#,&dptr[z][d]); + XOR(0+$$,0+$$,16+$$); + XOR(8+$$,8+$$,16+$$); + } + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + } + LOAD_DATA(16,$#,&p[d]); + XOR(16+$$,16+$$,0+$$); + STORE_DATA(16,$#,&p[d]); + LOAD_DATA(16,$#,&q[d]); + XOR(16+$$,16+$$,8+$$); + STORE_DATA(16,$#,&q[d]); + } + kernel_fpu_end(&vxstate, KERNEL_VXR); +} + +static int raid6_s390vx$#_valid(void) +{ + return MACHINE_HAS_VX; +} + +const struct raid6_calls raid6_s390vx$# = { + raid6_s390vx$#_gen_syndrome, + raid6_s390vx$#_xor_syndrome, + raid6_s390vx$#_valid, + "vx128x$#", + 1 +}; diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 29090f3db677..2c7b60edea04 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -32,10 +32,13 @@ ifeq ($(ARCH),arm64) endif ifeq ($(IS_X86),yes) - OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o + OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ gcc -c -x assembler - >&/dev/null && \ rm ./-.o && echo -DCONFIG_AS_AVX2=1) + CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ + gcc -c -x assembler - >&/dev/null && \ + rm ./-.o && echo -DCONFIG_AS_AVX512=1) else ifeq ($(HAS_NEON),yes) OBJS += neon.o neon1.o neon2.o neon4.o neon8.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 3bebbabdb510..b07f4d8e6b03 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c @@ -21,12 +21,13 @@ #define NDISKS 16 /* Including P and Q */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); struct raid6_calls raid6_call; char *dataptrs[NDISKS]; -char data[NDISKS][PAGE_SIZE]; -char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; +char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); static void makedata(int start, int stop) { diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h index 8fe9d9662abb..834d268a4b05 100644 --- a/lib/raid6/x86.h +++ b/lib/raid6/x86.h @@ -46,6 +46,16 @@ static inline void kernel_fpu_end(void) #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 DQ (Double/Quad granular) + * Instructions + */ +#define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 BW (Byte/Word granular) + * Instructions + */ +#define X86_FEATURE_AVX512VL (9*32+31) /* AVX-512 VL (128/256 Vector Length) + * Extensions + */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ /* Should work well enough on modern CPUs for testing */ diff --git a/lib/random32.c b/lib/random32.c index 510d1ce7d4d2..fa594b1140e6 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -47,7 +47,7 @@ static inline void prandom_state_selftest(void) } #endif -static DEFINE_PER_CPU(struct rnd_state, net_rand_state); +static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; /** * prandom_u32_state - seeded pseudo-random number generator. @@ -81,7 +81,7 @@ u32 prandom_u32(void) u32 res; res = prandom_u32_state(state); - put_cpu_var(state); + put_cpu_var(net_rand_state); return res; } @@ -128,7 +128,7 @@ void prandom_bytes(void *buf, size_t bytes) struct rnd_state *state = &get_cpu_var(net_rand_state); prandom_bytes_state(state, buf, bytes); - put_cpu_var(state); + put_cpu_var(net_rand_state); } EXPORT_SYMBOL(prandom_bytes); @@ -233,7 +233,6 @@ static void __prandom_timer(unsigned long dontcare) static void __init __prandom_start_seed_timer(void) { - set_timer_slack(&seed_timer, HZ); seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC); add_timer(&seed_timer); } diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 2c5de86460c5..08f8043cac61 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -46,12 +46,14 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) rs->begin = jiffies; if (time_is_before_jiffies(rs->begin + rs->interval)) { - if (rs->missed) - printk(KERN_WARNING "%s: %d callbacks suppressed\n", - func, rs->missed); + if (rs->missed) { + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { + pr_warn("%s: %d callbacks suppressed\n", func, rs->missed); + rs->missed = 0; + } + } rs->begin = jiffies; rs->printed = 0; - rs->missed = 0; } if (rs->burst && rs->burst > rs->printed) { rs->printed++; diff --git a/lib/rbtree.c b/lib/rbtree.c index 1356454e36de..eb8a19fee110 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -539,17 +539,39 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, { struct rb_node *parent = rb_parent(victim); + /* Copy the pointers/colour from the victim to the replacement */ + *new = *victim; + /* Set the surrounding nodes to point to the replacement */ - __rb_change_child(victim, new, parent, root); if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) rb_set_parent(victim->rb_right, new); + __rb_change_child(victim, new, parent, root); +} +EXPORT_SYMBOL(rb_replace_node); + +void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, + struct rb_root *root) +{ + struct rb_node *parent = rb_parent(victim); /* Copy the pointers/colour from the victim to the replacement */ *new = *victim; + + /* Set the surrounding nodes to point to the replacement */ + if (victim->rb_left) + rb_set_parent(victim->rb_left, new); + if (victim->rb_right) + rb_set_parent(victim->rb_right, new); + + /* Set the parent's pointer to the new node last after an RCU barrier + * so that the pointers onwards are seen to be set correctly when doing + * an RCU walk over the tree. + */ + __rb_change_child_rcu(victim, new, parent, root); } -EXPORT_SYMBOL(rb_replace_node); +EXPORT_SYMBOL(rb_replace_node_rcu); static struct rb_node *rb_left_deepest_node(const struct rb_node *node) { diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 5d845ffd7982..32d0ad058380 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -30,7 +30,7 @@ #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U -#define BUCKET_LOCKS_PER_CPU 128UL +#define BUCKET_LOCKS_PER_CPU 32UL static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, @@ -70,21 +70,25 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, unsigned int nr_pcpus = num_possible_cpus(); #endif - nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); + nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL); size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); /* Never allocate more than 0.5 locks per bucket */ size = min_t(unsigned int, size, tbl->size >> 1); if (sizeof(spinlock_t) != 0) { + tbl->locks = NULL; #ifdef CONFIG_NUMA if (size * sizeof(spinlock_t) > PAGE_SIZE && gfp == GFP_KERNEL) tbl->locks = vmalloc(size * sizeof(spinlock_t)); - else #endif - tbl->locks = kmalloc_array(size, sizeof(spinlock_t), - gfp); + if (gfp != GFP_KERNEL) + gfp |= __GFP_NOWARN | __GFP_NORETRY; + + if (!tbl->locks) + tbl->locks = kmalloc_array(size, sizeof(spinlock_t), + gfp); if (!tbl->locks) return -ENOMEM; for (i = 0; i < size; i++) @@ -321,12 +325,14 @@ static int rhashtable_expand(struct rhashtable *ht) static int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); - unsigned int size; + unsigned int nelems = atomic_read(&ht->nelems); + unsigned int size = 0; int err; ASSERT_RHT_MUTEX(ht); - size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2); + if (nelems) + size = roundup_pow_of_two(nelems * 3 / 2); if (size < ht->p.min_size) size = ht->p.min_size; @@ -372,22 +378,8 @@ static void rht_deferred_worker(struct work_struct *work) schedule_work(&ht->run_work); } -static bool rhashtable_check_elasticity(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - unsigned int elasticity = ht->elasticity; - struct rhash_head *head; - - rht_for_each(head, tbl, hash) - if (!--elasticity) - return true; - - return false; -} - -int rhashtable_insert_rehash(struct rhashtable *ht, - struct bucket_table *tbl) +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; @@ -433,61 +425,172 @@ fail: return err; } -EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *tbl) +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) { + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev; struct rhash_head *head; - unsigned int hash; - int err; + int elasticity; - tbl = rhashtable_last_table(ht, tbl); - hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); + elasticity = ht->elasticity; + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; - err = -EEXIST; - if (key && rhashtable_lookup_fast(ht, key, ht->p)) - goto exit; + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) + continue; - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto exit; + if (!ht->rhlist) + return rht_obj(ht, head); + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); - err = -EAGAIN; - if (rhashtable_check_elasticity(ht, tbl, hash) || - rht_grow_above_100(ht, tbl)) - goto exit; + return NULL; + } - err = 0; + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash, + struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rcu_dereference(tbl->future_tbl); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); + + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); -exit: - spin_unlock(rht_bucket_lock(tbl, hash)); + return NULL; +} - if (err == 0) - return NULL; - else if (err == -EAGAIN) - return tbl; - else - return ERR_PTR(err); +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; + struct bucket_table *tbl; + unsigned int hash; + spinlock_t *lock; + void *data; + + tbl = rcu_dereference(ht->tbl); + + /* All insertions must grab the oldest table containing + * the hashed bucket that is yet to be rehashed. + */ + for (;;) { + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); + + if (tbl->rehash <= hash) + break; + + spin_unlock_bh(lock); + tbl = rcu_dereference(tbl->future_tbl); + } + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + while (!IS_ERR_OR_NULL(new_tbl)) { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + spin_lock_nested(rht_bucket_lock(tbl, hash), + SINGLE_DEPTH_NESTING); + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + spin_unlock(rht_bucket_lock(tbl, hash)); + } + + spin_unlock_bh(lock); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** - * rhashtable_walk_init - Initialise an iterator + * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator - * @gfp: GFP flags for allocations * * This function prepares a hash table walk. * @@ -502,30 +605,22 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow); * This function may sleep so you must not call it from interrupt * context or with spin locks held. * - * You must call rhashtable_walk_exit if this function returns - * successfully. + * You must call rhashtable_walk_exit after this function returns. */ -int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter, - gfp_t gfp) +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; - iter->walker = kmalloc(sizeof(*iter->walker), gfp); - if (!iter->walker) - return -ENOMEM; - spin_lock(&ht->lock); - iter->walker->tbl = + iter->walker.tbl = rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); - list_add(&iter->walker->list, &iter->walker->tbl->walkers); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); spin_unlock(&ht->lock); - - return 0; } -EXPORT_SYMBOL_GPL(rhashtable_walk_init); +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); /** * rhashtable_walk_exit - Free an iterator @@ -536,10 +631,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init); void rhashtable_walk_exit(struct rhashtable_iter *iter) { spin_lock(&iter->ht->lock); - if (iter->walker->tbl) - list_del(&iter->walker->list); + if (iter->walker.tbl) + list_del(&iter->walker.list); spin_unlock(&iter->ht->lock); - kfree(iter->walker); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); @@ -565,12 +659,12 @@ int rhashtable_walk_start(struct rhashtable_iter *iter) rcu_read_lock(); spin_lock(&ht->lock); - if (iter->walker->tbl) - list_del(&iter->walker->list); + if (iter->walker.tbl) + list_del(&iter->walker.list); spin_unlock(&ht->lock); - if (!iter->walker->tbl) { - iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht); + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); return -EAGAIN; } @@ -592,12 +686,17 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { - struct bucket_table *tbl = iter->walker->tbl; + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; if (p) { - p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } goto next; } @@ -605,6 +704,18 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter) int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } if (!skip) break; skip--; @@ -614,7 +725,8 @@ next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; - return rht_obj(ht, p); + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; @@ -625,8 +737,8 @@ next: /* Ensure we see any new tables. */ smp_rmb(); - iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (iter->walker->tbl) { + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); @@ -646,7 +758,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) { struct rhashtable *ht; - struct bucket_table *tbl = iter->walker->tbl; + struct bucket_table *tbl = iter->walker.tbl; if (!tbl) goto out; @@ -655,9 +767,9 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) spin_lock(&ht->lock); if (tbl->rehash < tbl->size) - list_add(&iter->walker->list, &tbl->walkers); + list_add(&iter->walker.list, &tbl->walkers); else - iter->walker->tbl = NULL; + iter->walker.tbl = NULL; spin_unlock(&ht->lock); iter->p = NULL; @@ -803,6 +915,48 @@ int rhashtable_init(struct rhashtable *ht, EXPORT_SYMBOL_GPL(rhashtable_init); /** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) +{ + int err; + + /* No rhlist NULLs marking for now. */ + if (params->nulls_base) + return -EINVAL; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + +/** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy * @free_fn: callback to release resources of element @@ -839,7 +993,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, pos = next, next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL) - free_fn(rht_obj(ht, pos), arg); + rhashtable_free_one(ht, pos, free_fn, arg); } } diff --git a/lib/sbitmap.c b/lib/sbitmap.c new file mode 100644 index 000000000000..2cecf05c82fd --- /dev/null +++ b/lib/sbitmap.c @@ -0,0 +1,347 @@ +/* + * Copyright (C) 2016 Facebook + * Copyright (C) 2013-2014 Jens Axboe + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#include <linux/random.h> +#include <linux/sbitmap.h> + +int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, + gfp_t flags, int node) +{ + unsigned int bits_per_word; + unsigned int i; + + if (shift < 0) { + shift = ilog2(BITS_PER_LONG); + /* + * If the bitmap is small, shrink the number of bits per word so + * we spread over a few cachelines, at least. If less than 4 + * bits, just forget about it, it's not going to work optimally + * anyway. + */ + if (depth >= 4) { + while ((4U << shift) > depth) + shift--; + } + } + bits_per_word = 1U << shift; + if (bits_per_word > BITS_PER_LONG) + return -EINVAL; + + sb->shift = shift; + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + if (depth == 0) { + sb->map = NULL; + return 0; + } + + sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); + if (!sb->map) + return -ENOMEM; + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_init_node); + +void sbitmap_resize(struct sbitmap *sb, unsigned int depth) +{ + unsigned int bits_per_word = 1U << sb->shift; + unsigned int i; + + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } +} +EXPORT_SYMBOL_GPL(sbitmap_resize); + +static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, + bool wrap) +{ + unsigned int orig_hint = hint; + int nr; + + while (1) { + nr = find_next_zero_bit(&word->word, word->depth, hint); + if (unlikely(nr >= word->depth)) { + /* + * We started with an offset, and we didn't reset the + * offset to 0 in a failure case, so start from 0 to + * exhaust the map. + */ + if (orig_hint && hint && wrap) { + hint = orig_hint = 0; + continue; + } + return -1; + } + + if (!test_and_set_bit(nr, &word->word)) + break; + + hint = nr + 1; + if (hint >= word->depth - 1) + hint = 0; + } + + return nr; +} + +int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) +{ + unsigned int i, index; + int nr = -1; + + index = SB_NR_TO_INDEX(sb, alloc_hint); + + for (i = 0; i < sb->map_nr; i++) { + nr = __sbitmap_get_word(&sb->map[index], + SB_NR_TO_BIT(sb, alloc_hint), + !round_robin); + if (nr != -1) { + nr += index << sb->shift; + break; + } + + /* Jump to next index. */ + index++; + alloc_hint = index << sb->shift; + + if (index >= sb->map_nr) { + index = 0; + alloc_hint = 0; + } + } + + return nr; +} +EXPORT_SYMBOL_GPL(sbitmap_get); + +bool sbitmap_any_bit_set(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + if (sb->map[i].word) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); + +bool sbitmap_any_bit_clear(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + unsigned long ret; + + ret = find_first_zero_bit(&word->word, word->depth); + if (ret < word->depth) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); + +unsigned int sbitmap_weight(const struct sbitmap *sb) +{ + unsigned int i, weight = 0; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + + weight += bitmap_weight(&word->word, word->depth); + } + return weight; +} +EXPORT_SYMBOL_GPL(sbitmap_weight); + +static unsigned int sbq_calc_wake_batch(unsigned int depth) +{ + unsigned int wake_batch; + + /* + * For each batch, we wake up one queue. We need to make sure that our + * batch size is small enough that the full depth of the bitmap is + * enough to wake up all of the queues. + */ + wake_batch = SBQ_WAKE_BATCH; + if (wake_batch > depth / SBQ_WAIT_QUEUES) + wake_batch = max(1U, depth / SBQ_WAIT_QUEUES); + + return wake_batch; +} + +int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, + int shift, bool round_robin, gfp_t flags, int node) +{ + int ret; + int i; + + ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node); + if (ret) + return ret; + + sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags); + if (!sbq->alloc_hint) { + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + + if (depth && !round_robin) { + for_each_possible_cpu(i) + *per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth; + } + + sbq->wake_batch = sbq_calc_wake_batch(depth); + atomic_set(&sbq->wake_index, 0); + + sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); + if (!sbq->ws) { + free_percpu(sbq->alloc_hint); + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + init_waitqueue_head(&sbq->ws[i].wait); + atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); + } + + sbq->round_robin = round_robin; + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); + +void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) +{ + sbq->wake_batch = sbq_calc_wake_batch(depth); + sbitmap_resize(&sbq->sb, depth); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_resize); + +int __sbitmap_queue_get(struct sbitmap_queue *sbq) +{ + unsigned int hint, depth; + int nr; + + hint = this_cpu_read(*sbq->alloc_hint); + depth = READ_ONCE(sbq->sb.depth); + if (unlikely(hint >= depth)) { + hint = depth ? prandom_u32() % depth : 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin); + + if (nr == -1) { + /* If the map is full, a hint won't do us much good. */ + this_cpu_write(*sbq->alloc_hint, 0); + } else if (nr == hint || unlikely(sbq->round_robin)) { + /* Only update the hint if we used it. */ + hint = nr + 1; + if (hint >= depth - 1) + hint = 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + + return nr; +} +EXPORT_SYMBOL_GPL(__sbitmap_queue_get); + +static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) { + int o = atomic_read(&sbq->wake_index); + + if (wake_index != o) + atomic_cmpxchg(&sbq->wake_index, o, wake_index); + return ws; + } + + wake_index = sbq_index_inc(wake_index); + } + + return NULL; +} + +static void sbq_wake_up(struct sbitmap_queue *sbq) +{ + struct sbq_wait_state *ws; + int wait_cnt; + + /* Ensure that the wait list checks occur after clear_bit(). */ + smp_mb(); + + ws = sbq_wake_ptr(sbq); + if (!ws) + return; + + wait_cnt = atomic_dec_return(&ws->wait_cnt); + if (unlikely(wait_cnt < 0)) + wait_cnt = atomic_inc_return(&ws->wait_cnt); + if (wait_cnt == 0) { + atomic_add(sbq->wake_batch, &ws->wait_cnt); + sbq_index_atomic_inc(&sbq->wake_index); + wake_up(&ws->wait); + } +} + +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, + unsigned int cpu) +{ + sbitmap_clear_bit(&sbq->sb, nr); + sbq_wake_up(sbq); + if (likely(!sbq->round_robin && nr < sbq->sb.depth)) + *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; +} +EXPORT_SYMBOL_GPL(sbitmap_queue_clear); + +void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + /* + * Make sure all changes prior to this are visible from other CPUs. + */ + smp_mb(); + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) + wake_up(&ws->wait); + + wake_index = sbq_index_inc(wake_index); + } +} +EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 53ad6c0831ae..f87d138e9672 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -50,7 +50,7 @@ STACK_ALLOC_ALIGN) #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS) -#define STACK_ALLOC_SLABS_CAP 1024 +#define STACK_ALLOC_SLABS_CAP 8192 #define STACK_ALLOC_MAX_SLABS \ (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) @@ -192,6 +192,7 @@ void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace) trace->entries = stack->entries; trace->skip = 0; } +EXPORT_SYMBOL_GPL(depot_fetch_stack); /** * depot_save_stack - save stack in a stack depot. @@ -242,6 +243,7 @@ depot_stack_handle_t depot_save_stack(struct stack_trace *trace, */ alloc_flags &= ~GFP_ZONEMASK; alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); + alloc_flags |= __GFP_NOWARN; page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); if (page) prealloc = page_address(page); @@ -282,3 +284,4 @@ exit: fast_exit: return retval; } +EXPORT_SYMBOL_GPL(depot_save_stack); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 33f655ef48cd..7e35fc450c5b 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,6 +1,7 @@ #include <linux/compiler.h> #include <linux/export.h> #include <linux/kasan-checks.h> +#include <linux/thread_info.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -40,8 +41,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - break; + unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); + *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); @@ -56,8 +57,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(unsafe_get_user(c,src+res))) - return -EFAULT; + unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; @@ -76,6 +76,7 @@ byte_at_a_time: * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ +efault: return -EFAULT; } @@ -111,6 +112,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) long retval; kasan_check_write(dst, count); + check_object_size(dst, count, false); user_access_begin(); retval = do_strncpy_from_user(dst, src, count, max); user_access_end(); diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 2625943625d7..8e105ed4df12 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) - return 0; + unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { @@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - return 0; + unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; @@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ +efault: return 0; } diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 76f29ecba8f4..22e13a0e19d7 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -738,7 +738,7 @@ swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { phys_addr_t map, phys = page_to_phys(page) + offset; dma_addr_t dev_addr = phys_to_dma(dev, phys); @@ -807,7 +807,7 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { unmap_single(hwdev, dev_addr, size, dir); } @@ -877,7 +877,7 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_device); */ int swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; @@ -914,7 +914,7 @@ int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir) { - return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); + return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, 0); } EXPORT_SYMBOL(swiotlb_map_sg); @@ -924,7 +924,8 @@ EXPORT_SYMBOL(swiotlb_map_sg); */ void swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) + int nelems, enum dma_data_direction dir, + unsigned long attrs) { struct scatterlist *sg; int i; @@ -941,7 +942,7 @@ void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir) { - return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); + return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, 0); } EXPORT_SYMBOL(swiotlb_unmap_sg); diff --git a/lib/syscall.c b/lib/syscall.c index e30e03932480..63239e097b13 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -7,9 +7,19 @@ static int collect_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc) { - struct pt_regs *regs = task_pt_regs(target); - if (unlikely(!regs)) + struct pt_regs *regs; + + if (!try_get_task_stack(target)) { + /* Task has no stack, so the task isn't in a syscall. */ + *callno = -1; + return 0; + } + + regs = task_pt_regs(target); + if (unlikely(!regs)) { + put_task_stack(target); return -EAGAIN; + } *sp = user_stack_pointer(regs); *pc = instruction_pointer(regs); @@ -18,6 +28,7 @@ static int collect_syscall(struct task_struct *target, long *callno, if (*callno != -1L && maxargs > 0) syscall_get_arguments(target, regs, 0, maxargs, args); + put_task_stack(target); return 0; } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 93f45011a59d..0362da0b66c3 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4831,7 +4831,7 @@ static struct bpf_test tests[] = { { }, INTERNAL, { 0x34 }, - { { 1, 0xbef } }, + { { ETH_HLEN, 0xbef } }, .fill_helper = bpf_fill_ld_abs_vlan_push_pop, }, /* @@ -5485,6 +5485,7 @@ static struct sk_buff *populate_skb(char *buf, int size) skb->hash = SKB_HASH; skb->queue_mapping = SKB_QUEUE_MAP; skb->vlan_tci = SKB_VLAN_TCI; + skb->vlan_proto = htons(ETH_P_IP); skb->dev = &dev; skb->dev->ifindex = SKB_DEV_IFINDEX; skb->dev->type = SKB_DEV_TYPE; diff --git a/lib/test_hash.c b/lib/test_hash.c index c9549c8b4909..cac20c5fb304 100644 --- a/lib/test_hash.c +++ b/lib/test_hash.c @@ -143,7 +143,7 @@ static int __init test_hash_init(void) { char buf[SIZE+1]; - u32 string_or = 0, hash_or[2][33] = { 0 }; + u32 string_or = 0, hash_or[2][33] = { { 0, } }; unsigned tests = 0; unsigned long long h64 = 0; int i, j; @@ -155,8 +155,8 @@ test_hash_init(void) buf[j] = '\0'; for (i = 0; i <= j; i++) { - u64 hashlen = hashlen_string(buf+i); - u32 h0 = full_name_hash(buf+i, j-i); + u64 hashlen = hashlen_string(buf+i, buf+i); + u32 h0 = full_name_hash(buf+i, buf+i, j-i); /* Check that hashlen_string gets the length right */ if (hashlen_len(hashlen) != j-i) { @@ -219,21 +219,27 @@ test_hash_init(void) } /* Issue notices about skipped tests. */ -#ifndef HAVE_ARCH__HASH_32 - pr_info("__hash_32() has no arch implementation to test."); -#elif HAVE_ARCH__HASH_32 != 1 +#ifdef HAVE_ARCH__HASH_32 +#if HAVE_ARCH__HASH_32 != 1 pr_info("__hash_32() is arch-specific; not compared to generic."); #endif -#ifndef HAVE_ARCH_HASH_32 - pr_info("hash_32() has no arch implementation to test."); -#elif HAVE_ARCH_HASH_32 != 1 +#else + pr_info("__hash_32() has no arch implementation to test."); +#endif +#ifdef HAVE_ARCH_HASH_32 +#if HAVE_ARCH_HASH_32 != 1 pr_info("hash_32() is arch-specific; not compared to generic."); #endif -#ifndef HAVE_ARCH_HASH_64 - pr_info("hash_64() has no arch implementation to test."); -#elif HAVE_ARCH_HASH_64 != 1 +#else + pr_info("hash_32() has no arch implementation to test."); +#endif +#ifdef HAVE_ARCH_HASH_64 +#if HAVE_ARCH_HASH_64 != 1 pr_info("hash_64() is arch-specific; not compared to generic."); #endif +#else + pr_info("hash_64() has no arch implementation to test."); +#endif pr_notice("%u tests passed.", tests); diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 297fdb5e74bd..64e899b63337 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -38,7 +38,7 @@ MODULE_PARM_DESC(runs, "Number of test runs per variant (default: 4)"); static int max_size = 0; module_param(max_size, int, 0); -MODULE_PARM_DESC(runs, "Maximum table size (default: calculated)"); +MODULE_PARM_DESC(max_size, "Maximum table size (default: calculated)"); static bool shrinking = false; module_param(shrinking, bool, 0); diff --git a/lib/ubsan.c b/lib/ubsan.c index 8799ae5e2e42..fb0409df1bcf 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -308,7 +308,7 @@ static void handle_object_size_mismatch(struct type_mismatch_data *data, return; ubsan_prologue(&data->location, &flags); - pr_err("%s address %pk with insufficient space\n", + pr_err("%s address %p with insufficient space\n", type_check_kinds[data->type_check_kind], (void *) ptr); pr_err("for an object of type %s\n", data->type->type_name); diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c index f0b323abb4c6..ae8d2491133c 100644 --- a/lib/ucs2_string.c +++ b/lib/ucs2_string.c @@ -56,7 +56,7 @@ ucs2_utf8size(const ucs2_char_t *src) unsigned long i; unsigned long j = 0; - for (i = 0; i < ucs2_strlen(src); i++) { + for (i = 0; src[i]; i++) { u16 c = src[i]; if (c >= 0x800) diff --git a/lib/usercopy.c b/lib/usercopy.c deleted file mode 100644 index 4f5b1ddbcd25..000000000000 --- a/lib/usercopy.c +++ /dev/null @@ -1,9 +0,0 @@ -#include <linux/export.h> -#include <linux/bug.h> -#include <linux/uaccess.h> - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/lib/win_minmax.c b/lib/win_minmax.c new file mode 100644 index 000000000000..c8420d404926 --- /dev/null +++ b/lib/win_minmax.c @@ -0,0 +1,98 @@ +/** + * lib/minmax.c: windowed min/max tracker + * + * Kathleen Nichols' algorithm for tracking the minimum (or maximum) + * value of a data stream over some fixed time interval. (E.g., + * the minimum RTT over the past five minutes.) It uses constant + * space and constant time per update yet almost always delivers + * the same minimum as an implementation that has to keep all the + * data in the window. + * + * The algorithm keeps track of the best, 2nd best & 3rd best min + * values, maintaining an invariant that the measurement time of + * the n'th best >= n-1'th best. It also makes sure that the three + * values are widely separated in the time window since that bounds + * the worse case error when that data is monotonically increasing + * over the window. + * + * Upon getting a new min, we can forget everything earlier because + * it has no value - the new min is <= everything else in the window + * by definition and it's the most recent. So we restart fresh on + * every new min and overwrites 2nd & 3rd choices. The same property + * holds for 2nd & 3rd best. + */ +#include <linux/module.h> +#include <linux/win_minmax.h> + +/* As time advances, update the 1st, 2nd, and 3rd choices. */ +static u32 minmax_subwin_update(struct minmax *m, u32 win, + const struct minmax_sample *val) +{ + u32 dt = val->t - m->s[0].t; + + if (unlikely(dt > win)) { + /* + * Passed entire window without a new val so make 2nd + * choice the new val & 3rd choice the new 2nd choice. + * we may have to iterate this since our 2nd choice + * may also be outside the window (we checked on entry + * that the third choice was in the window). + */ + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + if (unlikely(val->t - m->s[0].t > win)) { + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + } + } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) { + /* + * We've passed a quarter of the window without a new val + * so take a 2nd choice from the 2nd quarter of the window. + */ + m->s[2] = m->s[1] = *val; + } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) { + /* + * We've passed half the window without finding a new val + * so take a 3rd choice from the last half of the window + */ + m->s[2] = *val; + } + return m->s[0].v; +} + +/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */ +u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v >= m->s[0].v) || /* found new max? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v >= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v >= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} +EXPORT_SYMBOL(minmax_running_max); + +/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */ +u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v <= m->s[0].v) || /* found new min? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v <= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v <= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} |