From 6070bf3596f3b5a54091a08d5b2bc90c143dc264 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 8 Nov 2010 11:20:49 +0900 Subject: kernel: Constify temporary variable in roundup() Fix build error with GCC 3.x caused by commit b28efd54 "kernel: roundup should only reference arguments once" by constifying temporary variable used in that macro. Signed-off-by: Tetsuo Handa Suggested-by: Andrew Morton Acked-by: Eric Paris Signed-off-by: James Morris --- include/linux/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 450092c1e35f..b526947bdf48 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -60,7 +60,7 @@ extern const char linux_proc_banner[]; #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define roundup(x, y) ( \ { \ - typeof(y) __y = y; \ + const typeof(y) __y = y; \ (((x) + (__y - 1)) / __y) * __y; \ } \ ) -- cgit v1.2.3 From 074e61ec3751da9ab88ee66d3818574556c03489 Mon Sep 17 00:00:00 2001 From: James Morris Date: Wed, 10 Nov 2010 09:01:31 +1100 Subject: kernel: add roundup() code comment from akpm Add roundup() code comment from akpm. Signed-off-by: Andrew Morton Signed-off-by: James Morris --- include/linux/kernel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b526947bdf48..3f648d204c37 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -58,6 +58,8 @@ extern const char linux_proc_banner[]; #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) + +/* The `const' in roundup() prevents gcc-3.3 from calling __divdi3 */ #define roundup(x, y) ( \ { \ const typeof(y) __y = y; \ -- cgit v1.2.3 From eaf06b241b091357e72b76863ba16e89610d31bd Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Thu, 11 Nov 2010 14:05:18 -0800 Subject: Restrict unprivileged access to kernel syslog The kernel syslog contains debugging information that is often useful during exploitation of other vulnerabilities, such as kernel heap addresses. Rather than futilely attempt to sanitize hundreds (or thousands) of printk statements and simultaneously cripple useful debugging functionality, it is far simpler to create an option that prevents unprivileged users from reading the syslog. This patch, loosely based on grsecurity's GRKERNSEC_DMESG, creates the dmesg_restrict sysctl. When set to "0", the default, no restrictions are enforced. When set to "1", only users with CAP_SYS_ADMIN can read the kernel syslog via dmesg(8) or other mechanisms. [akpm@linux-foundation.org: explain the config option in kernel.txt] Signed-off-by: Dan Rosenberg Acked-by: Ingo Molnar Acked-by: Eugene Teo Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 14 ++++++++++++++ include/linux/kernel.h | 1 + kernel/printk.c | 6 ++++++ kernel/sysctl.c | 9 +++++++++ security/Kconfig | 12 ++++++++++++ security/commoncap.c | 2 ++ 6 files changed, 44 insertions(+) (limited to 'include/linux/kernel.h') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 3894eaa23486..209e1584c3dc 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -28,6 +28,7 @@ show up in /proc/sys/kernel: - core_uses_pid - ctrl-alt-del - dentry-state +- dmesg_restrict - domainname - hostname - hotplug @@ -213,6 +214,19 @@ to decide what to do with it. ============================================================== +dmesg_restrict: + +This toggle indicates whether unprivileged users are prevented from using +dmesg(8) to view messages from the kernel's log buffer. When +dmesg_restrict is set to (0) there are no restrictions. When +dmesg_restrict is set set to (1), users must have CAP_SYS_ADMIN to use +dmesg(8). + +The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the default +value of dmesg_restrict. + +============================================================== + domainname & hostname: These files can be used to set the NIS/YP domainname and the diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b526947bdf48..fc3da9e4da19 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -293,6 +293,7 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); extern int printk_delay_msec; +extern int dmesg_restrict; /* * Print a one-time message (analogous to WARN_ONCE() et al): diff --git a/kernel/printk.c b/kernel/printk.c index b2ebaee8c377..38e7d5868d60 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -261,6 +261,12 @@ static inline void boot_delay_msec(void) } #endif +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c33a1edb799f..b65bf634035e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -703,6 +703,15 @@ static struct ctl_table kern_table[] = { .extra2 = &ten_thousand, }, #endif + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "ngroups_max", .data = &ngroups_max, diff --git a/security/Kconfig b/security/Kconfig index bd72ae623494..e80da955e687 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -39,6 +39,18 @@ config KEYS_DEBUG_PROC_KEYS If you are unsure as to whether this is required, answer N. +config SECURITY_DMESG_RESTRICT + bool "Restrict unprivileged access to the kernel syslog" + default n + help + This enforces restrictions on unprivileged users reading the kernel + syslog via dmesg(8). + + If this option is not selected, no restrictions will be enforced + unless the dmesg_restrict sysctl is explicitly set to (1). + + If you are unsure how to answer this question, answer N. + config SECURITY bool "Enable different security models" depends on SYSFS diff --git a/security/commoncap.c b/security/commoncap.c index 5e632b4857e4..04b80f9912bf 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -895,6 +895,8 @@ int cap_syslog(int type, bool from_file) { if (type != SYSLOG_ACTION_OPEN && from_file) return 0; + if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) + return -EPERM; if ((type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER) && !capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3 From 968ab1838a5d48f02f5b471aa1d0e59e2cc2ccbc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 15 Nov 2010 13:37:37 -0800 Subject: include/linux/kernel.h: Move logging bits to include/linux/printk.h Move the logging bits from kernel.h into printk.h so that there is a bit more logical separation of the generic from the printk logging specific parts. Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 245 +----------------------------------------------- include/linux/printk.h | 248 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 249 insertions(+), 244 deletions(-) create mode 100644 include/linux/printk.h (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fc3da9e4da19..b6de9a6f7018 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -17,13 +17,11 @@ #include #include #include +#include #include #include #include -extern const char linux_banner[]; -extern const char linux_proc_banner[]; - #define USHRT_MAX ((u16)(~0U)) #define SHRT_MAX ((s16)(USHRT_MAX>>1)) #define SHRT_MIN ((s16)(-SHRT_MAX - 1)) @@ -110,31 +108,6 @@ extern const char linux_proc_banner[]; */ #define lower_32_bits(n) ((u32)(n)) -#define KERN_EMERG "<0>" /* system is unusable */ -#define KERN_ALERT "<1>" /* action must be taken immediately */ -#define KERN_CRIT "<2>" /* critical conditions */ -#define KERN_ERR "<3>" /* error conditions */ -#define KERN_WARNING "<4>" /* warning conditions */ -#define KERN_NOTICE "<5>" /* normal but significant condition */ -#define KERN_INFO "<6>" /* informational */ -#define KERN_DEBUG "<7>" /* debug-level messages */ - -/* Use the default kernel loglevel */ -#define KERN_DEFAULT "" -/* - * Annotation for a "continued" line of log printout (only done after a - * line that had no enclosing \n). Only to be used by core/arch code - * during early bootup (a continued line is not SMP-safe otherwise). - */ -#define KERN_CONT "" - -extern int console_printk[]; - -#define console_loglevel (console_printk[0]) -#define default_message_loglevel (console_printk[1]) -#define minimum_console_loglevel (console_printk[2]) -#define default_console_loglevel (console_printk[3]) - struct completion; struct pt_regs; struct user; @@ -187,11 +160,6 @@ static inline void might_fault(void) } #endif -struct va_format { - const char *fmt; - va_list *va; -}; - extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); NORET_TYPE void panic(const char * fmt, ...) @@ -245,115 +213,8 @@ extern int func_ptr_is_kernel_text(void *ptr); struct pid; extern struct pid *session_of_pgrp(struct pid *pgrp); -/* - * FW_BUG - * Add this to a message where you are sure the firmware is buggy or behaves - * really stupid or out of spec. Be aware that the responsible BIOS developer - * should be able to fix this issue or at least get a concrete idea of the - * problem by reading your message without the need of looking at the kernel - * code. - * - * Use it for definite and high priority BIOS bugs. - * - * FW_WARN - * Use it for not that clear (e.g. could the kernel messed up things already?) - * and medium priority BIOS bugs. - * - * FW_INFO - * Use this one if you want to tell the user or vendor about something - * suspicious, but generally harmless related to the firmware. - * - * Use it for information or very low priority BIOS bugs. - */ -#define FW_BUG "[Firmware Bug]: " -#define FW_WARN "[Firmware Warn]: " -#define FW_INFO "[Firmware Info]: " - -/* - * HW_ERR - * Add this to a message for hardware errors, so that user can report - * it to hardware vendor instead of LKML or software vendor. - */ -#define HW_ERR "[Hardware Error]: " - -#ifdef CONFIG_PRINTK -asmlinkage int vprintk(const char *fmt, va_list args) - __attribute__ ((format (printf, 1, 0))); -asmlinkage int printk(const char * fmt, ...) - __attribute__ ((format (printf, 1, 2))) __cold; - -/* - * Please don't use printk_ratelimit(), because it shares ratelimiting state - * with all other unrelated printk_ratelimit() callsites. Instead use - * printk_ratelimited() or plain old __ratelimit(). - */ -extern int __printk_ratelimit(const char *func); -#define printk_ratelimit() __printk_ratelimit(__func__) -extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, - unsigned int interval_msec); - -extern int printk_delay_msec; -extern int dmesg_restrict; - -/* - * Print a one-time message (analogous to WARN_ONCE() et al): - */ -#define printk_once(x...) ({ \ - static bool __print_once; \ - \ - if (!__print_once) { \ - __print_once = true; \ - printk(x); \ - } \ -}) - -void log_buf_kexec_setup(void); -#else -static inline int vprintk(const char *s, va_list args) - __attribute__ ((format (printf, 1, 0))); -static inline int vprintk(const char *s, va_list args) { return 0; } -static inline int printk(const char *s, ...) - __attribute__ ((format (printf, 1, 2))); -static inline int __cold printk(const char *s, ...) { return 0; } -static inline int printk_ratelimit(void) { return 0; } -static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ - unsigned int interval_msec) \ - { return false; } - -/* No effect, but we still get type checking even in the !PRINTK case: */ -#define printk_once(x...) printk(x) - -static inline void log_buf_kexec_setup(void) -{ -} -#endif - -/* - * Dummy printk for disabled debugging statements to use whilst maintaining - * gcc's format and side-effect checking. - */ -static inline __attribute__ ((format (printf, 1, 2))) -int no_printk(const char *s, ...) { return 0; } - -extern int printk_needs_cpu(int cpu); -extern void printk_tick(void); - -extern void asmlinkage __attribute__((format(printf, 1, 2))) - early_printk(const char *fmt, ...); - unsigned long int_sqrt(unsigned long); -static inline void console_silent(void) -{ - console_loglevel = 0; -} - -static inline void console_verbose(void) -{ - if (console_loglevel) - console_loglevel = 15; -} - extern void bust_spinlocks(int yes); extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ @@ -390,22 +251,6 @@ extern enum system_states { #define TAINT_CRAP 10 #define TAINT_FIRMWARE_WORKAROUND 11 -extern void dump_stack(void) __cold; - -enum { - DUMP_PREFIX_NONE, - DUMP_PREFIX_ADDRESS, - DUMP_PREFIX_OFFSET -}; -extern void hex_dump_to_buffer(const void *buf, size_t len, - int rowsize, int groupsize, - char *linebuf, size_t linebuflen, bool ascii); -extern void print_hex_dump(const char *level, const char *prefix_str, - int prefix_type, int rowsize, int groupsize, - const void *buf, size_t len, bool ascii); -extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len); - extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] #define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4] @@ -419,94 +264,6 @@ static inline char *pack_hex_byte(char *buf, u8 byte) extern int hex_to_bin(char ch); -#ifndef pr_fmt -#define pr_fmt(fmt) fmt -#endif - -#define pr_emerg(fmt, ...) \ - printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert(fmt, ...) \ - printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit(fmt, ...) \ - printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err(fmt, ...) \ - printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warning(fmt, ...) \ - printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn pr_warning -#define pr_notice(fmt, ...) \ - printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info(fmt, ...) \ - printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -#define pr_cont(fmt, ...) \ - printk(KERN_CONT fmt, ##__VA_ARGS__) - -/* pr_devel() should produce zero code unless DEBUG is defined */ -#ifdef DEBUG -#define pr_devel(fmt, ...) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) -#else -#define pr_devel(fmt, ...) \ - ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) -#endif - -/* If you are writing a driver, please use dev_dbg instead */ -#if defined(DEBUG) -#define pr_debug(fmt, ...) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) -#elif defined(CONFIG_DYNAMIC_DEBUG) -/* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */ -#define pr_debug(fmt, ...) \ - dynamic_pr_debug(fmt, ##__VA_ARGS__) -#else -#define pr_debug(fmt, ...) \ - ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) -#endif - -/* - * ratelimited messages with local ratelimit_state, - * no local ratelimit_state used in the !PRINTK case - */ -#ifdef CONFIG_PRINTK -#define printk_ratelimited(fmt, ...) ({ \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - \ - if (__ratelimit(&_rs)) \ - printk(fmt, ##__VA_ARGS__); \ -}) -#else -/* No effect, but we still get type checking even in the !PRINTK case: */ -#define printk_ratelimited printk -#endif - -#define pr_emerg_ratelimited(fmt, ...) \ - printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert_ratelimited(fmt, ...) \ - printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit_ratelimited(fmt, ...) \ - printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err_ratelimited(fmt, ...) \ - printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warning_ratelimited(fmt, ...) \ - printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn_ratelimited pr_warning_ratelimited -#define pr_notice_ratelimited(fmt, ...) \ - printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info_ratelimited(fmt, ...) \ - printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -/* no pr_cont_ratelimited, don't do that... */ -/* If you are writing a driver, please use dev_dbg instead */ -#if defined(DEBUG) -#define pr_debug_ratelimited(fmt, ...) \ - printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) -#else -#define pr_debug_ratelimited(fmt, ...) \ - ({ if (0) printk_ratelimited(KERN_DEBUG pr_fmt(fmt), \ - ##__VA_ARGS__); 0; }) -#endif - /* * General tracing related utility functions - trace_printk(), * tracing_on/tracing_off and tracing_start()/tracing_stop diff --git a/include/linux/printk.h b/include/linux/printk.h new file mode 100644 index 000000000000..b772ca5fbdf0 --- /dev/null +++ b/include/linux/printk.h @@ -0,0 +1,248 @@ +#ifndef __KERNEL_PRINTK__ +#define __KERNEL_PRINTK__ + +extern const char linux_banner[]; +extern const char linux_proc_banner[]; + +#define KERN_EMERG "<0>" /* system is unusable */ +#define KERN_ALERT "<1>" /* action must be taken immediately */ +#define KERN_CRIT "<2>" /* critical conditions */ +#define KERN_ERR "<3>" /* error conditions */ +#define KERN_WARNING "<4>" /* warning conditions */ +#define KERN_NOTICE "<5>" /* normal but significant condition */ +#define KERN_INFO "<6>" /* informational */ +#define KERN_DEBUG "<7>" /* debug-level messages */ + +/* Use the default kernel loglevel */ +#define KERN_DEFAULT "" +/* + * Annotation for a "continued" line of log printout (only done after a + * line that had no enclosing \n). Only to be used by core/arch code + * during early bootup (a continued line is not SMP-safe otherwise). + */ +#define KERN_CONT "" + +extern int console_printk[]; + +#define console_loglevel (console_printk[0]) +#define default_message_loglevel (console_printk[1]) +#define minimum_console_loglevel (console_printk[2]) +#define default_console_loglevel (console_printk[3]) + +struct va_format { + const char *fmt; + va_list *va; +}; + +/* + * FW_BUG + * Add this to a message where you are sure the firmware is buggy or behaves + * really stupid or out of spec. Be aware that the responsible BIOS developer + * should be able to fix this issue or at least get a concrete idea of the + * problem by reading your message without the need of looking at the kernel + * code. + * + * Use it for definite and high priority BIOS bugs. + * + * FW_WARN + * Use it for not that clear (e.g. could the kernel messed up things already?) + * and medium priority BIOS bugs. + * + * FW_INFO + * Use this one if you want to tell the user or vendor about something + * suspicious, but generally harmless related to the firmware. + * + * Use it for information or very low priority BIOS bugs. + */ +#define FW_BUG "[Firmware Bug]: " +#define FW_WARN "[Firmware Warn]: " +#define FW_INFO "[Firmware Info]: " + +/* + * HW_ERR + * Add this to a message for hardware errors, so that user can report + * it to hardware vendor instead of LKML or software vendor. + */ +#define HW_ERR "[Hardware Error]: " + +#ifdef CONFIG_PRINTK +asmlinkage int vprintk(const char *fmt, va_list args) + __attribute__ ((format (printf, 1, 0))); +asmlinkage int printk(const char * fmt, ...) + __attribute__ ((format (printf, 1, 2))) __cold; + +/* + * Please don't use printk_ratelimit(), because it shares ratelimiting state + * with all other unrelated printk_ratelimit() callsites. Instead use + * printk_ratelimited() or plain old __ratelimit(). + */ +extern int __printk_ratelimit(const char *func); +#define printk_ratelimit() __printk_ratelimit(__func__) +extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msec); + +extern int printk_delay_msec; +extern int dmesg_restrict; + +/* + * Print a one-time message (analogous to WARN_ONCE() et al): + */ +#define printk_once(x...) ({ \ + static bool __print_once; \ + \ + if (!__print_once) { \ + __print_once = true; \ + printk(x); \ + } \ +}) + +void log_buf_kexec_setup(void); +#else +static inline int vprintk(const char *s, va_list args) + __attribute__ ((format (printf, 1, 0))); +static inline int vprintk(const char *s, va_list args) { return 0; } +static inline int printk(const char *s, ...) + __attribute__ ((format (printf, 1, 2))); +static inline int __cold printk(const char *s, ...) { return 0; } +static inline int printk_ratelimit(void) { return 0; } +static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ + unsigned int interval_msec) \ + { return false; } + +/* No effect, but we still get type checking even in the !PRINTK case: */ +#define printk_once(x...) printk(x) + +static inline void log_buf_kexec_setup(void) +{ +} +#endif + +/* + * Dummy printk for disabled debugging statements to use whilst maintaining + * gcc's format and side-effect checking. + */ +static inline __attribute__ ((format (printf, 1, 2))) +int no_printk(const char *s, ...) { return 0; } + +extern int printk_needs_cpu(int cpu); +extern void printk_tick(void); + +extern void asmlinkage __attribute__((format(printf, 1, 2))) + early_printk(const char *fmt, ...); + +static inline void console_silent(void) +{ + console_loglevel = 0; +} + +static inline void console_verbose(void) +{ + if (console_loglevel) + console_loglevel = 15; +} + +extern void dump_stack(void) __cold; + +enum { + DUMP_PREFIX_NONE, + DUMP_PREFIX_ADDRESS, + DUMP_PREFIX_OFFSET +}; +extern void hex_dump_to_buffer(const void *buf, size_t len, + int rowsize, int groupsize, + char *linebuf, size_t linebuflen, bool ascii); +extern void print_hex_dump(const char *level, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii); +extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, + const void *buf, size_t len); + +#ifndef pr_fmt +#define pr_fmt(fmt) fmt +#endif + +#define pr_emerg(fmt, ...) \ + printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert(fmt, ...) \ + printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit(fmt, ...) \ + printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err(fmt, ...) \ + printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warning(fmt, ...) \ + printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn pr_warning +#define pr_notice(fmt, ...) \ + printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info(fmt, ...) \ + printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) +#define pr_cont(fmt, ...) \ + printk(KERN_CONT fmt, ##__VA_ARGS__) + +/* pr_devel() should produce zero code unless DEBUG is defined */ +#ifdef DEBUG +#define pr_devel(fmt, ...) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#else +#define pr_devel(fmt, ...) \ + ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) +#endif + +/* If you are writing a driver, please use dev_dbg instead */ +#if defined(DEBUG) +#define pr_debug(fmt, ...) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#elif defined(CONFIG_DYNAMIC_DEBUG) +/* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */ +#define pr_debug(fmt, ...) \ + dynamic_pr_debug(fmt, ##__VA_ARGS__) +#else +#define pr_debug(fmt, ...) \ + ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) +#endif + +/* + * ratelimited messages with local ratelimit_state, + * no local ratelimit_state used in the !PRINTK case + */ +#ifdef CONFIG_PRINTK +#define printk_ratelimited(fmt, ...) ({ \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&_rs)) \ + printk(fmt, ##__VA_ARGS__); \ +}) +#else +/* No effect, but we still get type checking even in the !PRINTK case: */ +#define printk_ratelimited printk +#endif + +#define pr_emerg_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warning_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn_ratelimited pr_warning_ratelimited +#define pr_notice_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) +/* no pr_cont_ratelimited, don't do that... */ +/* If you are writing a driver, please use dev_dbg instead */ +#if defined(DEBUG) +#define pr_debug_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#else +#define pr_debug_ratelimited(fmt, ...) \ + ({ if (0) printk_ratelimited(KERN_DEBUG pr_fmt(fmt), \ + ##__VA_ARGS__); 0; }) +#endif + +#endif -- cgit v1.2.3 From dc88e46029486ed475c71fe1bb696d39511ac8fe Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 23 Nov 2010 17:50:31 -0500 Subject: lib: hex2bin converts ascii hexadecimal string to binary Similar to the kgdb_hex2mem() code, hex2bin converts a string to binary using the hex_to_bin() library call. Changelog: - Replace parameter names with src/dst (based on David Howell's comment) - Add 'const' where needed (based on David Howell's comment) - Replace int with size_t (based on David Howell's comment) Signed-off-by: Mimi Zohar Acked-by: Serge E. Hallyn Acked-by: David Howells Signed-off-by: James Morris --- include/linux/kernel.h | 1 + lib/hexdump.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a35b4f7332f0..d0fbc043de60 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -265,6 +265,7 @@ static inline char *pack_hex_byte(char *buf, u8 byte) } extern int hex_to_bin(char ch); +extern void hex2bin(u8 *dst, const char *src, size_t count); /* * General tracing related utility functions - trace_printk(), diff --git a/lib/hexdump.c b/lib/hexdump.c index 5d7a4802c562..b66b2bd67952 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -33,6 +33,22 @@ int hex_to_bin(char ch) } EXPORT_SYMBOL(hex_to_bin); +/** + * hex2bin - convert an ascii hexadecimal string to its binary representation + * @dst: binary result + * @src: ascii hexadecimal string + * @count: result length + */ +void hex2bin(u8 *dst, const char *src, size_t count) +{ + while (count--) { + *dst = hex_to_bin(*src++) << 4; + *dst += hex_to_bin(*src++); + dst++; + } +} +EXPORT_SYMBOL(hex2bin); + /** * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory * @buf: data blob to dump -- cgit v1.2.3 From 71a9048448de302d1e968f336de01060d02fae71 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 12 Jan 2011 16:59:35 -0800 Subject: include/linux/kernel.h: abs(): fix handling of 32-bit unsigneds on 64-bit Michal reports: In the framebuffer subsystem the abs() macro is often used as a part of the calculation of a Manhattan metric, which in turn is used as a measure of similarity between video modes. The arguments of abs() are sometimes unsigned numbers. This worked fine until commit a49c59c0 ("Make sure the value in abs() does not get truncated if it is greater than 2^32:) , which changed the definition of abs() to prevent truncation. As a result of this change, in the following piece of code: u32 a = 0, b = 1; u32 c = abs(a - b); 'c' will end up with a value of 0xffffffff instead of the expected 0x1. A problem caused by this change and visible by the end user is that framebuffer drivers relying on functions from modedb.c will fail to find high resolution video modes similar to that explicitly requested by the user if an exact match cannot be found (see e.g. Fix this by special-casing `long' types within abs(). This patch reduces x86_64 code size a bit - drivers/video/uvesafb.o shrunk by 15 bytes, presumably because it is doing abs() on 4-byte quantities, and expanding those to 8-byte longs adds code. testcase: #define oldabs(x) ({ \ long __x = (x); \ (__x < 0) ? -__x : __x; \ }) #define newabs(x) ({ \ long ret; \ if (sizeof(x) == sizeof(long)) { \ long __x = (x); \ ret = (__x < 0) ? -__x : __x; \ } else { \ int __x = (x); \ ret = (__x < 0) ? -__x : __x; \ } \ ret; \ }) typedef unsigned int u32; main() { u32 a = 0; u32 b = 1; u32 oldc = oldabs(a - b); u32 newc = newabs(a - b); printf("%u %u\n", oldc, newc); } akpm:/home/akpm> gcc t.c akpm:/home/akpm> ./a.out 4294967295 1 Reported-by: Michal Januszewski Cc: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d0fbc043de60..57dac7022b63 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -143,9 +143,22 @@ extern int _cond_resched(void); #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) -#define abs(x) ({ \ - long __x = (x); \ - (__x < 0) ? -__x : __x; \ +/* + * abs() handles unsigned and signed longs, ints, shorts and chars. For all + * input types abs() returns a signed long. + * abs() should not be used for 64-bit types (s64, u64, long long) - use abs64() + * for those. + */ +#define abs(x) ({ \ + long ret; \ + if (sizeof(x) == sizeof(long)) { \ + long __x = (x); \ + ret = (__x < 0) ? -__x : __x; \ + } else { \ + int __x = (x); \ + ret = (__x < 0) ? -__x : __x; \ + } \ + ret; \ }) #define abs64(x) ({ \ -- cgit v1.2.3 From 3e7d344970673c5334cf7b5bb27c8c0942b06126 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:56 -0800 Subject: mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim Lumpy reclaim is disruptive. It reclaims a large number of pages and ignores the age of the pages it reclaims. This can incur significant stalls and potentially increase the number of major faults. Compaction has reached the point where it is considered reasonably stable (meaning it has passed a lot of testing) and is a potential candidate for displacing lumpy reclaim. This patch introduces an alternative to lumpy reclaim whe compaction is available called reclaim/compaction. The basic operation is very simple - instead of selecting a contiguous range of pages to reclaim, a number of order-0 pages are reclaimed and then compaction is later by either kswapd (compact_zone_order()) or direct compaction (__alloc_pages_direct_compact()). [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: use conventional task_struct naming] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 14 +++++++ include/linux/kernel.h | 7 ++++ mm/compaction.c | 89 ++++++++++++++++++++++++--------------- mm/migrate.c | 17 ++++++++ mm/page_alloc.c | 16 +++++++ mm/vmscan.c | 102 ++++++++++++++++++++++++++++++++++++++------- 6 files changed, 196 insertions(+), 49 deletions(-) (limited to 'include/linux/kernel.h') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 5ac51552d908..2592883d862d 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,6 +22,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +extern unsigned long compaction_suitable(struct zone *zone, int order); +extern unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -59,6 +62,17 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_CONTINUE; } +static inline unsigned long compaction_suitable(struct zone *zone, int order) +{ + return COMPACT_SKIPPED; +} + +static inline unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask) +{ + return 0; +} + static inline void defer_compaction(struct zone *zone) { } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 57dac7022b63..5a9d9059520b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -600,6 +600,13 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* This helps us avoid #ifdef CONFIG_COMPACTION */ +#ifdef CONFIG_COMPACTION +#define COMPACTION_BUILD 1 +#else +#define COMPACTION_BUILD 0 +#endif + /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD diff --git a/mm/compaction.c b/mm/compaction.c index 20011a850fef..8fe917ec7c11 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -384,10 +384,62 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ +unsigned long compaction_suitable(struct zone *zone, int order) +{ + int fragindex; + unsigned long watermark; + + /* + * Watermarks for order-0 must be met for compaction. Note the 2UL. + * This is because during migration, copies of pages need to be + * allocated and for a short time, the footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return COMPACT_SKIPPED; + + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1 implies allocations might succeed dependingon watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + return COMPACT_SKIPPED; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) + return COMPACT_PARTIAL; + + return COMPACT_CONTINUE; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + ret = compaction_suitable(zone, cc->order); + switch (ret) { + case COMPACT_PARTIAL: + case COMPACT_SKIPPED: + /* Compaction is likely to fail */ + return ret; + case COMPACT_CONTINUE: + /* Fall through to compaction */ + ; + } + /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; @@ -429,7 +481,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } -static unsigned long compact_zone_order(struct zone *zone, +unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask) { struct compact_control cc = { @@ -462,7 +514,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; - unsigned long watermark; struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; @@ -480,43 +531,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - int fragindex; int status; - /* - * Watermarks for order-0 must be met for compaction. Note - * the 2UL. This is because during migration, copies of - * pages need to be allocated and for a short time, the - * footprint is higher - */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - continue; - - /* - * fragmentation index determines if allocation failures are - * due to low memory or external fragmentation - * - * index of -1 implies allocations might succeed depending - * on watermarks - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation. - */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - continue; - - if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { - rc = COMPACT_PARTIAL; - break; - } - status = compact_zone_order(zone, order, gfp_mask); rc = max(status, rc); - if (zone_watermark_ok(zone, order, watermark, 0, 0)) + /* If a normal allocation would succeed, stop compacting */ + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; } diff --git a/mm/migrate.c b/mm/migrate.c index 6ae8a66a7045..94875b265928 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -639,6 +639,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!trylock_page(page)) { if (!force) goto move_newpage; + + /* + * It's not safe for direct compaction to call lock_page. + * For example, during page readahead pages are added locked + * to the LRU. Later, when the IO completes the pages are + * marked uptodate and unlocked. However, the queueing + * could be merging multiple pages for one bio (e.g. + * mpage_readpages). If an allocation happens for the + * second or third page, the process can end up locking + * the same page twice and deadlocking. Rather than + * trying to be clever about what pages can be locked, + * avoid the use of lock_page for direct compaction + * altogether. + */ + if (current->flags & PF_MEMALLOC) + goto move_newpage; + lock_page(page); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22a1bb7723e4..03a66a31bfcd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1815,12 +1815,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int migratetype, unsigned long *did_some_progress) { struct page *page; + struct task_struct *tsk = current; if (!order || compaction_deferred(preferred_zone)) return NULL; + tsk->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask); + tsk->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -2121,6 +2124,19 @@ rebalance: /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; + } else { + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; } nopage: diff --git a/mm/vmscan.c b/mm/vmscan.c index 3464312bde07..10ebd74a423c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -59,12 +60,15 @@ * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference * page from the LRU and reclaim all pages within a * naturally aligned range + * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of + * order-0 pages and then compact the zone */ typedef unsigned __bitwise__ lumpy_mode; #define LUMPY_MODE_SINGLE ((__force lumpy_mode)0x01u) #define LUMPY_MODE_ASYNC ((__force lumpy_mode)0x02u) #define LUMPY_MODE_SYNC ((__force lumpy_mode)0x04u) #define LUMPY_MODE_CONTIGRECLAIM ((__force lumpy_mode)0x08u) +#define LUMPY_MODE_COMPACTION ((__force lumpy_mode)0x10u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ @@ -286,18 +290,20 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; /* - * Some reclaim have alredy been failed. No worth to try synchronous - * lumpy reclaim. + * Initially assume we are entering either lumpy reclaim or + * reclaim/compaction.Depending on the order, we will either set the + * sync mode or just reclaim order-0 pages later. */ - if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) - return; + if (COMPACTION_BUILD) + sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. + * Avoid using lumpy reclaim or reclaim/compaction if possible by + * restricting when its set to either costly allocations or when + * under memory pressure */ - sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; if (sc->order > PAGE_ALLOC_COSTLY_ORDER) sc->lumpy_reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) @@ -1385,8 +1391,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1398,8 +1404,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); /* @@ -1814,6 +1820,57 @@ out: } } +/* + * Reclaim/compaction depends on a number of pages being freed. To avoid + * disruption to the system, a small number of order-0 pages continue to be + * rotated and reclaimed in the normal fashion. However, by the time we get + * back to the allocator and call try_to_compact_zone(), we ensure that + * there are enough free pages for it to be likely successful + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION)) + return false; + + /* + * If we failed to reclaim and have scanned the full list, stop. + * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far + * faster but obviously would be less likely to succeed + * allocation. If this is desirable, use GFP_REPEAT to decide + * if both reclaimed and scanned should be checked or just + * reclaimed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } +} + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1823,9 +1880,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long nr_reclaimed; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + unsigned long nr_scanned = sc->nr_scanned; +restart: + nr_reclaimed = 0; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1851,8 +1911,7 @@ static void shrink_zone(int priority, struct zone *zone, if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } - - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to @@ -1861,6 +1920,11 @@ static void shrink_zone(int priority, struct zone *zone, if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + /* reclaim/compaction might need reclaim to continue */ + if (should_continue_reclaim(zone, nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)) + goto restart; + throttle_vm_writeout(sc->gfp_mask); } @@ -2307,6 +2371,14 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + /* + * Compact the zone for higher orders to reduce + * latencies for higher-order allocations that + * would ordinarily call try_to_compact_pages() + */ + if (sc.order > PAGE_ALLOC_COSTLY_ORDER) + compact_zone_order(zone, sc.order, sc.gfp_mask); + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; -- cgit v1.2.3 From 2ce802f62ba32a7d95748ac92bf351f76affb6ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Jan 2011 12:06:35 +0100 Subject: lockdep: Move early boot local IRQ enable/disable status to init/main.c During early boot, local IRQ is disabled until IRQ subsystem is properly initialized. During this time, no one should enable local IRQ and some operations which usually are not allowed with IRQ disabled, e.g. operations which might sleep or require communications with other processors, are allowed. lockdep tracked this with early_boot_irqs_off/on() callbacks. As other subsystems need this information too, move it to init/main.c and make it generally available. While at it, toggle the boolean to early_boot_irqs_disabled instead of enabled so that it can be initialized with %false and %true indicates the exceptional condition. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Pekka Enberg Cc: Linus Torvalds LKML-Reference: <20110120110635.GB6036@htj.dyndns.org> Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- include/linux/kernel.h | 2 ++ include/linux/lockdep.h | 8 -------- init/main.c | 13 +++++++++++-- kernel/lockdep.c | 18 +----------------- kernel/trace/trace_irqsoff.c | 8 -------- 6 files changed, 15 insertions(+), 36 deletions(-) (limited to 'include/linux/kernel.h') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7e8d3bc80af6..50542efe45fb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1194,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void) per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; local_irq_disable(); - early_boot_irqs_off(); + early_boot_irqs_disabled = true; memblock_init(); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5a9d9059520b..d07d8057e440 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -243,6 +243,8 @@ extern int test_taint(unsigned flag); extern unsigned long get_taint(void); extern int root_mountflags; +extern bool early_boot_irqs_disabled; + /* Values used for system_state */ extern enum system_states { SYSTEM_BOOTING, diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 71c09b26c759..f638fd78d106 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -436,16 +436,8 @@ do { \ #endif /* CONFIG_LOCKDEP */ #ifdef CONFIG_TRACE_IRQFLAGS -extern void early_boot_irqs_off(void); -extern void early_boot_irqs_on(void); extern void print_irqtrace_events(struct task_struct *curr); #else -static inline void early_boot_irqs_off(void) -{ -} -static inline void early_boot_irqs_on(void) -{ -} static inline void print_irqtrace_events(struct task_struct *curr) { } diff --git a/init/main.c b/init/main.c index 00799c1d4628..33c37c379e96 100644 --- a/init/main.c +++ b/init/main.c @@ -96,6 +96,15 @@ static inline void mark_rodata_ro(void) { } extern void tc_init(void); #endif +/* + * Debug helper: via this flag we know that we are in 'early bootup code' + * where only the boot processor is running with IRQ disabled. This means + * two things - IRQ must not be enabled before the flag is cleared and some + * operations which are not allowed with IRQ disabled are allowed while the + * flag is set. + */ +bool early_boot_irqs_disabled __read_mostly; + enum system_states system_state __read_mostly; EXPORT_SYMBOL(system_state); @@ -554,7 +563,7 @@ asmlinkage void __init start_kernel(void) cgroup_init_early(); local_irq_disable(); - early_boot_irqs_off(); + early_boot_irqs_disabled = true; /* * Interrupts are still disabled. Do necessary setups, then @@ -621,7 +630,7 @@ asmlinkage void __init start_kernel(void) if (!irqs_disabled()) printk(KERN_CRIT "start_kernel(): bug: interrupts were " "enabled early\n"); - early_boot_irqs_on(); + early_boot_irqs_disabled = false; local_irq_enable(); /* Interrupts are enabled now so all GFP allocations are safe. */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65dff7d9..0d2058da80f5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2291,22 +2291,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) return 1; } -/* - * Debugging helper: via this flag we know that we are in - * 'early bootup code', and will warn about any invalid irqs-on event: - */ -static int early_boot_irqs_enabled; - -void early_boot_irqs_off(void) -{ - early_boot_irqs_enabled = 0; -} - -void early_boot_irqs_on(void) -{ - early_boot_irqs_enabled = 1; -} - /* * Hardirqs will be enabled: */ @@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; - if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; if (unlikely(curr->hardirqs_enabled)) { diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c602b880..92b6e1e12d98 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) * Stubs: */ -void early_boot_irqs_off(void) -{ -} - -void early_boot_irqs_on(void) -{ -} - void trace_softirqs_on(unsigned long ip) { } -- cgit v1.2.3 From 7ef88ad561457c0346355dfd1f53e503ddfde719 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 24 Jan 2011 14:45:10 -0600 Subject: BUILD_BUG_ON: make it handle more cases BUILD_BUG_ON used to use the optimizer to do code elimination or fail at link time; it was changed to first the size of a negative array (a nicer compile time error), then (in 8c87df457cb58fe75b9b893007917cf8095660a0) to a bitfield. This forced us to change some non-constant cases to MAYBE_BUILD_BUG_ON(); as Jan points out in that commit, it didn't work as intended anyway. bitfields: needs a literal constant at parse time, and can't be put under "if (__builtin_constant_p(x))" for example. negative array: can handle anything, but if the compiler can't tell it's a constant, silently has no effect. link time: breaks link if the compiler can't determine the value, but the linker output is not usually as informative as a compiler error. If we use the negative-array-size method *and* the link time trick, we get the ability to use BUILD_BUG_ON() under __builtin_constant_p() branches, and maximal ability for the compiler to detect errors at build time. We also document it thoroughly. Signed-off-by: Rusty Russell Cc: Jan Beulich Acked-by: Hollis Blanchard --- include/linux/kernel.h | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d07d8057e440..864712f3653d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -575,12 +575,6 @@ struct sysinfo { char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */ }; -/* Force a compilation error if condition is true */ -#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) - -/* Force a compilation error if condition is constant and true */ -#define MAYBE_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)])) - /* Force a compilation error if a constant expression is not a power of 2 */ #define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) @@ -592,6 +586,33 @@ struct sysinfo { #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) #define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) +/** + * BUILD_BUG_ON - break compile if a condition is true. + * @cond: the condition which the compiler should know is false. + * + * If you have some code which relies on certain constants being equal, or + * other compile-time-evaluated condition, you should use BUILD_BUG_ON to + * detect if someone changes it. + * + * The implementation uses gcc's reluctance to create a negative array, but + * gcc (as of 4.4) only emits that error for obvious cases (eg. not arguments + * to inline functions). So as a fallback we use the optimizer; if it can't + * prove the condition is false, it will cause a link error on the undefined + * "__build_bug_on_failed". This error message can be harder to track down + * though, hence the two different methods. + */ +#ifndef __OPTIMIZE__ +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#else +extern int __build_bug_on_failed; +#define BUILD_BUG_ON(condition) \ + do { \ + ((void)sizeof(char[1 - 2*!!(condition)])); \ + if (condition) __build_bug_on_failed = 1; \ + } while(0) +#endif +#define MAYBE_BUILD_BUG_ON(condition) BUILD_BUG_ON(condition) + /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) -- cgit v1.2.3 From 1765e3a4933ea0870fabd755feffc5473c4363ce Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 24 Jan 2011 14:45:10 -0600 Subject: Remove MAYBE_BUILD_BUG_ON Now BUILD_BUG_ON() can handle optimizable constants, we don't need MAYBE_BUILD_BUG_ON any more. Signed-off-by: Rusty Russell --- include/linux/gfp.h | 2 +- include/linux/kernel.h | 1 - include/linux/kmemcheck.h | 2 +- include/linux/virtio_config.h | 5 ++++- 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux/kernel.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a3b148a91874..0b84c61607e8 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -249,7 +249,7 @@ static inline enum zone_type gfp_zone(gfp_t flags) ((1 << ZONES_SHIFT) - 1); if (__builtin_constant_p(bit)) - MAYBE_BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); else { #ifdef CONFIG_DEBUG_VM BUG_ON((GFP_ZONE_BAD >> bit) & 1); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 864712f3653d..e2f4d6af2125 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -611,7 +611,6 @@ extern int __build_bug_on_failed; if (condition) __build_bug_on_failed = 1; \ } while(0) #endif -#define MAYBE_BUILD_BUG_ON(condition) BUILD_BUG_ON(condition) /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 08d7dc4ddf40..39f8453239f7 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -76,7 +76,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size); \ _n = (long) &((ptr)->name##_end) \ - (long) &((ptr)->name##_begin); \ - MAYBE_BUILD_BUG_ON(_n < 0); \ + BUILD_BUG_ON(_n < 0); \ \ kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ } while (0) diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 0093dd7c1d6f..800617b4ddd5 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -109,7 +109,10 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, unsigned int fbit) { /* Did you forget to fix assumptions on max features? */ - MAYBE_BUILD_BUG_ON(fbit >= 32); + if (__builtin_constant_p(fbit)) + BUILD_BUG_ON(fbit >= 32); + else + BUG_ON(fbit >= 32); if (fbit < VIRTIO_TRANSPORT_F_START) virtio_check_driver_offered_feature(vdev, fbit); -- cgit v1.2.3 From ffbbf2da9e578dc7b7ae4f945412c4b74f54b20e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 31 Jan 2011 09:29:14 -0800 Subject: kernel.h: fix kernel-doc warning Fix kernel-doc warning in kernel.h from commit 7ef88ad56145 ("BUILD_BUG_ON: make it handle more cases"): Warning(include/linux/kernel.h:605): No description found for parameter 'condition' Warning(include/linux/kernel.h:605): Excess function parameter 'cond' description in 'BUILD_BUG_ON' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/kernel.h') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e2f4d6af2125..2fe6e84894a4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -588,7 +588,7 @@ struct sysinfo { /** * BUILD_BUG_ON - break compile if a condition is true. - * @cond: the condition which the compiler should know is false. + * @condition: the condition which the compiler should know is false. * * If you have some code which relies on certain constants being equal, or * other compile-time-evaluated condition, you should use BUILD_BUG_ON to -- cgit v1.2.3