From 510d82c6621cb8a0897717f58a3b24ac6aa64e85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:36:47 +1000 Subject: shm-fix-null-pointer-deref-when-userspace-specifies-invalid-hugepage-size-fix eliminate ugly 80-col tricks Cc: Dave Jones Cc: Davidlohr Bueso Cc: Li Zefan Cc: Li Zefan Cc: Naoya Horiguchi Cc: Rik van Riel Signed-off-by: Andrew Morton --- ipc/shm.c | 4 ++-- mm/mmap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 7e199fa1960f..85dc001634b1 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -491,10 +491,10 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) sprintf (name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { - struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) - & SHM_HUGE_MASK); + struct hstate *hs; size_t hugesize; + hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) { error = -EINVAL; goto no_file; diff --git a/mm/mmap.c b/mm/mmap.c index f681e1842fad..ec75bb0681f9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1367,9 +1367,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, len = ALIGN(len, huge_page_size(hstate_file(file))); } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; - struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & - SHM_HUGE_MASK); + struct hstate *hs; + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) return -EINVAL; -- cgit v1.2.3 From 0cf5ed3d76b070392903e0b61fa19b4697b62d53 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 23 May 2013 10:36:51 +1000 Subject: CPU hotplug: provide a generic helper to disable/enable CPU hotplug There are instances in the kernel where we would like to disable CPU hotplug (from sysfs) during some important operation. Today the freezer code depends on this and the code to do it was kinda tailor-made for that. Restructure the code and make it generic enough to be useful for other usecases too. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Shawn Guo Cc: Signed-off-by: Andrew Morton --- include/linux/cpu.h | 14 ++++++++++++++ kernel/cpu.c | 27 +++++++++------------------ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 0d868d37bb8e..0bf660c5e6a7 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -167,6 +167,20 @@ static inline void cpu_maps_update_done(void) } #endif /* CONFIG_SMP */ + +#ifdef CONFIG_PM_SLEEP_SMP +extern void cpu_hotplug_enable(void); +extern void cpu_hotplug_disable(void); +#else +static inline void cpu_hotplug_enable(void) +{ +} + +static inline void cpu_hotplug_disable(void) +{ +} +#endif + extern struct bus_type cpu_subsys; #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427e..28769f5ca9a4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -541,29 +541,20 @@ static int __init alloc_frozen_cpus(void) core_initcall(alloc_frozen_cpus); /* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. */ -void cpu_hotplug_disable_before_freeze(void) +void cpu_hotplug_disable(void) { cpu_maps_update_begin(); cpu_hotplug_disabled = 1; cpu_maps_update_done(); } - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) +void cpu_hotplug_enable(void) { cpu_maps_update_begin(); cpu_hotplug_disabled = 0; @@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); + cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); + cpu_hotplug_enable(); break; default: -- cgit v1.2.3 From d3cd512ee0dac26677dc60d0e71e9673defce03f Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 23 May 2013 10:36:52 +1000 Subject: cpu-hotplug-provide-a-generic-helper-to-disable-enable-cpu-hotplug-v11 Signed-off-by: Srivatsa S. Bhat Signed-off-by: Robin Holt Signed-off-by: Andrew Morton --- include/linux/cpu.h | 18 ++++-------------- kernel/cpu.c | 42 +++++++++++++++++++++--------------------- 2 files changed, 25 insertions(+), 35 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 0bf660c5e6a7..944f283f01c4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -167,20 +167,6 @@ static inline void cpu_maps_update_done(void) } #endif /* CONFIG_SMP */ - -#ifdef CONFIG_PM_SLEEP_SMP -extern void cpu_hotplug_enable(void); -extern void cpu_hotplug_disable(void); -#else -static inline void cpu_hotplug_enable(void) -{ -} - -static inline void cpu_hotplug_disable(void) -{ -} -#endif - extern struct bus_type cpu_subsys; #ifdef CONFIG_HOTPLUG_CPU @@ -188,6 +174,8 @@ extern struct bus_type cpu_subsys; extern void get_online_cpus(void); extern void put_online_cpus(void); +extern void cpu_hotplug_disable(void); +extern void cpu_hotplug_enable(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) @@ -211,6 +199,8 @@ static inline void cpu_hotplug_driver_unlock(void) #define get_online_cpus() do { } while (0) #define put_online_cpus() do { } while (0) +#define cpu_hotplug_disable() do { } while (0) +#define cpu_hotplug_enable() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. */ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) diff --git a/kernel/cpu.c b/kernel/cpu.c index 28769f5ca9a4..198a38883e64 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -133,6 +133,27 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } +/* + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. + */ +void cpu_hotplug_disable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + +void cpu_hotplug_enable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} @@ -540,27 +561,6 @@ static int __init alloc_frozen_cpus(void) } core_initcall(alloc_frozen_cpus); -/* - * Wait for currently running CPU hotplug operations to complete (if any) and - * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects - * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the - * hotplug path before performing hotplug operations. So acquiring that lock - * guarantees mutual exclusion from any currently running hotplug operations. - */ -void cpu_hotplug_disable(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - -void cpu_hotplug_enable(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - /* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen -- cgit v1.2.3 From 62e930e9beb83a7cff5854fb26bdf82e8676e41d Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 23 May 2013 10:36:52 +1000 Subject: reboot: rigrate shutdown/reboot to boot cpu We recently noticed that reboot of a 1024 cpu machine takes approx 16 minutes of just stopping the cpus. The slowdown was tracked to commit f96972f. The current implementation does all the work of hot removing the cpus before halting the system. We are switching to just migrating to the boot cpu and then continuing with shutdown/reboot. This also has the effect of not breaking x86's command line parameter for specifying the reboot cpu. Note, this code was shamelessly copied from arch/x86/kernel/reboot.c with bits removed pertaining to the reboot_cpu command line parameter. Signed-off-by: Robin Holt Tested-by: Shawn Guo Cc: "Srivatsa S. Bhat" Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Signed-off-by: Andrew Morton --- kernel/sys.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index b95d3c72ba21..14640d72ba1e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +/* Add backwards compatibility for stable trees. */ +#ifndef PF_NO_SETAFFINITY +#define PF_NO_SETAFFINITY PF_THREAD_BOUND +#endif + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int reboot_cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(reboot_cpu)) + reboot_cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu)); +} + /** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart @@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); @@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); @@ -419,7 +442,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); -- cgit v1.2.3 From ce4eb71715768029c11721921b50d5d33886a6a5 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 23 May 2013 10:36:52 +1000 Subject: migrate-shutdown-reboot-to-boot-cpu-v11 Signed-off-by: Robin Holt Signed-off-by: Andrew Morton --- kernel/sys.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 14640d72ba1e..2bbd9a73b54c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -370,19 +370,19 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ - int reboot_cpu = 0; + int cpu = 0; cpu_hotplug_disable(); /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(reboot_cpu)) - reboot_cpu = cpumask_first(cpu_online_mask); + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); /* Prevent races with other tasks migrating this task */ current->flags |= PF_NO_SETAFFINITY; /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu)); + set_cpus_allowed_ptr(current, cpumask_of(cpu)); } /** -- cgit v1.2.3 From cfa98d4f83a75d9bb820ab660ab703ad39ec9f4f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 May 2013 10:36:53 +1000 Subject: kmsg: honor dmesg_restrict sysctl on /dev/kmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dmesg_restrict sysctl currently covers the syslog method for access dmesg, however /dev/kmsg isn't covered by the same protections. Most people haven't noticed because util-linux dmesg(1) defaults to using the syslog method for access in older versions. With util-linux dmesg(1) defaults to reading directly from /dev/kmsg. To fix /dev/kmsg, let's compare the existing interfaces and what they allow: - /proc/kmsg allows:  - open (SYSLOG_ACTION_OPEN) if CAP_SYSLOG since it uses a destructive single-reader interface (SYSLOG_ACTION_READ).  - everything, after an open. - syslog syscall allows:  - anything, if CAP_SYSLOG.  - SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER, if dmesg_restrict==0.  - nothing else (EPERM). The use-cases were: - dmesg(1) needs to do non-destructive SYSLOG_ACTION_READ_ALLs. - sysklog(1) needs to open /proc/kmsg, drop privs, and still issue the destructive SYSLOG_ACTION_READs. AIUI, dmesg(1) is moving to /dev/kmsg, and systemd-journald doesn't clear the ring buffer. Based on the comments in devkmsg_llseek, it sounds like actions besides reading aren't going to be supported by /dev/kmsg (i.e. SYSLOG_ACTION_CLEAR), so we have a strict subset of the non-destructive syslog syscall actions. To this end, move the check as Josh had done, but also rename the constants to reflect their new uses (SYSLOG_FROM_CALL becomes SYSLOG_FROM_READER, and SYSLOG_FROM_FILE becomes SYSLOG_FROM_PROC). SYSLOG_FROM_READER allows non-destructive actions, and SYSLOG_FROM_PROC allows destructive actions after a capabilities-constrained SYSLOG_ACTION_OPEN check. - /dev/kmsg allows:  - open if CAP_SYSLOG or dmesg_restrict==0 - reading/polling, after open Addresses https://bugzilla.redhat.com/show_bug.cgi?id=903192 Signed-off-by: Kees Cook Reported-by: Christian Kujau Tested-by: Josh Boyer Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton --- fs/proc/kmsg.c | 10 +++--- include/linux/syslog.h | 4 +-- kernel/printk.c | 91 +++++++++++++++++++++++++++----------------------- 3 files changed, 57 insertions(+), 48 deletions(-) diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bd4b5a740ff1..bdfabdaefdce 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return -EAGAIN; - return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return POLLIN | POLLRDNORM; return 0; } diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 38911391a139..98a3153c0f96 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -44,8 +44,8 @@ /* Return size of the log buffer */ #define SYSLOG_ACTION_SIZE_BUFFER 10 -#define SYSLOG_FROM_CALL 0 -#define SYSLOG_FROM_FILE 1 +#define SYSLOG_FROM_READER 0 +#define SYSLOG_FROM_PROC 1 int do_syslog(int type, char __user *buf, int count, bool from_file); diff --git a/kernel/printk.c b/kernel/printk.c index 7b254c33f672..331bd1b6e9ce 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -363,6 +363,53 @@ static void log_store(int facility, int level, log_next_seq++; } +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* + * Unless restricted, we allow "read all" and "get buffer size" + * for everybody. + */ + return type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* + * For historical reasons, accept CAP_SYS_ADMIN too, with + * a warning. + */ + if (capable(CAP_SYS_ADMIN)) { + printk_once(KERN_WARNING "%s (%d): " + "Attempt to access syslog with CAP_SYS_ADMIN " + "but no CAP_SYSLOG (deprecated).\n", + current->comm, task_pid_nr(current)); + return 0; + } + return -EPERM; + } + return security_syslog(type); +} + + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file) if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; - err = security_syslog(SYSLOG_ACTION_READ_ALL); + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); if (err) return err; @@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level) } #endif -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. - */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - #if defined(CONFIG_PRINTK_TIME) static bool printk_time = 1; #else @@ -1249,7 +1258,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len, SYSLOG_FROM_CALL); + return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* -- cgit v1.2.3 From 195c69bbc95b1c79100c7ef2d07f2f8a17ba39cb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:36:53 +1000 Subject: kmsg-honor-dmesg_restrict-sysctl-on-dev-kmsg-fix use pr_warn_once() Cc: Josh Boyer Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index 331bd1b6e9ce..0a4095e2fcda 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -398,9 +398,9 @@ static int check_syslog_permissions(int type, bool from_file) * a warning. */ if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", + pr_warn_once("%s (%d): Attempt to access syslog with " + "CAP_SYS_ADMIN but no CAP_SYSLOG " + "(deprecated).\n", current->comm, task_pid_nr(current)); return 0; } -- cgit v1.2.3 From 3108b515752c1b1871abac899870058a1e605304 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 23 May 2013 10:36:53 +1000 Subject: kernel/audit_tree.c:audit_add_tree_rule(): protect `rule' from kill_rules() audit_add_tree_rule() must set 'rule->tree = NULL;' firstly, to protect the rule itself freed in kill_rules(). The reason is when it is killed, the 'rule' itself may have already released, we should not access it. one example: we add a rule to an inode, just at the same time the other task is deleting this inode. The work flow for adding a rule: audit_receive() -> (need audit_cmd_mutex lock) audit_receive_skb() -> audit_receive_msg() -> audit_receive_filter() -> audit_add_rule() -> audit_add_tree_rule() -> (need audit_filter_mutex lock) ... unlock audit_filter_mutex get_tree() ... iterate_mounts() -> (iterate all related inodes) tag_mount() -> tag_trunk() -> create_trunk() -> (assume it is 1st rule) fsnotify_add_mark() -> fsnotify_add_inode_mark() -> (add mark to inode->i_fsnotify_marks) ... get_tree(); (each inode will get one) ... lock audit_filter_mutex The work flow for deleting an inode: __destroy_inode() -> fsnotify_inode_delete() -> __fsnotify_inode_delete() -> fsnotify_clear_marks_by_inode() -> (get mark from inode->i_fsnotify_marks) fsnotify_destroy_mark() -> fsnotify_destroy_mark_locked() -> audit_tree_freeing_mark() -> evict_chunk() -> ... tree->goner = 1 ... kill_rules() -> (assume current->audit_context == NULL) call_rcu() -> (rule->tree != NULL) audit_free_rule_rcu() -> audit_free_rule() ... audit_schedule_prune() -> (assume current->audit_context == NULL) kthread_run() -> (need audit_cmd_mutex and audit_filter_mutex lock) prune_one() -> (delete it from prue_list) put_tree(); (match the original get_tree above) Signed-off-by: Chen Gang Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton --- kernel/audit_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3f..43c307dc9453 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct vfsmount *mnt; int err; + rule->tree = NULL; list_for_each_entry(tree, &tree_list, list) { if (!strcmp(seed->pathname, tree->pathname)) { put_tree(seed); -- cgit v1.2.3 From 80df565b465290f331462541127805761d16ae2e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:36:54 +1000 Subject: sound/soc/codecs/si476x.c: don't use 0bNNN spacr64 gcc-3.4.5 (at least) spits this back. Cc: Andrey Smirnov Cc: Mark Brown Cc: Takashi Iwai Signed-off-by: Andrew Morton --- sound/soc/codecs/si476x.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/si476x.c b/sound/soc/codecs/si476x.c index 721587c9cd84..73e205c892a0 100644 --- a/sound/soc/codecs/si476x.c +++ b/sound/soc/codecs/si476x.c @@ -38,9 +38,9 @@ enum si476x_digital_io_output_format { SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT = 8, }; -#define SI476X_DIGITAL_IO_OUTPUT_WIDTH_MASK ((0b111 << SI476X_DIGITAL_IO_SLOT_SIZE_SHIFT) | \ - (0b111 << SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT)) -#define SI476X_DIGITAL_IO_OUTPUT_FORMAT_MASK (0b1111110) +#define SI476X_DIGITAL_IO_OUTPUT_WIDTH_MASK ((0x7 << SI476X_DIGITAL_IO_SLOT_SIZE_SHIFT) | \ + (0x7 << SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT)) +#define SI476X_DIGITAL_IO_OUTPUT_FORMAT_MASK (0x7e) enum si476x_daudio_formats { SI476X_DAUDIO_MODE_I2S = (0x0 << 1), -- cgit v1.2.3 From 9543398ef82d7819aa5947b626d4e699896f7cab Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Thu, 23 May 2013 10:36:55 +1000 Subject: x86: make 'mem=' option to work for efi platform Current mem boot option only can work for non efi environment. If the user specifies add_efi_memmap, it cannot work for efi environment. In the efi environment, we call e820_add_region() to add the memory map. So we can modify __e820_add_region() and the mem boot option can work for efi environment. Note: Only E820_RAM is limited, and BOOT_SERVICES_{CODE,DATA} are always mapped(If its address >= mem_limit, the memory won't be freed in efi_free_boot_services()). Signed-off-by: Wen Congyang Cc: Matt Fleming Cc: Rob Landley Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Bjorn Helgaas Cc: "H. Peter Anvin" Cc: Yasuaki ISIMATU Cc: KOSAKI Motohiro Cc: Matthew Garrett Signed-off-by: Andrew Morton --- arch/x86/include/asm/e820.h | 2 ++ arch/x86/kernel/e820.c | 72 +++++++++++++++++++++++++++++++++++++++------ arch/x86/platform/efi/efi.c | 15 ++++++++-- 3 files changed, 77 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index cccd07fa5e3a..b8e9224f0b45 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -17,6 +17,8 @@ extern unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern int e820_all_mapped(u64 start, u64 end, unsigned type); extern void e820_add_region(u64 start, u64 size, int type); +extern void e820_add_limit_region(u64 start, u64 size, int type); +extern void e820_adjust_region(u64 *start, u64 *size); extern void e820_print_map(char *who); extern int sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d32abeabbda5..0d5bb689649a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe; #ifdef CONFIG_PCI EXPORT_SYMBOL(pci_mem_start); #endif +static u64 mem_limit = ~0ULL; /* * This function checks if any part of the range is mapped @@ -108,7 +109,7 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) * Add a memory region to the kernel e820 map. */ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, - int type) + int type, bool limited) { int x = e820x->nr_map; @@ -119,6 +120,22 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, return; } + if (limited) { + if (start >= mem_limit) { + printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long)start, + (unsigned long long)(start + size - 1)); + return; + } + + if (mem_limit - start < size) { + printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long)mem_limit, + (unsigned long long)(start + size - 1)); + size = mem_limit - start; + } + } + e820x->map[x].addr = start; e820x->map[x].size = size; e820x->map[x].type = type; @@ -127,7 +144,37 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(&e820, start, size, type, false); +} + +/* + * do_add_efi_memmap() calls this function(). + * + * Note: BOOT_SERVICES_{CODE,DATA} regions on some efi machines are marked + * as E820_RAM, and they are needed to be mapped. Please use e820_add_region() + * to add BOOT_SERVICES_{CODE,DATA} regions. + */ +void __init e820_add_limit_region(u64 start, u64 size, int type) +{ + /* + * efi_init() is called after finish_e820_parsing(), so we should + * check whether [start, start + size) contains address above + * mem_limit if the type is E820_RAM. + */ + __e820_add_region(&e820, start, size, type, type == E820_RAM); +} + +void __init e820_adjust_region(u64 *start, u64 *size) +{ + if (*start >= mem_limit) { + *size = 0; + return; + } + + if (mem_limit - *start < *size) + *size = mem_limit - *start; + + return; } static void __init e820_print_type(u32 type) @@ -455,8 +502,9 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, /* new range is totally covered? */ if (ei->addr < start && ei_end > end) { - __e820_add_region(e820x, start, size, new_type); - __e820_add_region(e820x, end, ei_end - end, ei->type); + __e820_add_region(e820x, start, size, new_type, false); + __e820_add_region(e820x, end, ei_end - end, ei->type, + false); ei->size = start - ei->addr; real_updated_size += size; continue; @@ -469,7 +517,7 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, continue; __e820_add_region(e820x, final_start, final_end - final_start, - new_type); + new_type, false); real_updated_size += final_end - final_start; @@ -809,7 +857,7 @@ static int userdef __initdata; /* "mem=nopentium" disables the 4MB page tables. */ static int __init parse_memopt(char *p) { - u64 mem_size; + char *oldp; if (!p) return -EINVAL; @@ -825,11 +873,11 @@ static int __init parse_memopt(char *p) } userdef = 1; - mem_size = memparse(p, &p); + oldp = p; + mem_limit = memparse(p, &p); /* don't remove all of memory when handling "mem={invalid}" param */ - if (mem_size == 0) + if (mem_limit == 0 || p == oldp) return -EINVAL; - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; } @@ -895,6 +943,12 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { + if (mem_limit != ~0ULL) { + userdef = 1; + e820_remove_range(mem_limit, ULLONG_MAX - mem_limit, + E820_RAM, 1); + } + if (userdef) { u32 nr = e820.nr_map; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 82089d8b1954..e7c70c2eaa27 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -419,10 +419,17 @@ static void __init do_add_efi_memmap(void) int e820_type; switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: + /* EFI_BOOT_SERVICES_{CODE,DATA} needs to be mapped */ + if (md->attribute & EFI_MEMORY_WB) + e820_type = E820_RAM; + else + e820_type = E820_RESERVED; + e820_add_region(start, size, e820_type); + continue; + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: case EFI_CONVENTIONAL_MEMORY: if (md->attribute & EFI_MEMORY_WB) e820_type = E820_RAM; @@ -447,7 +454,7 @@ static void __init do_add_efi_memmap(void) e820_type = E820_RESERVED; break; } - e820_add_region(start, size, e820_type); + e820_add_limit_region(start, size, e820_type); } sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } @@ -555,6 +562,8 @@ void __init efi_free_boot_services(void) md->type != EFI_BOOT_SERVICES_DATA) continue; + e820_adjust_region(&start, &size); + /* Could not reserve boot area */ if (!size) continue; -- cgit v1.2.3 From 1f7c27ee536b992694778499336fced920040342 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 23 May 2013 10:36:55 +1000 Subject: audit: fix mq_open and mq_unlink to add the MQ root as a hidden parent audit_names record The old audit PATH records for mq_open looked like this: type=PATH msg=audit(1366282323.982:869): item=1 name=(null) inode=6777 dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282323.982:869): item=0 name="test_mq" inode=26732 dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023 ...with the audit related changes that went into 3.7, they now look like this: type=PATH msg=audit(1366282236.776:3606): item=2 name=(null) inode=66655 dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282236.776:3606): item=1 name=(null) inode=6926 dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282236.776:3606): item=0 name="test_mq" Both of these look wrong to me. As Steve Grubb pointed out: "What we need is 1 PATH record that identifies the MQ. The other PATH records probably should not be there." Fix it to record the mq root as a parent, and flag it such that it should be hidden from view when the names are logged, since the root of the mq filesystem isn't terribly interesting. With this change, we get a single PATH record that looks more like this: type=PATH msg=audit(1368021604.836:484): item=0 name="test_mq" inode=16914 dev=00:0c mode=0100644 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmpfs_t:s0 In order to do this, a new audit_inode_parent_hidden() function is added. If we do it this way, then we avoid having the existing callers of audit_inode needing to do any sort of flag conversion if auditing is inactive. Signed-off-by: Jeff Layton Reported-by: Jiri Jaburek Cc: Steve Grubb Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton --- include/linux/audit.h | 26 ++++++++++++++++++++++---- ipc/mqueue.c | 2 ++ kernel/audit.h | 1 + kernel/auditsc.c | 12 +++++++++--- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index b20b03852f21..729a4d165bcc 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -103,8 +103,11 @@ extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); extern void audit_putname(struct filename *name); + +#define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ +#define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ extern void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent); + unsigned int flags); extern void __audit_inode_child(const struct inode *parent, const struct dentry *dentry, const unsigned char type); @@ -148,10 +151,22 @@ static inline void audit_getname(struct filename *name) if (unlikely(!audit_dummy_context())) __audit_getname(name); } -static inline void audit_inode(struct filename *name, const struct dentry *dentry, +static inline void audit_inode(struct filename *name, + const struct dentry *dentry, unsigned int parent) { + if (unlikely(!audit_dummy_context())) { + unsigned int flags = 0; + if (parent) + flags |= AUDIT_INODE_PARENT; + __audit_inode(name, dentry, flags); + } +} +static inline void audit_inode_parent_hidden(struct filename *name, + const struct dentry *dentry) +{ if (unlikely(!audit_dummy_context())) - __audit_inode(name, dentry, parent); + __audit_inode(name, dentry, + AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(const struct inode *parent, const struct dentry *dentry, @@ -311,7 +326,7 @@ static inline void audit_putname(struct filename *name) { } static inline void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { } static inline void __audit_inode_child(const struct inode *parent, const struct dentry *dentry, @@ -321,6 +336,9 @@ static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int parent) { } +static inline void audit_inode_parent_hidden(struct filename *name, + const struct dentry *dentry) +{ } static inline void audit_inode_child(const struct inode *parent, const struct dentry *dentry, const unsigned char type) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index e4e47f647446..ae1996d3c539 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -823,6 +823,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, error = ro; goto out; } + audit_inode_parent_hidden(name, root); filp = do_create(ipc_ns, root->d_inode, &path, oflag, mode, u_attr ? &attr : NULL); @@ -868,6 +869,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) if (IS_ERR(name)) return PTR_ERR(name); + audit_inode_parent_hidden(name, mnt->mnt_root); err = mnt_want_write(mnt); if (err) goto out_name; diff --git a/kernel/audit.h b/kernel/audit.h index 1c95131ef760..123c9b7c3979 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -85,6 +85,7 @@ struct audit_names { struct filename *name; int name_len; /* number of chars to log */ + bool hidden; /* don't log this record */ bool name_put; /* call __putname()? */ unsigned long ino; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3c8a601324a2..9845cb32b60a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } i = 0; - list_for_each_entry(n, &context->names_list, list) + list_for_each_entry(n, &context->names_list, list) { + if (n->hidden) + continue; audit_log_name(context, n, NULL, i++, &call_panic); + } /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name) * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * @parent: does this dentry represent the parent? + * @flags: attributes for this particular entry */ void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; struct audit_names *n; + bool parent = flags & AUDIT_INODE_PARENT; if (!context->in_syscall) return; @@ -1831,6 +1835,8 @@ out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; + if (flags & AUDIT_INODE_HIDDEN) + n->hidden = true; } else { n->name_len = AUDIT_NAME_FULL; n->type = AUDIT_TYPE_NORMAL; -- cgit v1.2.3 From b7c82457c27fb5103e0c43d8e0c470ed419e6153 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 23 May 2013 10:36:55 +1000 Subject: kernel/auditfilter.c: fix leak in audit_add_rule() error path If both 'tree' and 'watch' are valid we must call audit_put_tree(), just like the preceding code within audit_add_rule(). Signed-off-by: Chen Gang Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton --- kernel/auditfilter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6bd4a90d1991..ea8550fb7585 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry) err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); + /* + * normally audit_add_tree_rule() will free it + * on failure + */ + if (tree) + audit_put_tree(tree); goto error; } } -- cgit v1.2.3 From d750207c649f4d72a747d036b89a091740cece44 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 23 May 2013 10:36:56 +1000 Subject: drm/fb-helper: don't sleep for screen unblank when an oops is in progress Otherwise the system will burn even brighter and worse, leave the user wondering what's going on exactly. Since we already have a panic handler which will (try) to restore the entire fbdev console mode, we can just bail out. Inspired by a patch from Konstantin Khlebnikov. The callchain leading to this, cut&pasted from Konstantin's original patch: callstack: panic() bust_spinlocks(1) unblank_screen() vc->vc_sw->con_blank() fbcon_blank() fb_blank() info->fbops->fb_blank() drm_fb_helper_blank() drm_fb_helper_dpms() drm_modeset_lock_all() mutex_lock(&dev->mode_config.mutex) Note that the entire locking in the fb helper around panic/sysrq and kdbg is ... non-existant. So we have a decent change of blowing up everything. But since reworking this ties in with funny concepts like the fbdev notifier chain or the impressive things which happen around console_lock while oopsing, I'll leave that as an exercise for braver souls than me. Signed-off-by: Daniel Vetter Cc: Konstantin Khlebnikov Cc: Dave Airlie Reviewed-by: Rob Clark Signed-off-by: Andrew Morton --- drivers/gpu/drm/drm_fb_helper.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index b78cbe74dadf..442a15443fa5 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -390,6 +390,14 @@ static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode) struct drm_connector *connector; int i, j; + /* + * fbdev->blank can be called from irq context in case of a panic. + * Since we already have our own special panic handler which will + * restore the fbdev console mode completely, just bail out early. + */ + if (oops_in_progress) + return; + /* * fbdev->blank can be called from irq context in case of a panic. * Since we already have our own special panic handler which will -- cgit v1.2.3 From de1bc4dff48c3372d1b371855ded3e5396c18270 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:36:56 +1000 Subject: drivers/video/smscufx.c: use NULL instead of 0 'info' is a pointer. Use NULL instead of 0. Signed-off-by: Sachin Kamat Acked-by: Steve Glendinning Signed-off-by: Andrew Morton --- drivers/video/smscufx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/smscufx.c b/drivers/video/smscufx.c index b2b33fc1ac3f..e188ada2ffd1 100644 --- a/drivers/video/smscufx.c +++ b/drivers/video/smscufx.c @@ -1622,7 +1622,7 @@ static int ufx_usb_probe(struct usb_interface *interface, { struct usb_device *usbdev; struct ufx_data *dev; - struct fb_info *info = 0; + struct fb_info *info = NULL; int retval = -ENOMEM; u32 id_rev, fpga_rev; -- cgit v1.2.3 From c1afb2a2c74cf0c84933ac24dd51b1d3ee5931d8 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:36:57 +1000 Subject: drivers/video/udlfb.c: use NULL instead of 0 Pointer variables should be initialized with NULL instead of 0. Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton --- drivers/video/udlfb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c index ec03e726c940..20353a67fa56 100644 --- a/drivers/video/udlfb.c +++ b/drivers/video/udlfb.c @@ -434,10 +434,10 @@ static void dlfb_compress_hline( while ((pixel_end > pixel) && (cmd_buffer_end - MIN_RLX_CMD_BYTES > cmd)) { - uint8_t *raw_pixels_count_byte = 0; - uint8_t *cmd_pixels_count_byte = 0; - const uint16_t *raw_pixel_start = 0; - const uint16_t *cmd_pixel_start, *cmd_pixel_end = 0; + uint8_t *raw_pixels_count_byte = NULL; + uint8_t *cmd_pixels_count_byte = NULL; + const uint16_t *raw_pixel_start = NULL; + const uint16_t *cmd_pixel_start, *cmd_pixel_end = NULL; prefetchw((void *) cmd); /* pull in one cache line at least */ @@ -1588,7 +1588,7 @@ static int dlfb_usb_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *usbdev; - struct dlfb_data *dev = 0; + struct dlfb_data *dev = NULL; int retval = -ENOMEM; /* usb initialization */ -- cgit v1.2.3 From 6237ef453ec9d1fa297547121ec71df79813ad41 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:36:57 +1000 Subject: drivers/video/udlfb.c: make local symbol static 'dlfb_handle_damage' is used only in this file. Make it static. Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton --- drivers/video/udlfb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c index 20353a67fa56..d2e5bc3cf969 100644 --- a/drivers/video/udlfb.c +++ b/drivers/video/udlfb.c @@ -573,7 +573,7 @@ static int dlfb_render_hline(struct dlfb_data *dev, struct urb **urb_ptr, return 0; } -int dlfb_handle_damage(struct dlfb_data *dev, int x, int y, +static int dlfb_handle_damage(struct dlfb_data *dev, int x, int y, int width, int height, char *data) { int i, ret; -- cgit v1.2.3 From 5293bdf74f98b90e5cecd46a70e7650246580c6e Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:36:57 +1000 Subject: drivers/video/imxfb.c: make local symbols static These symbols are used only in this file. Make them static. Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton --- drivers/video/imxfb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/imxfb.c b/drivers/video/imxfb.c index a2ed71dfd449..9a1929a8bbb4 100644 --- a/drivers/video/imxfb.c +++ b/drivers/video/imxfb.c @@ -1083,7 +1083,7 @@ static int imxfb_remove(struct platform_device *pdev) return 0; } -void imxfb_shutdown(struct platform_device * dev) +static void imxfb_shutdown(struct platform_device *dev) { struct fb_info *info = platform_get_drvdata(dev); struct imxfb_info *fbi = info->par; @@ -1123,7 +1123,7 @@ static int imxfb_setup(void) return 0; } -int __init imxfb_init(void) +static int __init imxfb_init(void) { int ret = imxfb_setup(); -- cgit v1.2.3 From 1d68f7e8e7b46f719d255f5d71b86e328b8ff6a6 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Thu, 23 May 2013 10:36:58 +1000 Subject: cyber2000fb: avoid palette corruption at higher clocks When 1280x1024@75Hz mode is set, console palette is not set properly - sometimes the background is white, sometimes yellow and text colors are also messed up. This does not happen at 1280x1024@60Hz and below. It seems that the HW needs some time before setting the palette - maybe the PLL needs more time to lock at higher speeds. This patch fixes the problem but without knowing what register to check for PLL lock(?), the delay might be excessive. On Fri, 28 Jan 2011 18:15:37 +0000 Russell King wrote: > On Tue, Jan 18, 2011 at 01:14:24PM -0800, Andrew Morton wrote: > > Russell, I have an (old) note here that this is awaiting an ack from > > yourself? > > Well, I can reproduce this problem on the Netwinders here. I'm not sure > that we should delay all mode switches by one second - and any attempt > to reduce this value does result in the palette not being set correctly. > > For 1280x1024-75, the dotclock is 135MHz, which gives a PLL values of > 0x41 and 0x06. That's: M=0x41+1, N=0x06+1, P=0x00 (top 2 bits of 0x06) > -> Q=1 > > Fpll = 14.31818MHz * M / N > Fout = Fpll / Q > > The PLL itself is formed by dividing the 14-ish MHz frequency by N and > phase comparing the output of the VCO, divided by M, and adjusting the > VCO until the two correlate. As VCOs typically tend to have a limited > range, it's normal to divide the output frequency to produce a greater > range - and in this case that's done by Q. > > For the 800x600-100 copied from /etc/fb.modes, this has a dotclock of > 67.5MHz, which is exactly half this rate. The PLL values for this are: > M=0x41+1, N=0x06+1, P=0x01, giving PLL values of 0x41 and 0x46. > > Booting with 800x600-100 does not suffer the problem. So it's not > related to PLL lock time. There's something else going on. > > Another experiment I tried was forcing the PLL values to produce 108MHz > instead of 135MHz. 108MHz is the dotclock for 1280x1024-60. This too > doesn't suffer the problem. > > I've also tried chosing other delay values. 100ms is too short and > produces the problem, but 1s works. 1s for a PLL to lock is a hell of > a time, especially for a PLL operating in the MHz range. > > I've tried setting the PLL to a known good freqency, and then switching > to 135MHz - the problem persists. It's not like 135MHz is reaching the > limits - it'll go up to 206MHz. > > So, I don't think this has anything to do with PLL locking. I think > there's something else going on which isn't immediately obvious - maybe > bandwidth starvation preventing us from writing properly to the palette? > As it's a horrible VGA, where you write the same register multiple times > I wouldn't be surprised if some writes were going missing. > > I'll see if I can play around with it some more this evening, but I've > spent an awful long time on just this issue already this afternoon... > > I think further investigation needs to happen on this patch before it's > acceptable. Or maybe we should prevent the cyberpro coming up in Signed-off-by: Ondrej Zary Cc: Russell King Signed-off-by: Andrew Morton --- drivers/video/cyber2000fb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/video/cyber2000fb.c b/drivers/video/cyber2000fb.c index 57886787ead0..e78d9f2233b8 100644 --- a/drivers/video/cyber2000fb.c +++ b/drivers/video/cyber2000fb.c @@ -518,6 +518,9 @@ static void cyber2000fb_set_timing(struct cfb_info *cfb, struct par_info *hw) cyber2000_grphw(0xb9, 0x00, cfb); spin_unlock(&cfb->reg_b0_lock); + /* wait (for the PLL?) to avoid palette corruption at higher clocks */ + msleep(1000); + cfb->ramdac_ctrl = hw->ramdac; cyber2000fb_write_ramdac_ctrl(cfb); -- cgit v1.2.3 From a41961b3833d2d04d7fd09b4a56d7362917f699d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 May 2013 10:36:58 +1000 Subject: posix_cpu_timer: consolidate expiry time type The posix cpu timer expiry time is stored in a union of two types: a 64 bits field if we rely on scheduler precise accounting, or a cputime_t if we rely on jiffies. This results in quite some duplicate code and special cases to handle the two types. Just unify this into a single 64 bits field. cputime_t can always fit into it. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- include/linux/posix-timers.h | 16 ++- kernel/posix-cpu-timers.c | 266 +++++++++++++++++-------------------------- 2 files changed, 117 insertions(+), 165 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 7794d75ed155..907f3fd191ac 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -7,14 +7,20 @@ #include #include -union cpu_time_count { - cputime_t cpu; - unsigned long long sched; -}; + +static inline unsigned long long cputime_to_expires(cputime_t expires) +{ + return (__force unsigned long long)expires; +} + +static inline cputime_t expires_to_cputime(unsigned long long expires) +{ + return (__force cputime_t)expires; +} struct cpu_timer_list { struct list_head entry; - union cpu_time_count expires, incr; + unsigned long long expires, incr; struct task_struct *task; int firing; }; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 42670e9b44e0..c3c4ea1225a4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock) return error; } -static inline union cpu_time_count +static inline unsigned long long timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) { - union cpu_time_count ret; - ret.sched = 0; /* high half always zero when .cpu used */ + unsigned long long ret; + + ret = 0; /* high half always zero when .cpu used */ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; } else { - ret.cpu = timespec_to_cputime(tp); + ret = cputime_to_expires(timespec_to_cputime(tp)); } return ret; } static void sample_to_timespec(const clockid_t which_clock, - union cpu_time_count cpu, + unsigned long long expires, struct timespec *tp) { if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(cpu.sched); + *tp = ns_to_timespec(expires); else - cputime_to_timespec(cpu.cpu, tp); -} - -static inline int cpu_time_before(const clockid_t which_clock, - union cpu_time_count now, - union cpu_time_count then) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - return now.sched < then.sched; - } else { - return now.cpu < then.cpu; - } -} -static inline void cpu_time_add(const clockid_t which_clock, - union cpu_time_count *acc, - union cpu_time_count val) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - acc->sched += val.sched; - } else { - acc->cpu += val.cpu; - } -} -static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, - union cpu_time_count a, - union cpu_time_count b) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - a.sched -= b.sched; - } else { - a.cpu -= b.cpu; - } - return a; + cputime_to_timespec((__force cputime_t)expires, tp); } /* @@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, * given the current clock sample. */ static void bump_cpu_timer(struct k_itimer *timer, - union cpu_time_count now) + unsigned long long now) { int i; + unsigned long long delta, incr; - if (timer->it.cpu.incr.sched == 0) + if (timer->it.cpu.incr == 0) return; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - unsigned long long delta, incr; + if (now < timer->it.cpu.expires) + return; - if (now.sched < timer->it.cpu.expires.sched) - return; - incr = timer->it.cpu.incr.sched; - delta = now.sched + incr - timer->it.cpu.expires.sched; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr = incr << 1; - for (; i >= 0; incr >>= 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.sched += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } - } else { - cputime_t delta, incr; + incr = timer->it.cpu.incr; + delta = now + incr - timer->it.cpu.expires; - if (now.cpu < timer->it.cpu.expires.cpu) - return; - incr = timer->it.cpu.incr.cpu; - delta = now.cpu + incr - timer->it.cpu.expires.cpu; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr += incr; - for (; i >= 0; incr = incr >> 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.cpu += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.expires += incr; + timer->it_overrun += 1 << i; + delta -= incr; } } @@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) return 0; } -static inline cputime_t prof_ticks(struct task_struct *p) +static inline unsigned long long prof_ticks(struct task_struct *p) { cputime_t utime, stime; task_cputime(p, &utime, &stime); - return utime + stime; + return cputime_to_expires(utime + stime); } -static inline cputime_t virt_ticks(struct task_struct *p) +static inline unsigned long long virt_ticks(struct task_struct *p) { cputime_t utime; task_cputime(p, &utime, NULL); - return utime; + return cputime_to_expires(utime); } static int @@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) * Sample a per-thread clock for the given task. */ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = prof_ticks(p); + *sample = prof_ticks(p); break; case CPUCLOCK_VIRT: - cpu->cpu = virt_ticks(p); + *sample = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = task_sched_runtime(p); + *sample = task_sched_runtime(p); break; } return 0; @@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: thread_group_cputime(p, &cputime); - cpu->sched = cputime.sum_exec_runtime; + *sample = cputime.sum_exec_runtime; break; } return 0; @@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; - union cpu_time_count rtn; + unsigned long long rtn; if (pid == 0) { /* @@ -461,30 +414,30 @@ static void cleanup_timers(struct list_head *head, list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.cpu < ptime) { - timer->expires.cpu = 0; + if (timer->expires < cputime_to_expires(ptime)) { + timer->expires = 0; } else { - timer->expires.cpu -= ptime; + timer->expires -= cputime_to_expires(ptime); } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.cpu < utime) { - timer->expires.cpu = 0; + if (timer->expires < cputime_to_expires(utime)) { + timer->expires = 0; } else { - timer->expires.cpu -= utime; + timer->expires -= cputime_to_expires(utime); } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { - timer->expires.sched = 0; + if (timer->expires < sum_exec_runtime) { + timer->expires = 0; } else { - timer->expires.sched -= sum_exec_runtime; + timer->expires -= sum_exec_runtime; } } } @@ -516,7 +469,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +static void clear_dead_task(struct k_itimer *timer, unsigned long long now) { /* * That's all for this thread or process. @@ -524,9 +477,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) */ put_task_struct(timer->it.cpu.task); timer->it.cpu.task = NULL; - timer->it.cpu.expires = cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, - now); + timer->it.cpu.expires -= now; } static inline int expires_gt(cputime_t expires, cputime_t new_exp) @@ -558,14 +509,14 @@ static void arm_timer(struct k_itimer *timer) listpos = head; list_for_each_entry(next, head, entry) { - if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + if (nt->expires < next->expires) break; listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { - union cpu_time_count *exp = &nt->expires; + unsigned long long exp = nt->expires; /* * We are the new earliest-expiring POSIX 1.b timer, hence @@ -576,17 +527,17 @@ static void arm_timer(struct k_itimer *timer) switch (CPUCLOCK_WHICH(timer->it_clock)) { case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp->cpu)) - cputime_expires->prof_exp = exp->cpu; + if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) + cputime_expires->prof_exp = expires_to_cputime(exp); break; case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp->cpu)) - cputime_expires->virt_exp = exp->cpu; + if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) + cputime_expires->virt_exp = expires_to_cputime(exp); break; case CPUCLOCK_SCHED: if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp->sched) - cputime_expires->sched_exp = exp->sched; + cputime_expires->sched_exp > exp) + cputime_expires->sched_exp = exp; break; } } @@ -601,20 +552,20 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * User don't want any signal. */ - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. */ wake_up_process(timer->it_process); - timer->it.cpu.expires.sched = 0; - } else if (timer->it.cpu.incr.sched == 0) { + timer->it.cpu.expires = 0; + } else if (timer->it.cpu.incr == 0) { /* * One-shot timer. Clear it as soon as it's fired. */ posix_timer_event(timer, 0); - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { /* * The signal did not get queued because the signal @@ -632,7 +583,7 @@ static void cpu_timer_fire(struct k_itimer *timer) */ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -641,13 +592,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -694,7 +645,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, old_incr, val; + unsigned long long old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -749,7 +700,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, } if (old) { - if (old_expires.sched == 0) { + if (old_expires == 0) { old->it_value.tv_sec = 0; old->it_value.tv_nsec = 0; } else { @@ -764,11 +715,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * new setting. */ bump_cpu_timer(timer, val); - if (cpu_time_before(timer->it_clock, val, - timer->it.cpu.expires)) { - old_expires = cpu_time_sub( - timer->it_clock, - timer->it.cpu.expires, val); + if (val < timer->it.cpu.expires) { + old_expires = timer->it.cpu.expires - val; sample_to_timespec(timer->it_clock, old_expires, &old->it_value); @@ -791,8 +739,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, goto out; } - if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { - cpu_time_add(timer->it_clock, &new_expires, val); + if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { + new_expires += val; } /* @@ -801,8 +749,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * arm the timer (we'll just fake it for timer_gettime). */ timer->it.cpu.expires = new_expires; - if (new_expires.sched != 0 && - cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && val < new_expires) { arm_timer(timer); } @@ -826,8 +773,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires.sched != 0 && - !cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && !(val < new_expires)) { /* * The designated time already passed, so we notify * immediately, even if the thread never runs to @@ -849,7 +795,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { - union cpu_time_count now; + unsigned long long now; struct task_struct *p = timer->it.cpu.task; int clear_dead; @@ -859,7 +805,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) sample_to_timespec(timer->it_clock, timer->it.cpu.incr, &itp->it_interval); - if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ + if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; return; } @@ -891,7 +837,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) */ put_task_struct(p); timer->it.cpu.task = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; read_unlock(&tasklist_lock); goto dead; } else { @@ -912,10 +858,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) goto dead; } - if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { + if (now < timer->it.cpu.expires) { sample_to_timespec(timer->it_clock, - cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, now), + timer->it.cpu.expires - now, &itp->it_value); } else { /* @@ -946,8 +891,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.prof_exp = t->expires.cpu; + if (!--maxfire || prof_ticks(tsk) < t->expires) { + tsk->cputime_expires.prof_exp = expires_to_cputime(t->expires); break; } t->firing = 1; @@ -961,8 +906,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.virt_exp = t->expires.cpu; + if (!--maxfire || virt_ticks(tsk) < t->expires) { + tsk->cputime_expires.virt_exp = expires_to_cputime(t->expires); break; } t->firing = 1; @@ -976,8 +921,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->cputime_expires.sched_exp = t->expires.sched; + if (!--maxfire || tsk->se.sum_exec_runtime < t->expires) { + tsk->cputime_expires.sched_exp = t->expires; break; } t->firing = 1; @@ -1030,7 +975,8 @@ static void stop_process_timers(struct signal_struct *sig) static u32 onecputick; static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - cputime_t *expires, cputime_t cur_time, int signo) + unsigned long long *expires, + unsigned long long cur_time, int signo) { if (!it->expires) return; @@ -1068,7 +1014,7 @@ static void check_process_timers(struct task_struct *tsk, { int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, ptime, virt_expires, prof_expires; + unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; @@ -1078,8 +1024,8 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. */ thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - ptime = utime + cputime.stime; + utime = cputime_to_expires(cputime.utime); + ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; prof_expires = 0; @@ -1087,8 +1033,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || ptime < tl->expires.cpu) { - prof_expires = tl->expires.cpu; + if (!--maxfire || ptime < tl->expires) { + prof_expires = tl->expires; break; } tl->firing = 1; @@ -1102,8 +1048,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || utime < tl->expires.cpu) { - virt_expires = tl->expires.cpu; + if (!--maxfire || utime < tl->expires) { + virt_expires = tl->expires; break; } tl->firing = 1; @@ -1117,8 +1063,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sum_sched_runtime < tl->expires.sched) { - sched_expires = tl->expires.sched; + if (!--maxfire || sum_sched_runtime < tl->expires) { + sched_expires = tl->expires; break; } tl->firing = 1; @@ -1162,8 +1108,8 @@ static void check_process_timers(struct task_struct *tsk, } } - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); + sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); @@ -1176,7 +1122,7 @@ static void check_process_timers(struct task_struct *tsk, void posix_cpu_timer_schedule(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count now; + unsigned long long now; if (unlikely(p == NULL)) /* @@ -1205,7 +1151,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ put_task_struct(p); timer->it.cpu.task = p = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* @@ -1387,7 +1333,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { - union cpu_time_count now; + unsigned long long now; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1399,17 +1345,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be absolute. */ if (*oldval) { - if (*oldval <= now.cpu) { + if (*oldval <= now) { /* Just about to fire. */ *oldval = cputime_one_jiffy; } else { - *oldval -= now.cpu; + *oldval -= now; } } if (!*newval) goto out; - *newval += now.cpu; + *newval += now; } /* @@ -1459,7 +1405,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } while (!signal_pending(current)) { - if (timer.it.cpu.expires.sched == 0) { + if (timer.it.cpu.expires == 0) { /* * Our timer fired and was reset, below * deletion can not fail. -- cgit v1.2.3 From f97e52a0cbdb16e763f4b681a4e335bdab88cd82 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 May 2013 10:36:58 +1000 Subject: posix_cpu_timers: consolidate timer list cleanups Cleaning up the posix cpu timers on task exit shares some common code among timer list types, most notably the list traversal and expiry time update. Unify this in a common helper. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 48 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c3c4ea1225a4..b1450cee6d6d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -399,6 +399,21 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } +static void cleanup_timers_list(struct list_head *head, + unsigned long long curr) +{ + struct cpu_timer_list *timer, *next; + + list_for_each_entry_safe(timer, next, head, entry) { + list_del_init(&timer->entry); + if (timer->expires < curr) { + timer->expires = 0; + } else { + timer->expires -= curr; + } + } +} + /* * Clean out CPU timers still ticking when a thread exited. The task * pointer is cleared, and the expiry time is replaced with the residual @@ -409,37 +424,12 @@ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, unsigned long long sum_exec_runtime) { - struct cpu_timer_list *timer, *next; - cputime_t ptime = utime + stime; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < cputime_to_expires(ptime)) { - timer->expires = 0; - } else { - timer->expires -= cputime_to_expires(ptime); - } - } - - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < cputime_to_expires(utime)) { - timer->expires = 0; - } else { - timer->expires -= cputime_to_expires(utime); - } - } + cputime_t ptime = utime + stime; - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < sum_exec_runtime) { - timer->expires = 0; - } else { - timer->expires -= sum_exec_runtime; - } - } + cleanup_timers_list(head, cputime_to_expires(ptime)); + cleanup_timers_list(++head, cputime_to_expires(utime)); + cleanup_timers_list(++head, sum_exec_runtime); } /* -- cgit v1.2.3 From d6700a99e715ac08d63711999ede51cd7a4fd2e7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 May 2013 10:36:59 +1000 Subject: posix_cpu_timers: consolidate expired timers check Consolidate the common code amongst per thread and per process timers list on tick time. List traversal, expiry check and subsequent updates can be shared in a common helper. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 118 +++++++++++++--------------------------------- 1 file changed, 33 insertions(+), 85 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index b1450cee6d6d..92a4fbf44f86 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -862,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } } +static unsigned long long +check_timers_list(struct list_head *timers, + struct list_head *firing, + unsigned long long curr) +{ + int maxfire = 20; + + while (!list_empty(timers)) { + struct cpu_timer_list *t; + + t = list_first_entry(timers, struct cpu_timer_list, entry); + + if (!--maxfire || curr < t->expires) + return t->expires; + + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -870,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + unsigned long long expires; unsigned long soft; - maxfire = 20; - tsk->cputime_expires.prof_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || prof_ticks(tsk) < t->expires) { - tsk->cputime_expires.prof_exp = expires_to_cputime(t->expires); - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(timers, firing, prof_ticks(tsk)); + tsk_expires->prof_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.virt_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || virt_ticks(tsk) < t->expires) { - tsk->cputime_expires.virt_exp = expires_to_cputime(t->expires); - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + tsk_expires->virt_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.sched_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires) { - tsk->cputime_expires.sched_exp = t->expires; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + tsk_expires->sched_exp = check_timers_list(++timers, firing, + tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. @@ -1002,7 +990,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct signal_struct *const sig = tsk->signal; unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; @@ -1017,49 +1004,10 @@ static void check_process_timers(struct task_struct *tsk, utime = cputime_to_expires(cputime.utime); ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; - maxfire = 20; - prof_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || ptime < tl->expires) { - prof_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - ++timers; - maxfire = 20; - virt_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || utime < tl->expires) { - virt_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - sched_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || sum_sched_runtime < tl->expires) { - sched_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } + prof_expires = check_timers_list(timers, firing, ptime); + virt_expires = check_timers_list(++timers, firing, utime); + sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); /* * Check for the special case process timers. -- cgit v1.2.3 From 3d6112b5ccf0553cfd6866fb96cc09af671621d3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 May 2013 10:36:59 +1000 Subject: selftests: add basic posix timers selftests Add some initial basic tests on a few posix timers interface such as setitimer() and timer_settime(). These simply check that expiration happens in a reasonable timeframe after expected elapsed clock time (user time, user + system time, real time, ...). This is helpful for finding basic breakages while hacking on this subsystem. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Andrew Morton --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/timers/Makefile | 8 + tools/testing/selftests/timers/posix_timers.c | 221 ++++++++++++++++++++++++++ 3 files changed, 230 insertions(+) create mode 100644 tools/testing/selftests/timers/Makefile create mode 100644 tools/testing/selftests/timers/posix_timers.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 0a63658065f0..4cb14cae3791 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -6,6 +6,7 @@ TARGETS += memory-hotplug TARGETS += mqueue TARGETS += net TARGETS += ptrace +TARGETS += timers TARGETS += vm all: diff --git a/tools/testing/selftests/timers/Makefile b/tools/testing/selftests/timers/Makefile new file mode 100644 index 000000000000..eb2859f4ad21 --- /dev/null +++ b/tools/testing/selftests/timers/Makefile @@ -0,0 +1,8 @@ +all: + gcc posix_timers.c -o posix_timers -lrt + +run_tests: all + ./posix_timers + +clean: + rm -f ./posix_timers diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c new file mode 100644 index 000000000000..4fa655d68a81 --- /dev/null +++ b/tools/testing/selftests/timers/posix_timers.c @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2013 Red Hat, Inc., Frederic Weisbecker + * + * Licensed under the terms of the GNU GPL License version 2 + * + * Selftests for a few posix timers interface. + * + * Kernel loop code stolen from Steven Rostedt + */ + +#include +#include +#include +#include +#include +#include + +#define DELAY 2 +#define USECS_PER_SEC 1000000 + +static volatile int done; + +/* Busy loop in userspace to elapse ITIMER_VIRTUAL */ +static void user_loop(void) +{ + while (!done); +} + +/* + * Try to spend as much time as possible in kernelspace + * to elapse ITIMER_PROF. + */ +static void kernel_loop(void) +{ + void *addr = sbrk(0); + + while (!done) { + brk(addr + 4096); + brk(addr); + } +} + +/* + * Sleep until ITIMER_REAL expiration. + */ +static void idle_loop(void) +{ + pause(); +} + +static void sig_handler(int nr) +{ + done = 1; +} + +/* + * Check the expected timer expiration matches the GTOD elapsed delta since + * we armed the timer. Keep a 0.5 sec error margin due to various jitter. + */ +static int check_diff(struct timeval start, struct timeval end) +{ + long long diff; + + diff = end.tv_usec - start.tv_usec; + diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC; + + if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { + printf("Diff too high: %lld..", diff); + return -1; + } + + return 0; +} + +static int check_itimer(int which) +{ + int err; + struct timeval start, end; + struct itimerval val = { + .it_value.tv_sec = DELAY, + }; + + printf("Check itimer "); + + if (which == ITIMER_VIRTUAL) + printf("virtual... "); + else if (which == ITIMER_PROF) + printf("prof... "); + else if (which == ITIMER_REAL) + printf("real... "); + + fflush(stdout); + + done = 0; + + if (which == ITIMER_VIRTUAL) + signal(SIGVTALRM, sig_handler); + else if (which == ITIMER_PROF) + signal(SIGPROF, sig_handler); + else if (which == ITIMER_REAL) + signal(SIGALRM, sig_handler); + + err = gettimeofday(&start, NULL); + if (err < 0) { + perror("Can't call gettimeofday()\n"); + return -1; + } + + err = setitimer(which, &val, NULL); + if (err < 0) { + perror("Can't set timer\n"); + return -1; + } + + if (which == ITIMER_VIRTUAL) + user_loop(); + else if (which == ITIMER_PROF) + kernel_loop(); + else if (which == ITIMER_REAL) + idle_loop(); + + gettimeofday(&end, NULL); + if (err < 0) { + perror("Can't call gettimeofday()\n"); + return -1; + } + + if (!check_diff(start, end)) + printf("[OK]\n"); + else + printf("[FAIL]\n"); + + return 0; +} + +static int check_timer_create(int which) +{ + int err; + timer_t id; + struct timeval start, end; + struct itimerspec val = { + .it_value.tv_sec = DELAY, + }; + + printf("Check timer_create() "); + if (which == CLOCK_THREAD_CPUTIME_ID) { + printf("per thread... "); + } else if (which == CLOCK_PROCESS_CPUTIME_ID) { + printf("per process... "); + } + fflush(stdout); + + done = 0; + timer_create(which, NULL, &id); + if (err < 0) { + perror("Can't create timer\n"); + return -1; + } + signal(SIGALRM, sig_handler); + + err = gettimeofday(&start, NULL); + if (err < 0) { + perror("Can't call gettimeofday()\n"); + return -1; + } + + err = timer_settime(id, 0, &val, NULL); + if (err < 0) { + perror("Can't set timer\n"); + return -1; + } + + user_loop(); + + gettimeofday(&end, NULL); + if (err < 0) { + perror("Can't call gettimeofday()\n"); + return -1; + } + + if (!check_diff(start, end)) + printf("[OK]\n"); + else + printf("[FAIL]\n"); + + return 0; +} + +int main(int argc, char **argv) +{ + int err; + + printf("Testing posix timers. False negative may happen on CPU execution \n"); + printf("based timers if other threads run on the CPU...\n"); + + if (check_itimer(ITIMER_VIRTUAL) < 0) + return -1; + + if (check_itimer(ITIMER_PROF) < 0) + return -1; + + if (check_itimer(ITIMER_REAL) < 0) + return -1; + + if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0) + return -1; + + /* + * It's unfortunately hard to reliably test a timer expiration + * on parallel multithread cputime. We could arm it to expire + * on DELAY * nr_threads, with nr_threads busy looping, then wait + * the normal DELAY since the time is elapsing nr_threads faster. + * But for that we need to ensure we have real physical free CPUs + * to ensure true parallelism. So test only one thread until we + * find a better solution. + */ + if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0) + return -1; + + return 0; +} -- cgit v1.2.3 From 664425336d2773ac83c4f3a09c4fa609e5abf684 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 May 2013 10:36:59 +1000 Subject: posix-timers: correctly get dying task time sample in posix_cpu_timer_schedule() In order to re-arm a timer after it fired, we take a sample of the current process or thread cputime. If the task is dying though, we don't arm anything but we cache the remaining timer expiration delta for further reads. Something similar is performed in posix_cpu_timer_get() but here we forget to take the process wide cputime sample before caching it. As a result we are storing random stack content, leading every further reads of that timer to return junk values. Fix this by taking the appropriate sample in the case of process wide timers. This probably doesn't matter much in practice because, at this stage, the thread is the last one in the group and we reached exit_notify(). This implies that we called exit_itimers() and there should be no more timers to handle for that task. So this is likely dead code anyway but let's fix the current logic and the warning that came along: kernel/posix-cpu-timers.c: In function 'posix_cpu_timer_schedule': kernel/posix-cpu-timers.c:1127: warning: 'now' may be used uninitialized in this function Then we can start to think further about cleaning up that code. Reported-by: Andrew Morton Reported-by: Chen Gang Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Chen Gang Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 92a4fbf44f86..4ebd8ad07c66 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1097,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * not yet reaped. Take this opportunity to * drop our task ref. */ + cpu_timer_sample_group(timer->it_clock, p, &now); clear_dead_task(timer, now); goto out_unlock; } -- cgit v1.2.3 From 4fdc50f9ce5289b633386a2025cf94fb42572baa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 May 2013 10:37:00 +1000 Subject: posix_timers: fix racy timer delta caching on task exit When a task exits, we perform a caching of the remaining cputime delta before expiring of its timers. This is done from the following places: * When the task is reaped. We iterate through its list of posix cpu timers and store the remaining timer delta to the timer struct instead of the absolute value. (See posix_cpu_timers_exit() / posix_cpu_timers_exit_group() ) * When we call posix_cpu_timer_get() or posix_cpu_timer_schedule(). If the timer's task is considered dying when watched from these places, the same conversion from absolute to relative expiry time is performed. Then the given task's reference is released. (See clear_dead_task() ). The relevance of this caching is questionable but this is another and deeper debate. The big issue here is that these two sources of caching don't mix up very well together. More specifically, the caching can easily be done twice, resulting in a wrong delta as it gets spuriously substracted a second time by the elapsed clock. This can happen in the following scenario: 1) The task exits and gets reaped: we call posix_cpu_timers_exit() and the absolute timer expiry values are converted to a relative delta. 2) timer_gettime() -> posix_cpu_timer_get() is called and relies on clear_dead_task() because tsk->exit_state == EXIT_DEAD. The delta gets substracted again by the elapsed clock and we return a wrong result. To fix this, just remove the caching done on task reaping time. It doesn't bring much value on its own. The caching done from posix_cpu_timer_get/schedule is enough. And it would also be hard to get it really right: we could make it put and clear the target task in the timer struct so that readers know if they are dealing with a relative cached of absolute value. But it would be racy. The only safe way to do it would be to lock the itimer->it_lock so that we know nobody reads the cputime expiry value while we modify it and its target task reference. Doing so would involve some funny workarounds to avoid circular lock against the sighand lock. There is just no reason to maintain this. The user visible effect of this patch can be observed by running the following code: it creates a subthread that launches a posix cputimer which expires after 10 seconds. But then the subthread only busy loops for 2 seconds and exits. The parent reaps the subthread and read the timer value. Its expected value should the be the initial timer's expiration value minus the cputime elapsed in the subthread. Roughly 10 - 2 = 8 seconds: #include #include #include #include #include static timer_t id; static struct itimerspec val = { .it_value.tv_sec = 10, }, new; static void *thread(void *unused) { int err; struct timeval start, end, diff; timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &id); if (err < 0) { perror("Can't create timer\n"); return NULL; } /* Arm 10 sec timer */ err = timer_settime(id, 0, &val, NULL); if (err < 0) { perror("Can't set timer\n"); return NULL; } /* Exit after 2 seconds of execution */ gettimeofday(&start, NULL); do { gettimeofday(&end, NULL); timersub(&end, &start, &diff); } while (diff.tv_sec < 2); return NULL; } int main(int argc, char **argv) { pthread_t pthread; int err; err = pthread_create(&pthread, NULL, thread, NULL); if (err) { perror("Can't create thread\n"); return -1; } pthread_join(pthread, NULL); /* Just wait a little bit to make sure the child got reaped */ sleep(1); err = timer_gettime(id, &new); if (err) perror("Can't get timer value\n"); printf("%d %ld\n", new.it_value.tv_sec, new.it_value.tv_nsec); return 0; } Before the patch: $ ./posix_cpu_timers 6 2278074 After the patch: $ ./posix_cpu_timers 8 1158766 Before the patch, the elapsed time got two more seconds spuriously accounted. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 4ebd8ad07c66..c7f31aa272f7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -404,14 +404,8 @@ static void cleanup_timers_list(struct list_head *head, { struct cpu_timer_list *timer, *next; - list_for_each_entry_safe(timer, next, head, entry) { + list_for_each_entry_safe(timer, next, head, entry) list_del_init(&timer->entry); - if (timer->expires < curr) { - timer->expires = 0; - } else { - timer->expires -= curr; - } - } } /* @@ -459,15 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, unsigned long long now) +static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) { + struct cpu_timer_list *timer = &itimer->it.cpu; + /* * That's all for this thread or process. * We leave our residual in expires to be reported. */ - put_task_struct(timer->it.cpu.task); - timer->it.cpu.task = NULL; - timer->it.cpu.expires -= now; + put_task_struct(timer->task); + timer->task = NULL; + if (timer->expires < now) { + timer->expires = 0; + } else { + timer->expires -= now; + } } static inline int expires_gt(cputime_t expires, cputime_t new_exp) -- cgit v1.2.3 From 902314a517e50d80c0004a990bb46b75fa8ae8e5 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Thu, 23 May 2013 10:37:00 +1000 Subject: drivers/infiniband/core/cm.c: convert to using idr_alloc_cyclic() commit 3e6628c4b347 ("idr: introduce idr_alloc_cyclic()") adds a new idr_alloc_cyclic routine and converts several of these users to it. This is just a missed one - add it. Signed-off-by: Zhao Hongjiang Cc: Roland Dreier Cc: Tejun Heo Cc: Sean Hefty Signed-off-by: Andrew Morton --- drivers/infiniband/core/cm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 784b97cb05b0..f2ef7ef0f36f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -383,14 +383,11 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv) { unsigned long flags; int id; - static int next_id; idr_preload(GFP_KERNEL); spin_lock_irqsave(&cm.lock, flags); - id = idr_alloc(&cm.local_id_table, cm_id_priv, next_id, 0, GFP_NOWAIT); - if (id >= 0) - next_id = max(id + 1, 0); + id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); spin_unlock_irqrestore(&cm.lock, flags); idr_preload_end(); -- cgit v1.2.3 From bb91a23b08f8b3e98769175165fc098e5709c8bf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 May 2013 10:37:01 +1000 Subject: configfs: use capped length for ->store_attribute() The difference between "count" and "len" is that "len" is capped at 4095. Changing it like this makes it match how sysfs_write_file() is implemented. This is a static analysis patch. I haven't found any store_attribute() functions where this change makes a difference. Signed-off-by: Dan Carpenter Acked-by: Joel Becker Signed-off-by: Andrew Morton --- fs/configfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 2b6cb23dd14e..1d1c41f1014d 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof mutex_lock(&buffer->mutex); len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(file->f_path.dentry, buffer, count); + len = flush_write_buffer(file->f_path.dentry, buffer, len); if (len > 0) *ppos += len; mutex_unlock(&buffer->mutex); -- cgit v1.2.3 From a1db5634519819b26ad0465f6e10031c8ab848e9 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Thu, 23 May 2013 10:37:01 +1000 Subject: ipvs: change type of netns_ipvs->sysctl_sync_qlen_max This member of struct netns_ipvs is calculated from nr_free_buffer_pages so change its type to unsigned long in case of overflow. Also, type of its related proc var sync_qlen_max and the return type of function sysctl_sync_qlen_max() should be changed to unsigned long, too. Besides, the type of ipvs_master_sync_state->sync_queue_len should be changed to unsigned long accordingly. Signed-off-by: Zhang Yanfei Cc: Simon Horman Cc: Julian Anastasov Cc: David Miller Signed-off-by: Andrew Morton --- include/net/ip_vs.h | 8 ++++---- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4c062ccff9aa..4405886980c7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -905,7 +905,7 @@ struct ip_vs_app { struct ipvs_master_sync_state { struct list_head sync_queue; struct ip_vs_sync_buff *sync_buff; - int sync_queue_len; + unsigned long sync_queue_len; unsigned int sync_queue_delay; struct task_struct *master_thread; struct delayed_work master_wakeup_work; @@ -998,7 +998,7 @@ struct netns_ipvs { int sysctl_snat_reroute; int sysctl_sync_ver; int sysctl_sync_ports; - int sysctl_sync_qlen_max; + unsigned long sysctl_sync_qlen_max; int sysctl_sync_sock_size; int sysctl_cache_bypass; int sysctl_expire_nodest_conn; @@ -1085,7 +1085,7 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) return ACCESS_ONCE(ipvs->sysctl_sync_ports); } -static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs) +static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_qlen_max; } @@ -1138,7 +1138,7 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) return 1; } -static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs) +static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return IPVS_SYNC_QLEN_MAX; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7c3ed429789e..df05c1c276f0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1716,9 +1716,9 @@ static struct ctl_table vs_vars[] = { }, { .procname = "sync_qlen_max", - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "sync_sock_size", -- cgit v1.2.3 From ab3534c8c36654332c63ef6115860380bbe31cf0 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 23 May 2013 10:37:01 +1000 Subject: fs/ocfs2/namei.c: remove unecessary ERROR when removing non-empty directory While removing a non-empty directory, the kernel dumps a message: (rmdir,21743,1):ocfs2_unlink:953 ERROR: status = -39 Suppress the error message from being printed in the dmesg so users don't panic. Signed-off-by: Goldwyn Rodrigues Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 04ee1b57c243..33c7b91c777b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -947,7 +947,7 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status) + if (status && (status != -ENOTEMPTY)) mlog_errno(status); return status; -- cgit v1.2.3 From 803daf75dfdb6f7db1fcb4e497068bb920e23d63 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 May 2013 10:37:02 +1000 Subject: softirq: use _RET_IP_ Use the already defined macro to pass the function return address. Signed-off-by: Davidlohr Bueso Cc: Frederic Weisbecker Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/softirq.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index b5197dcb0dad..a5f88362589b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) void local_bh_disable(void) { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_DISABLE_OFFSET); + __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(local_bh_disable); @@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt) WARN_ON_ONCE(!irqs_disabled()); if (softirq_count() == cnt) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); + trace_softirqs_on(_RET_IP_); sub_preempt_count(cnt); } @@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) void local_bh_enable(void) { - _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + _local_bh_enable_ip(_RET_IP_); } EXPORT_SYMBOL(local_bh_enable); @@ -223,8 +222,7 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_irq_enter_time(current); - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); + __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); -- cgit v1.2.3 From 73ca60ebf1ecbde875e9c3fb3bf159248bc7929b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 23 May 2013 10:37:02 +1000 Subject: lockdep: introduce lock_acquire_exclusive/shared helper macros In lockdep.h, the spinlock/mutex/rwsem/rwlock/lock_map acquire macros have different definitions based on the value of CONFIG_PROVE_LOCKING. We have separate ifdefs for each of these definitions, which seems redundant. Introduce lock_acquire_{exclusive,shared,shared_recursive} helpers which will have different definitions based on CONFIG_PROVE_LOCKING. Then all other helper macros can be defined based on the above ones, which reduces the amount of ifdefined code. Signed-off-by: Michel Lespinasse Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Lai Jiangshan Cc: "Srivatsa S. Bhat" Cc: Rusty Russell Cc: Andi Kleen Cc: "Paul E. McKenney" Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/lockdep.h | 92 +++++++++++++------------------------------------ 1 file changed, 23 insertions(+), 69 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f1e877b79ed8..cfc2f119779a 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -365,7 +365,7 @@ extern void lockdep_trace_alloc(gfp_t mask); #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) -#else /* !LOCKDEP */ +#else /* !CONFIG_LOCKDEP */ static inline void lockdep_off(void) { @@ -479,82 +479,36 @@ static inline void print_irqtrace_events(struct task_struct *curr) * on the per lock-class debug mode: */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# else -# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# endif -# define spin_release(l, n, i) lock_release(l, n, i) +#ifdef CONFIG_PROVE_LOCKING + #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) + #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 2, n, i) + #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 2, n, i) #else -# define spin_acquire(l, s, t, i) do { } while (0) -# define spin_release(l, n, i) do { } while (0) + #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) + #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) + #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 2, NULL, i) -# else -# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 1, NULL, i) -# endif -# define rwlock_release(l, n, i) lock_release(l, n, i) -#else -# define rwlock_acquire(l, s, t, i) do { } while (0) -# define rwlock_acquire_read(l, s, t, i) do { } while (0) -# define rwlock_release(l, n, i) do { } while (0) -#endif +#define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define spin_release(l, n, i) lock_release(l, n, i) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# else -# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) -# endif -# define mutex_release(l, n, i) lock_release(l, n, i) -#else -# define mutex_acquire(l, s, t, i) do { } while (0) -# define mutex_acquire_nest(l, s, t, n, i) do { } while (0) -# define mutex_release(l, n, i) do { } while (0) -#endif +#define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) +#define rwlock_release(l, n, i) lock_release(l, n, i) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, NULL, i) -# else -# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) -# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, NULL, i) -# endif +#define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define mutex_release(l, n, i) lock_release(l, n, i) + +#define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) # define rwsem_release(l, n, i) lock_release(l, n, i) -#else -# define rwsem_acquire(l, s, t, i) do { } while (0) -# define rwsem_acquire_nest(l, s, t, n, i) do { } while (0) -# define rwsem_acquire_read(l, s, t, i) do { } while (0) -# define rwsem_release(l, n, i) do { } while (0) -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_) -# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 2, NULL, _THIS_IP_) -# else -# define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 1, NULL, _THIS_IP_) -# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 1, NULL, _THIS_IP_) -# endif +#define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) +#define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) # define lock_map_release(l) lock_release(l, 1, _THIS_IP_) -#else -# define lock_map_acquire(l) do { } while (0) -# define lock_map_acquire_read(l) do { } while (0) -# define lock_map_release(l) do { } while (0) -#endif #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ -- cgit v1.2.3 From 5a429027b4c58406f1d7f102c8089eca17c16e2a Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 23 May 2013 10:37:03 +1000 Subject: lglock: update lockdep annotations to report recursive local locks Oleg Nesterov recently noticed that the lockdep annotations in lglock.c are not sufficient to detect some obvious deadlocks, such as lg_local_lock(LOCK) + lg_local_lock(LOCK) or spin_lock(X) + lg_local_lock(Y) vs lg_local_lock(Y) + spin_lock(X). Both issues are easily fixed by indicating to lockdep that lglock's local locks are not recursive. We shouldn't use the rwlock acquire/release functions here, as lglock doesn't share the same semantics. Instead we can base our lockdep annotations on the lock_acquire_shared (for local lglock) and lock_acquire_exclusive (for global lglock) helpers. I am not proposing new lglock specific helpers as I don't see the point of the existing second level of helpers :) Signed-off-by: Michel Lespinasse Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Lai Jiangshan Cc: "Srivatsa S. Bhat" Cc: Rusty Russell Cc: Andi Kleen Cc: "Paul E. McKenney" Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/lglock.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a667a5a7..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c @@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_lock(lock); } @@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_unlock(lock); preempt_enable(); @@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_lock(lock); } @@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_unlock(lock); preempt_enable(); @@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg) int i; preempt_disable(); - rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); @@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg) { int i; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); -- cgit v1.2.3 From bcc50cd1abeaa1e9c3573b835d7d6ff175e63558 Mon Sep 17 00:00:00 2001 From: Libo Chen Date: Thu, 23 May 2013 10:37:03 +1000 Subject: drivers/cdrom/gdrom.c: fix device number leak Without this patch, gdrom_major will leak when gd.cd_info alloc fails. Signed-off-by: Libo Chen Cc: Jens Axboe Cc: Tejun Heo Signed-off-by: Andrew Morton --- drivers/cdrom/gdrom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 4afcb65cc623..5980cb9af857 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -830,9 +830,9 @@ probe_fail_cdrom_register: del_gendisk(gd.disk); probe_fail_no_disk: kfree(gd.cd_info); +probe_fail_no_mem: unregister_blkdev(gdrom_major, GDROM_DEV_NAME); gdrom_major = 0; -probe_fail_no_mem: pr_warning("Probe failed - error is 0x%X\n", err); return err; } -- cgit v1.2.3 From 0a237c458721feb40874d2c137a7c1e0779d2c33 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 23 May 2013 10:37:03 +1000 Subject: block/compat_ioctl.c: do not leak info to user-space There is a hole in struct hd_geometry, so we have to zero the struct on stack before copying it to user-space. Signed-off-by: Cong Wang Cc: Jens Axboe Signed-off-by: Andrew Morton --- block/compat_ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 7c668c8a6f95..7e5d474dc6ba 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -59,6 +59,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, if (!disk->fops->getgeo) return -ENOTTY; + memset(&geo, 0, sizeof(geo)); /* * We need to set the startsect first, the driver may * want to override it. -- cgit v1.2.3 From 90fc129eb507a5d46b787a5527ee645b0eee0558 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Thu, 23 May 2013 10:37:04 +1000 Subject: block: restore /proc/partitions to not display non-partitionable removable devices We found with newer kernels we started seeing the cdrom device showing up in /proc/partitions, but it was not there before. Looking into this I found that commit d27769ec ("block: add GENHD_FL_NO_PART_SCAN") introduces this change in behavior. It's not clear to me from the commit's changelog if this change was intentional or not. This comment still remains: /* Don't show non-partitionable removeable devices or empty devices */ so I've decided to send a patch to restore the behavior of not printing unpartitionable removable devices. Signed-off-by: Josh Hunt Cc: Tejun Heo Cc: Kay Sievers Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index e9094b375c05..2e1cfe31f080 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -849,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) -- cgit v1.2.3 From 028eea4c9a80c712b9d1f04bab784030b59e1496 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 23 May 2013 10:37:05 +1000 Subject: watchdog: trigger all-cpu backtrace when locked up and going to panic Send an NMI to all CPUs when a lockup is detected and the lockup watchdog code is configured to panic. This gives us a fairly uptodate snapshot of all CPUs in the system. This lets us get stack trace of all CPUs which makes life easier trying to debug a deadlock, and the NMI doesn't change anything since the next step is a kernel panic. Signed-off-by: Sasha Levin Cc: Ingo Molnar Signed-off-by: Andrew Morton --- kernel/watchdog.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 05039e348f07..ea741c32d596 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -239,10 +239,12 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) + if (hardlockup_panic) { + trigger_all_cpu_backtrace(); panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - else + } else { WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + } __this_cpu_write(hard_watchdog_warn, true); return; @@ -323,8 +325,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); - if (softlockup_panic) + if (softlockup_panic) { + trigger_all_cpu_backtrace(); panic("softlockup: hung tasks"); + } __this_cpu_write(soft_watchdog_warn, true); } else __this_cpu_write(soft_watchdog_warn, false); -- cgit v1.2.3 From d86e4b0c6cf6bb0617534eb36ca6e0e2d924924f Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 23 May 2013 10:37:05 +1000 Subject: clear_refs: sanitize accepted commands declaration This is the implementation of the soft-dirty bit concept that should help keep track of changes in user memory, which in turn is very-very required by the checkpoint-restore project (http://criu.org). To create a dump of an application(s) we save all the information about it to files, and the biggest part of such dump is the contents of tasks' memory. However, there are usage scenarios where it's not required to get _all_ the task memory while creating a dump. For example, when doing periodical dumps, it's only required to take full memory dump only at the first step and then take incremental changes of memory. Another example is live migration. We copy all the memory to the destination node without stopping all tasks, then stop them, check for what pages has changed, dump it and the rest of the state, then copy it to the destination node. This decreases freeze time significantly. That said, some help from kernel to watch how processes modify the contents of their memory is required. The proposal is to track changes with the help of new soft-dirty bit this way: 1. First do "echo 4 > /proc/$pid/clear_refs". At that point kernel clears the soft dirty _and_ the writable bits from all ptes of process $pid. From now on every write to any page will result in #pf and the subsequent call to pte_mkdirty/pmd_mkdirty, which in turn will set the soft dirty flag. 2. Then read the /proc/$pid/pagemap2 and check the soft-dirty bit reported there (the 55'th one). If set, the respective pte was written to since last call to clear refs. The soft-dirty bit is the _PAGE_BIT_HIDDEN one. Although it's used by kmemcheck, the latter one marks kernel pages with it, while the former bit is put on user pages so they do not conflict to each other. This patch: A new clear-refs type will be added in the next patch, so prepare code for that. [akpm@linux-foundation.org: don't assume that sizeof(enum clear_refs_types) == sizeof(int)] Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3e636d864d56..dad0809db551 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -688,6 +688,13 @@ const struct file_operations proc_tid_smaps_operations = { .release = seq_release_private, }; +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_LAST, +}; + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -719,10 +726,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -#define CLEAR_REFS_ALL 1 -#define CLEAR_REFS_ANON 2 -#define CLEAR_REFS_MAPPED 3 - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -730,7 +733,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - int type; + enum clear_refs_types type; + int itype; int rv; memset(buffer, 0, sizeof(buffer)); @@ -738,10 +742,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &type); + rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; - if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) -- cgit v1.2.3 From bf7d4ef630ace704759a1343cfa36337acaf368e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 23 May 2013 10:37:06 +1000 Subject: clear_refs: introduce private struct for mm_walk In the next patch the clear-refs-type will be required in clear_refs_pte_range funciton, so prepare the walk->private to carry this info. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dad0809db551..ef6f6c62dfee 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -695,10 +695,15 @@ enum clear_refs_types { CLEAR_REFS_LAST, }; +struct clear_refs_private { + struct vm_area_struct *vma; +}; + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = cp->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -753,13 +758,16 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + struct clear_refs_private cp = { + }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, .mm = mm, + .private = &cp, }; down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { - clear_refs_walk.private = vma; + cp.vma = vma; if (is_vm_hugetlb_page(vma)) continue; /* -- cgit v1.2.3 From 2c79a7a64cce4d3b60f27efb70bc32b38d11f14a Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 23 May 2013 10:37:06 +1000 Subject: pagemap: introduce pagemap_entry_t without pmshift bits These bits are always constant (== PAGE_SHIFT) and just occupy space in the entry. Moreover, in next patch we will need to report one more bit in the pagemap, but all bits are already busy on it. That said, describe the pagemap entry that has 6 more free zero bits. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 50 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ef6f6c62dfee..41a2af429af4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -807,6 +807,7 @@ typedef struct { struct pagemapread { int pos, len; pagemap_entry_t *buffer; + bool v2; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -820,14 +821,16 @@ struct pagemapread { #define PM_PSHIFT_BITS 6 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +/* in pagemap2 pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 val) @@ -850,7 +853,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); for (addr = start; addr < end; addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); @@ -860,7 +863,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame, flags; @@ -879,18 +882,18 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; - *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, 0) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pmd_t pmd, int offset) { /* @@ -900,12 +903,12 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pmd_t pmd, int offset) { } @@ -918,7 +921,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); @@ -928,7 +931,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -945,7 +948,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT); + pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* check that 'vma' actually covers this address, @@ -953,7 +956,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pte_to_pagemap_entry(&pme, vma, addr, *pte); + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); /* unmap before userspace copy */ pte_unmap(pte); } @@ -968,14 +971,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pte_t pte, int offset) { if (pte_present(pte)) *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* This function walks within one hugetlb entry in the single call */ @@ -989,7 +992,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); err = add_to_pagemap(addr, &pme, pm); if (err) return err; @@ -1025,8 +1028,8 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ -static ssize_t pagemap_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t do_pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, bool v2) { struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; @@ -1051,6 +1054,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; + pm.v2 = v2; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; @@ -1123,6 +1127,12 @@ out: return ret; } +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, false); +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, -- cgit v1.2.3 From 25461823ad484c62cb7cb8dce2d996d44e5789e2 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 23 May 2013 10:37:06 +1000 Subject: pagemap-introduce-pagemap_entry_t-without-pmshift-bits-v4 Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 41a2af429af4..39d641292579 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -824,7 +824,7 @@ struct pagemapread { #define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) -/* in pagemap2 pshift bits are occupied with more status bits */ +/* in "new" pagemap pshift bits are occupied with more status bits */ #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) #define PM_PRESENT PM_STATUS(4LL) @@ -1028,8 +1028,8 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ -static ssize_t do_pagemap_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos, bool v2) +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; @@ -1054,7 +1054,7 @@ static ssize_t do_pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; - pm.v2 = v2; + pm.v2 = false; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; @@ -1127,12 +1127,6 @@ out: return ret; } -static ssize_t pagemap_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - return do_pagemap_read(file, buf, count, ppos, false); -} - const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, -- cgit v1.2.3 From 4abc666420b18fcb82bfc92239ca8d0ded092636 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 23 May 2013 10:37:07 +1000 Subject: mm: soft-dirty bits for user memory changes tracking The soft-dirty is a bit on a PTE which helps to track which pages a task writes to. In order to do this tracking one should 1. Clear soft-dirty bits from PTEs ("echo 4 > /proc/PID/clear_refs) 2. Wait some time. 3. Read soft-dirty bits (55'th in /proc/PID/pagemap2 entries) To do this tracking, the writable bit is cleared from PTEs when the soft-dirty bit is. Thus, after this, when the task tries to modify a page at some virtual address the #PF occurs and the kernel sets the soft-dirty bit on the respective PTE. Note, that although all the task's address space is marked as r/o after the soft-dirty bits clear, the #PF-s that occur after that are processed fast. This is so, since the pages are still mapped to physical memory, and thus all the kernel does is finds this fact out and puts back writable, dirty and soft-dirty bits on the PTE. Another thing to note, is that when mremap moves PTEs they are marked with soft-dirty as well, since from the user perspective mremap modifies the virtual memory at mremap's new address. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.txt | 7 +++++- Documentation/vm/soft-dirty.txt | 36 +++++++++++++++++++++++++++++++ arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable.h | 24 +++++++++++++++++++-- arch/x86/include/asm/pgtable_types.h | 12 +++++++++++ fs/proc/task_mmu.c | 42 +++++++++++++++++++++++++++++++----- include/asm-generic/pgtable.h | 22 +++++++++++++++++++ mm/Kconfig | 12 +++++++++++ mm/huge_memory.c | 2 +- mm/mremap.c | 2 +- 11 files changed, 153 insertions(+), 10 deletions(-) create mode 100644 Documentation/vm/soft-dirty.txt diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fd8d0d594fc7..fcc22c982a25 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -473,7 +473,8 @@ This file is only present if the CONFIG_MMU kernel configuration option is enabled. The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG -bits on both physical and virtual pages associated with a process. +bits on both physical and virtual pages associated with a process, and the +soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details). To clear the bits for all the pages associated with the process > echo 1 > /proc/PID/clear_refs @@ -482,6 +483,10 @@ To clear the bits for the anonymous pages associated with the process To clear the bits for the file mapped pages associated with the process > echo 3 > /proc/PID/clear_refs + +To clear the soft-dirty bit + > echo 4 > /proc/PID/clear_refs + Any other value written to /proc/PID/clear_refs will have no effect. The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt new file mode 100644 index 000000000000..9a12a5956bc0 --- /dev/null +++ b/Documentation/vm/soft-dirty.txt @@ -0,0 +1,36 @@ + SOFT-DIRTY PTEs + + The soft-dirty is a bit on a PTE which helps to track which pages a task +writes to. In order to do this tracking one should + + 1. Clear soft-dirty bits from the task's PTEs. + + This is done by writing "4" into the /proc/PID/clear_refs file of the + task in question. + + 2. Wait some time. + + 3. Read soft-dirty bits from the PTEs. + + This is done by reading from the /proc/PID/pagemap. The bit 55 of the + 64-bit qword is the soft-dirty one. If set, the respective PTE was + written to since step 1. + + + Internally, to do this tracking, the writable bit is cleared from PTEs +when the soft-dirty bit is cleared. So, after this, when the task tries to +modify a page at some virtual address the #PF occurs and the kernel sets +the soft-dirty bit on the respective PTE. + + Note, that although all the task's address space is marked as r/o after the +soft-dirty bits clear, the #PF-s that occur after that are processed fast. +This is so, since the pages are still mapped to physical memory, and thus all +the kernel does is finds this fact out and puts both writable and soft-dirty +bits on the PTE. + + + This feature is actively used by the checkpoint-restore project. You +can find more details about it on http://criu.org + + +-- Pavel Emelyanov, Apr 9, 2013 diff --git a/arch/Kconfig b/arch/Kconfig index a4429bcd609e..8d2ae24b9f4a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -365,6 +365,9 @@ config HAVE_IRQ_TIME_ACCOUNTING config HAVE_ARCH_TRANSPARENT_HUGEPAGE bool +config HAVE_ARCH_SOFT_DIRTY + bool + config HAVE_MOD_ARCH_SPECIFIC bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ae917f3965f1..f2eccdd64a92 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -102,6 +102,7 @@ config X86 select HAVE_ARCH_SECCOMP_FILTER select BUILDTIME_EXTABLE_SORT select GENERIC_CMOS_UPDATE + select HAVE_ARCH_SOFT_DIRTY select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS select ARCH_CLOCKSOURCE_DATA if X86_64 diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1e672234c4ff..ebf937362479 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -207,7 +207,7 @@ static inline pte_t pte_mkexec(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY); + return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) @@ -271,7 +271,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd) static inline pmd_t pmd_mkdirty(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DIRTY); + return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_mkhuge(pmd_t pmd) @@ -294,6 +294,26 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT); } +static inline int pte_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + +static inline int pmd_soft_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; +} + +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index e6423002c10b..c98ac63aae48 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -55,6 +55,18 @@ #define _PAGE_HIDDEN (_AT(pteval_t, 0)) #endif +/* + * The same hidden bit is used by kmemcheck, but since kmemcheck + * works on kernel pages while soft-dirty engine on user space, + * they do not conflict with each other. + */ + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 39d641292579..9238acbc0a10 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -692,13 +692,32 @@ enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, CLEAR_REFS_LAST, }; struct clear_refs_private { struct vm_area_struct *vma; + enum clear_refs_types type; }; +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/vm/soft-dirty.txt for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + set_pte_at(vma->vm_mm, addr, pte, ptent); +#endif +} + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -718,6 +737,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(ptent)) continue; + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -759,6 +783,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, mm = get_task_mm(task); if (mm) { struct clear_refs_private cp = { + .type = type, }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, @@ -827,6 +852,7 @@ struct pagemapread { /* in "new" pagemap pshift bits are occupied with more status bits */ #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) +#define __PM_SOFT_DIRTY (1LL) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) @@ -868,6 +894,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, { u64 frame, flags; struct page *page = NULL; + int flags2 = 0; if (pte_present(pte)) { frame = pte_pfn(pte); @@ -888,13 +915,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (page && !PageAnon(page)) flags |= PM_FILE; + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, 0) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset) + pmd_t pmd, int offset, int pmd_flags2) { /* * Currently pmd for thp is always present because thp can not be @@ -903,13 +932,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_STATUS2(pm->v2, 0) | PM_PRESENT); + | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset) + pmd_t pmd, int offset, int pmd_flags2) { } #endif @@ -926,12 +955,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + int pmd_flags2; + + pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a59ff51b0166..347fb7bfcbe2 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -396,6 +396,28 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #define arch_start_context_switch(prev) do {} while (0) #endif +#ifndef CONFIG_HAVE_ARCH_SOFT_DIRTY +static inline int pte_soft_dirty(pte_t pte) +{ + return 0; +} + +static inline int pmd_soft_dirty(pmd_t pmd) +{ + return 0; +} + +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) +{ + return pmd; +} +#endif + #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of diff --git a/mm/Kconfig b/mm/Kconfig index f5e698e30d4a..7e28ecfa8aa4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -477,3 +477,15 @@ config FRONTSWAP and swap data is stored as normal on the matching swap device. If unsure, say Y to enable frontswap. + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 362c329b83fe..d8b3b850150c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1429,7 +1429,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, if (ret == 1) { pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); spin_unlock(&mm->page_table_lock); } out: diff --git a/mm/mremap.c b/mm/mremap.c index 463a25705ac6..3708655378e9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); - set_pte_at(mm, new_addr, new_pte, pte); + set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte)); } arch_leave_lazy_mmu_mode(); -- cgit v1.2.3 From 1649f7839bbaddb527960afc2ee2b390c8c29b15 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 23 May 2013 10:37:07 +1000 Subject: soft-dirty: call mmu notifiers when write-protecting ptes As noticed by Xiao, since soft-dirty clear command modifies page tables we have to flush tlbs and call mmu notifiers. While the former is done by the clear_refs engine itself, the latter is to be done. One thing to note about this -- in order not to call per-page invalidate notifier (_all_ address space is about to be changed), the _invalidate_range_start and _end are used. But for those start and end are not known exactly. To address this, the same trick as in exit_mmap() is used -- start is 0 and end is (unsigned long)-1. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9238acbc0a10..a18e065c1c3e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -791,6 +792,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .private = &cp, }; down_read(&mm->mmap_sem); + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_start(mm, 0, -1); for (vma = mm->mmap; vma; vma = vma->vm_next) { cp.vma = vma; if (is_vm_hugetlb_page(vma)) @@ -811,6 +814,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); -- cgit v1.2.3 From d99caa0ead24ccbec41b6626e99e0d4916d51851 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 23 May 2013 10:37:07 +1000 Subject: pagemap: prepare to reuse constant bits with page-shift In order to reuse bits from pagemap entries gracefully, we leave the entries as is but on pagemap open emit a warning in dmesg, that bits 55-60 are about to change in a couple of releases. Next, if a user issues soft-dirty clear command via the clear_refs file (it was disabled before v3.9) we assume that he's aware of the new pagemap format, note that fact and report the bits in pagemap in the new manner. The "migration strategy" looks like this then: 1. existing users are not affected -- they don't touch soft-dirty feature, thus see old bits in pagemap, but are warned and have time to fix themselves 2. those who use soft-dirty know about new pagemap format 3. some time soon we get rid of any signs of page-shift in pagemap as well as this trick with clear-soft-dirty affecting pagemap format. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- Documentation/vm/pagemap.txt | 3 ++- fs/proc/task_mmu.c | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 5ef3dd38bb64..5948e455c4d2 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -15,7 +15,8 @@ There are three components to pagemap: * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bits 55-60 page shift (page size = 1<= CLEAR_REFS_LAST) return -EINVAL; + + if (type == CLEAR_REFS_SOFT_DIRTY) { + soft_dirty_cleared = true; + pr_warn_once("The pagemap bits 55-60 has changed their meaning! " + "See the linux/Documentation/vm/pagemap.txt for details.\n"); + } + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -1091,7 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; - pm.v2 = false; + pm.v2 = soft_dirty_cleared; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; @@ -1164,9 +1188,18 @@ out: return ret; } +static int pagemap_open(struct inode *inode, struct file *file) +{ + pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " + "to stop being page-shift some time soon. See the " + "linux/Documentation/vm/pagemap.txt for details.\n"); + return 0; +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, + .open = pagemap_open, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ -- cgit v1.2.3 From 2ad8b640372dcaf3ff4f641059294cd99d40ef30 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 23 May 2013 10:37:08 +1000 Subject: mm/thp: use the correct function when updating access flags We should use pmdp_set_access_flags to update access flags. Archs like powerpc use extra checks(_PAGE_BUSY) when updating a hugepage PTE. A set_pmd_at doesn't do those checks. We should use set_pmd_at only when updating a none hugepage PTE. Signed-off-by: Aneesh Kumar K.V Cc: Andrea Arcangeli a Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d8b3b850150c..4347eb3a00ff 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, * young bit, instead of the current set_pmd_at. */ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { -- cgit v1.2.3 From 3b1ab9898b81fc2dc29cf47697394a607b4e5832 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 23 May 2013 10:37:08 +1000 Subject: mm, memcg: don't take task_lock in task_in_mem_cgroup For processes that have detached their mm's, task_in_mem_cgroup() unnecessarily takes task_lock() when rcu_read_lock() is all that is necessary to call mem_cgroup_from_task(). While we're here, switch task_in_mem_cgroup() to return bool. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 9 +++++---- mm/memcontrol.c | 11 ++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d6183f06d8c1..7b4d9d79570b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -77,7 +77,8 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page); bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, struct mem_cgroup *memcg); -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); +bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -273,10 +274,10 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return true; } -static inline int task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg) +static inline bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) { - return 1; + return true; } static inline struct cgroup_subsys_state diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 010d6c14129a..caff46388129 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1450,11 +1450,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, return ret; } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) { - int ret; struct mem_cgroup *curr = NULL; struct task_struct *p; + bool ret; p = find_lock_task_mm(task); if (p) { @@ -1466,14 +1467,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) * killer still needs to detect if they have already been oom * killed to prevent needlessly killing additional tasks. */ - task_lock(task); + rcu_read_lock(); curr = mem_cgroup_from_task(task); if (curr) css_get(&curr->css); - task_unlock(task); + rcu_read_unlock(); } if (!curr) - return 0; + return false; /* * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is -- cgit v1.2.3 From c2bceb66b283b629093b8323251dae3c926e7ec0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 23 May 2013 10:37:08 +1000 Subject: mm: remove free_area_cache Since all architectures have been converted to use vm_unmapped_area(), there is no remaining use for the free_area_cache. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: "James E.J. Bottomley" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: David Howells Cc: Helge Deller Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Paul Mackerras Cc: Richard Henderson Signed-off-by: Andrew Morton --- arch/arm/mm/mmap.c | 2 -- arch/arm64/mm/mmap.c | 2 -- arch/mips/mm/mmap.c | 2 -- arch/powerpc/mm/mmap_64.c | 2 -- arch/s390/mm/mmap.c | 4 ---- arch/sparc/kernel/sys_sparc_64.c | 2 -- arch/tile/mm/mmap.c | 2 -- arch/x86/ia32/ia32_aout.c | 2 -- arch/x86/mm/mmap.c | 2 -- fs/binfmt_aout.c | 2 -- fs/binfmt_elf.c | 2 -- include/linux/mm_types.h | 3 --- include/linux/sched.h | 2 -- kernel/fork.c | 4 ---- mm/mmap.c | 28 ---------------------------- mm/nommu.c | 4 ---- mm/util.c | 1 - 17 files changed, 66 deletions(-) diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 10062ceadd1c..0c6356255fe3 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -181,11 +181,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 7c7be7855638..8ed6cb1a900f 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -90,11 +90,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } EXPORT_SYMBOL_GPL(arch_pick_mmap_layout); diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 7e5fe2790d8a..f1baadd56e82 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -158,11 +158,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap_64.c index 67a42ed0d2fc..cb8bdbe4972f 100644 --- a/arch/powerpc/mm/mmap_64.c +++ b/arch/powerpc/mm/mmap_64.c @@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 06bafec00278..40023290ee5b 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -91,11 +91,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } @@ -176,11 +174,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = s390_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = s390_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 2daaaa6eda23..51561b8b15ba 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -290,7 +290,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) sysctl_legacy_va_layout) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { /* We know it's 32-bit */ unsigned long task_size = STACK_TOP32; @@ -302,7 +301,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c index f96f4cec602a..d67d91ebf63e 100644 --- a/arch/tile/mm/mmap.c +++ b/arch/tile/mm/mmap.c @@ -66,10 +66,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(mm); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 805078e08013..fbeba82c703b 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -308,8 +308,6 @@ static int load_aout_binary(struct linux_binprm *bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = TASK_UNMAPPED_BASE; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 845df6835f9f..62c29a5bfe26 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -115,10 +115,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = mmap_legacy_base(); mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index bce87694f7b0..89dec7f789a4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8a0b0efda44..100edcc5e312 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -738,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Do this so that we can load the interpreter, if need be. We will change some of these later */ - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ace9a5f01c64..fb425aa16c01 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -330,12 +330,9 @@ struct mm_struct { unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); - void (*unmap_area) (struct mm_struct *mm, unsigned long addr); #endif unsigned long mmap_base; /* base of mmap area */ unsigned long task_size; /* size of task vm space */ - unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ - unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 178a8d909f14..b3612503ca4a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -322,8 +322,6 @@ extern unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -extern void arch_unmap_area(struct mm_struct *, unsigned long); -extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif diff --git a/kernel/fork.c b/kernel/fork.c index 987b28a1f01b..6fcc2e24561e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; - mm->free_area_cache = oldmm->mmap_base; - mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->nr_ptes = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); diff --git a/mm/mmap.c b/mm/mmap.c index ec75bb0681f9..c0f0558f7057 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1876,15 +1876,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, } #endif -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the lowest possible address? - */ - if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) - mm->free_area_cache = addr; -} - /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): @@ -1941,19 +1932,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, } #endif -void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the highest possible address? - */ - if (addr > mm->free_area_cache) - mm->free_area_cache = addr; - - /* dont allow allocations above current base */ - if (mm->free_area_cache > mm->mmap_base) - mm->free_area_cache = mm->mmap_base; -} - unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -2374,7 +2352,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; - unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; @@ -2391,11 +2368,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - if (mm->unmap_area == arch_unmap_area) - addr = prev ? prev->vm_end : mm->mmap_base; - else - addr = vma ? vma->vm_start : mm->mmap_base; - mm->unmap_area(mm, addr); mm->mmap_cache = NULL; /* Kill the cache. */ } diff --git a/mm/nommu.c b/mm/nommu.c index 298884dcd6e7..886e07ce4d1d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1869,10 +1869,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) diff --git a/mm/util.c b/mm/util.c index ab1424dbe2e6..7441c41d00f6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } #endif -- cgit v1.2.3 From 19a81da1311bde27c013da210b44cfcbf925e224 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 23 May 2013 10:37:09 +1000 Subject: mm: remove compressed copy from zram in-memory Swap subsystem does lazy swap slot free with expecting the page would be swapped out again so we can avoid unnecessary write. But the problem in in-memory swap(ex, zram) is that it consumes memory space until vm_swap_full(ie, used half of all of swap device) condition meet. It could be bad if we use multiple swap device, small in-memory swap and big storage swap or in-memory swap alone. This patch makes swap subsystem free swap slot as soon as swap-read is completed and make the swapcache page dirty so the page should be written out the swap device to reclaim it. It means we never lose it. I tested this patch with kernel compile workload. 1. before compile time : 9882.42 zram max wasted space by fragmentation: 13471881 byte memory space consumed by zram: 174227456 byte the number of slot free notify: 206684 2. after compile time : 9653.90 zram max wasted space by fragmentation: 11805932 byte memory space consumed by zram: 154001408 byte the number of slot free notify: 426972 * changelog from v3 * Rebased on next-20130508 * changelog from v1 * Add more comment Signed-off-by: Dan Magenheimer Signed-off-by: Minchan Kim Cc: Hugh Dickins Cc: Seth Jennings Cc: Nitin Gupta Cc: Konrad Rzeszutek Wilk Cc: Shaohua Li Signed-off-by: Andrew Morton --- mm/page_io.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/mm/page_io.c b/mm/page_io.c index a8a3ef45fed7..736ffac48030 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -21,6 +21,7 @@ #include #include #include +#include #include static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -81,8 +82,42 @@ void end_swap_bio_read(struct bio *bio, int err) iminor(bio->bi_bdev->bd_inode), (unsigned long long)bio->bi_sector); } else { + struct swap_info_struct *sis; + SetPageUptodate(page); + sis = page_swap_info(page); + if (sis->flags & SWP_BLKDEV) { + /* + * Swap subsystem does lazy swap slot free with + * expecting the page would be swapped out again + * so we can avoid unnecessary write if the page + * isn't redirty. + * It's good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if you use SSD as swap device. + * But if you use in-memory swap device(ex, zram), + * it causes duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty* + * so the page should be swap out somewhere again if + * we want to reclaim it, again. + */ + struct gendisk *disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; + + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); + } + } } + unlock_page(page); bio_put(bio); } -- cgit v1.2.3 From d1c44820a52a1a0e31d7f02a63120aec50e444a8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:37:09 +1000 Subject: mm-remove-compressed-copy-from-zram-in-memory-fix tweak comment text Cc: Dan Magenheimer Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Cc: Nitin Gupta Cc: Seth Jennings Cc: Shaohua Li Signed-off-by: Andrew Morton --- mm/page_io.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 736ffac48030..1897abb8fdf6 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -88,20 +88,20 @@ void end_swap_bio_read(struct bio *bio, int err) sis = page_swap_info(page); if (sis->flags & SWP_BLKDEV) { /* - * Swap subsystem does lazy swap slot free with - * expecting the page would be swapped out again - * so we can avoid unnecessary write if the page - * isn't redirty. - * It's good for real swap storage because we can + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. + * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can * reduce unnecessary I/O and enhance wear-leveling - * if you use SSD as swap device. - * But if you use in-memory swap device(ex, zram), - * it causes duplicated copy between uncompressed + * if an SSD is used as the as swap device. + * But if in-memory swap device (eg zram) is used, + * this causes a duplicated copy between uncompressed * data in VM-owned memory and compressed data in - * zram-owned memory. So let's free zram-owned memory - * and make the VM-owned decompressed page *dirty* - * so the page should be swap out somewhere again if - * we want to reclaim it, again. + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it. */ struct gendisk *disk = sis->bdev->bd_disk; if (disk->fops->swap_slot_free_notify) { -- cgit v1.2.3 From b40c59d810fe6422bf3e53cda6c5b6e622f018ec Mon Sep 17 00:00:00 2001 From: Tomasz Stanislawski Date: Thu, 23 May 2013 10:37:09 +1000 Subject: mm/page_alloc.c: fix watermark check in __zone_watermark_ok() The watermark check consists of two sub-checks. The first one is: if (free_pages <= min + lowmem_reserve) return false; The check assures that there is minimal amount of RAM in the zone. If CMA is used then the free_pages is reduced by the number of free pages in CMA prior to the over-mentioned check. if (!(alloc_flags & ALLOC_CMA)) free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); This prevents the zone from being drained from pages available for non-movable allocations. The second check prevents the zone from getting too fragmented. for (o = 0; o < order; o++) { free_pages -= z->free_area[o].nr_free << o; min >>= 1; if (free_pages <= min) return false; } The field z->free_area[o].nr_free is equal to the number of free pages including free CMA pages. Therefore the CMA pages are subtracted twice. This may cause a false positive fail of __zone_watermark_ok() if the CMA area gets strongly fragmented. In such a case there are many 0-order free pages located in CMA. Those pages are subtracted twice therefore they will quickly drain free_pages during the check against fragmentation. The test fails even though there are many free non-cma pages in the zone. This patch fixes this issue by subtracting CMA pages only for a purpose of (free_pages <= min + lowmem_reserve) check. Signed-off-by: Tomasz Stanislawski Signed-off-by: Kyungmin Park Cc: Bartlomiej Zolnierkiewicz Cc: Minchan Kim Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 378a15bcd649..c3edb624fccf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1628,6 +1628,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, long min = mark; long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; + long free_cma = 0; free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) @@ -1637,9 +1638,10 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); + free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages <= min + lowmem_reserve) + + if (free_pages - free_cma <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ -- cgit v1.2.3 From c723498020a2b3cc789e89237c5fa81cf5e46eb1 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 23 May 2013 10:37:10 +1000 Subject: swap: add a simple detector for inappropriate swapin readahead This is a patch to improve swap readahead algorithm. It's from Hugh and I slightly changed it. Hugh's original changelog: swapin readahead does a blind readahead, whether or not the swapin is sequential. This may be ok on harddisk, because large reads have relatively small costs, and if the readahead pages are unneeded they can be reclaimed easily - though, what if their allocation forced reclaim of useful pages? But on SSD devices large reads are more expensive than small ones: if the readahead pages are unneeded, reading them in caused significant overhead. This patch adds very simplistic random read detection. Stealing the PageReadahead technique from Konstantin Khlebnikov's patch, avoiding the vma/anon_vma sophistications of Shaohua Li's patch, swapin_nr_pages() simply looks at readahead's current success rate, and narrows or widens its readahead window accordingly. There is little science to its heuristic: it's about as stupid as can be whilst remaining effective. The table below shows elapsed times (in centiseconds) when running a single repetitive swapping load across a 1000MB mapping in 900MB ram with 1GB swap (the harddisk tests had taken painfully too long when I used mem=500M, but SSD shows similar results for that). Vanilla is the 3.6-rc7 kernel on which I started; Shaohua denotes his Sep 3 patch in mmotm and linux-next; HughOld denotes my Oct 1 patch which Shaohua showed to be defective; HughNew this Nov 14 patch, with page_cluster as usual at default of 3 (8-page reads); HughPC4 this same patch with page_cluster 4 (16-page reads); HughPC0 with page_cluster 0 (1-page reads: no readahead). HDD for swapping to harddisk, SSD for swapping to VertexII SSD. Seq for sequential access to the mapping, cycling five times around; Rand for the same number of random touches. Anon for a MAP_PRIVATE anon mapping; Shmem for a MAP_SHARED anon mapping, equivalent to tmpfs. One weakness of Shaohua's vma/anon_vma approach was that it did not optimize Shmem: seen below. Konstantin's approach was perhaps mistuned, 50% slower on Seq: did not compete and is not shown below. HDD Vanilla Shaohua HughOld HughNew HughPC4 HughPC0 Seq Anon 73921 76210 75611 76904 78191 121542 Seq Shmem 73601 73176 73855 72947 74543 118322 Rand Anon 895392 831243 871569 845197 846496 841680 Rand Shmem 1058375 1053486 827935 764955 764376 756489 SSD Vanilla Shaohua HughOld HughNew HughPC4 HughPC0 Seq Anon 24634 24198 24673 25107 21614 70018 Seq Shmem 24959 24932 25052 25703 22030 69678 Rand Anon 43014 26146 28075 25989 26935 25901 Rand Shmem 45349 45215 28249 24268 24138 24332 These tests are, of course, two extremes of a very simple case: under heavier mixed loads I've not yet observed any consistent improvement or degradation, and wider testing would be welcome. Shaohua Li: Test shows Vanilla is slightly better in sequential workload than Hugh's patch. I observed with Hugh's patch sometimes the readahead size is shrinked too fast (from 8 to 1 immediately) in sequential workload if there is no hit. And in such case, continuing doing readahead is good actually. I don't prepare a sophisticated algorithm for the sequential workload because so far we can't guarantee sequential accessed pages are swap out sequentially. So I slightly change Hugh's heuristic - don't shrink readahead size too fast. Here is my test result (unit second, 3 runs average): Vanilla Hugh New Seq 356 370 360 Random 4525 2447 2444 Attached graph is the swapin/swapout throughput I collected with 'vmstat 2'. The first part is running a random workload (till around 1200 of the x-axis) and the second part is running a sequential workload. swapin and swapout throughput are almost identical in steady state in both workloads. These are expected behavior. while in Vanilla, swapin is much bigger than swapout especially in random workload (because wrong readahead). Original patches by: Shaohua Li and Konstantin Khlebnikov. Signed-off-by: Hugh Dickins Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Wu Fengguang Cc: Minchan Kim Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 4 +-- mm/swap_state.c | 63 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6d53675c2b54..f1a5b5937be4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -228,9 +228,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) PAGEFLAG(MappedToDisk, mappedtodisk) -/* PG_readahead is only used for file reads; PG_reclaim is only for writes */ +/* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) -PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */ +PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) #ifdef CONFIG_HIGHMEM /* diff --git a/mm/swap_state.c b/mm/swap_state.c index b3d40dcf3624..7a6d3b1a7d67 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) return ret; } +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); + void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); @@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), entry.val); - if (page) + if (page) { INC_CACHE_INFO(find_success); + if (TestClearPageReadahead(page)) + atomic_inc(&swapin_readahead_hits); + } INC_CACHE_INFO(find_total); return page; @@ -373,6 +378,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int pages, max_pages, last_ra; + static atomic_t last_readahead_pages; + + max_pages = 1 << ACCESS_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + /* + * This heuristic has been found to work well on both sequential and + * random loads, swapping to hard disk or to SSD: please don't ask + * what the "+ 2" means, it just happens to work well, that's all. + */ + pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + if (pages == 2) { + /* + * We can have no readahead hits to judge by: but must not get + * stuck here forever, so check for an adjacent offset instead + * (and don't even bother to check whether swap type is same). + */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + prev_offset = offset; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = atomic_read(&last_readahead_pages) / 2; + if (pages < last_ra) + pages = last_ra; + atomic_set(&last_readahead_pages, pages); + + return pages; +} + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory @@ -396,11 +445,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long offset = swp_offset(entry); + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask; struct blk_plug plug; + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; + /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -414,10 +468,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; + if (offset != entry_offset) + SetPageReadahead(page); page_cache_release(page); } blk_finish_plug(&plug); lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } -- cgit v1.2.3 From 3280b1e1b96c9d17d613e7ad881cf60375c0b436 Mon Sep 17 00:00:00 2001 From: Libin Date: Thu, 23 May 2013 10:37:10 +1000 Subject: mm: use vma_pages() to replace (vm_end - vm_start) >> PAGE_SHIFT (*->vm_end - *->vm_start) >> PAGE_SHIFT operation is implemented as a inline funcion vma_pages() in linux/mm.h, so using it. Signed-off-by: Libin Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- mm/mmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 6dc1882fbd72..44bb67b810c9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2913,7 +2913,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, details->first_index, details->last_index) { vba = vma->vm_pgoff; - vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; + vea = vba + vma_pages(vma) - 1; /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ zba = details->first_index; if (zba < vba) diff --git a/mm/mmap.c b/mm/mmap.c index c0f0558f7057..fcedf517e61d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, if (is_mergeable_vma(vma, file, vm_flags) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; - vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + vm_pglen = vma_pages(vma); if (vma->vm_pgoff + vm_pglen == vm_pgoff) return 1; } -- cgit v1.2.3 From 8661f9b94cf8657cf99ebbc4f6e3a499cd6a52e1 Mon Sep 17 00:00:00 2001 From: Libin Date: Thu, 23 May 2013 10:37:10 +1000 Subject: ncpfs: use vma_pages() to replace (vm_end - vm_start) >> PAGE_SHIFT (*->vm_end - *->vm_start) >> PAGE_SHIFT operation is implemented as a inline funcion vma_pages() in linux/mm.h, so using it. Signed-off-by: Libin Signed-off-by: Andrew Morton --- fs/ncpfs/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index ee24df5af1f9..3c5dd55d284c 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; /* we do not support files bigger than 4GB... We eventually supports just 4GB... */ - if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff + if (vma_pages(vma) + vma->vm_pgoff > (1U << (32 - PAGE_SHIFT))) return -EFBIG; -- cgit v1.2.3 From 4b2a7af96610c75eea25376ffc8b71c4ada2a001 Mon Sep 17 00:00:00 2001 From: Libin Date: Thu, 23 May 2013 10:37:11 +1000 Subject: uio: use vma_pages() to replace (vm_end - vm_start) >> PAGE_SHIFT (*->vm_end - *->vm_start) >> PAGE_SHIFT operation is implemented as a inline funcion vma_pages() in linux/mm.h, so using it. Signed-off-by: Libin Signed-off-by: Andrew Morton --- drivers/uio/uio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index b645c47501b4..3b96f18593b3 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -677,7 +677,7 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma) if (mi < 0) return -EINVAL; - requested_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + requested_pages = vma_pages(vma); actual_pages = ((idev->info->mem[mi].addr & ~PAGE_MASK) + idev->info->mem[mi].size + PAGE_SIZE -1) >> PAGE_SHIFT; if (requested_pages > actual_pages) -- cgit v1.2.3 From 19e34d9bbe867f93bed340482094fb4b8f9f2785 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:11 +1000 Subject: mm/page_alloc: factor out setting of pcp->high and pcp->batch "Problems" with the current code: 1: there is a lack of synchronization in setting ->high and ->batch in percpu_pagelist_fraction_sysctl_handler() 2: stop_machine() in zone_pcp_update() is unnecissary. 3: zone_pcp_update() does not consider the case where percpu_pagelist_fraction is non-zero To fix: 1: add memory barriers, a safe ->batch value, an update side mutex when updating ->high and ->batch, and use ACCESS_ONCE() for ->batch users that expect a stable value. 2: avoid draining pages in zone_pcp_update(), rely upon the memory barriers added to fix #1 3: factor out quite a few functions, and then call the appropriate one. Note that it results in a change to the behavior of zone_pcp_update(), which is used by memory_hotplug. I'm rather certain that I've diserned (and preserved) the essential behavior (changing ->high and ->batch), and only eliminated unneeded actions (draining the per cpu pages), but this may not be the case. Further note that the draining of pages that previously took place in zone_pcp_update() occured after repeated draining when attempting to offline a page, and after the offline has "succeeded". It appears that the draining was added to zone_pcp_update() to avoid refactoring setup_pageset() into 2 funtions. This patch: Creates pageset_set_batch() for use in setup_pageset(). pageset_set_batch() imitates the functionality of setup_pagelist_highmark(), but uses the boot time (percpu_pagelist_fraction == 0) calculations for determining ->high based on ->batch. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c3edb624fccf..d4bcc20ab6f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4032,6 +4032,14 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } +/* a companion to setup_pagelist_highmark() */ +static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) +{ + struct per_cpu_pages *pcp = &p->pcp; + pcp->high = 6 * batch; + pcp->batch = max(1UL, 1 * batch); +} + static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; @@ -4041,8 +4049,7 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); + pageset_set_batch(p, batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } @@ -4051,7 +4058,6 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ - static void setup_pagelist_highmark(struct per_cpu_pageset *p, unsigned long high) { -- cgit v1.2.3 From b1c64787df32a24755506972116bb228b77a04cb Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:11 +1000 Subject: mm/page_alloc: prevent concurrent updaters of pcp ->batch and ->high Because we are going to rely upon a careful transision between old and new ->high and ->batch values using memory barriers and will remove stop_machine(), we need to prevent multiple updaters from interweaving their memory writes. Add a simple mutex to protect both update loops. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d4bcc20ab6f0..8d4335779633 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -65,6 +65,9 @@ #include #include "internal.h" +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -5557,6 +5560,8 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret < 0)) return ret; + + mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { for_each_possible_cpu(cpu) { unsigned long high; @@ -5565,6 +5570,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, per_cpu_ptr(zone->pageset, cpu), high); } } + mutex_unlock(&pcp_batch_high_lock); return 0; } @@ -6078,7 +6084,9 @@ static int __meminit __zone_pcp_update(void *data) void __meminit zone_pcp_update(struct zone *zone) { + mutex_lock(&pcp_batch_high_lock); stop_machine(__zone_pcp_update, zone, NULL); + mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From e22996929e734caca649850f705ebc0f210eab93 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:12 +1000 Subject: mm/page_alloc: insert memory barriers to allow async update of pcp batch and high Introduce pageset_update() to perform a safe transision from one set of pcp->{batch,high} to a new set using memory barriers. This ensures that batch is always set to a safe value (1) prior to updating high, and ensure that high is fully updated before setting the real value of batch. It avoids ->batch ever rising above ->high. Suggested by Gilad Ben-Yossef in these threads: https://lkml.org/lkml/2013/4/9/23 https://lkml.org/lkml/2013/4/10/49 Also reproduces his proposed comment. Signed-off-by: Cody P Schafer Reviewed-by: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d4335779633..eaaef2a09424 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4035,12 +4035,37 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } +/* + * pcp->high and pcp->batch values are related and dependent on one another: + * ->batch must never be higher then ->high. + * The following function updates them in a safe manner without read side + * locking. + * + * Any new users of pcp->batch and pcp->high should ensure they can cope with + * those fields changing asynchronously (acording the the above rule). + * + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function + * outside of boot time (or some other assurance that no concurrent updaters + * exist). + */ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, + unsigned long batch) +{ + /* start with a fail safe value for batch */ + pcp->batch = 1; + smp_wmb(); + + /* Update high, then batch, in order */ + pcp->high = high; + smp_wmb(); + + pcp->batch = batch; +} + /* a companion to setup_pagelist_highmark() */ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) { - struct per_cpu_pages *pcp = &p->pcp; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); + pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); } static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) @@ -4064,13 +4089,11 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) static void setup_pagelist_highmark(struct per_cpu_pageset *p, unsigned long high) { - struct per_cpu_pages *pcp; + unsigned long batch = max(1UL, high / 4); + if ((high / 4) > (PAGE_SHIFT * 8)) + batch = PAGE_SHIFT * 8; - pcp = &p->pcp; - pcp->high = high; - pcp->batch = max(1UL, high/4); - if ((high/4) > (PAGE_SHIFT * 8)) - pcp->batch = PAGE_SHIFT * 8; + pageset_update(&p->pcp, high, batch); } static void __meminit setup_zone_pageset(struct zone *zone) -- cgit v1.2.3 From ca8d5a48de232f94b79b3b1513b4f7e61437c177 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:12 +1000 Subject: mm/page_alloc: protect pcp->batch accesses with ACCESS_ONCE pcp->batch could change at any point, avoid relying on it being a stable value. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaaef2a09424..97b8f861e63d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1182,10 +1182,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; int to_drain; + unsigned long batch; local_irq_save(flags); - if (pcp->count >= pcp->batch) - to_drain = pcp->batch; + batch = ACCESS_ONCE(pcp->batch); + if (pcp->count >= batch) + to_drain = batch; else to_drain = pcp->count; if (to_drain > 0) { @@ -1353,8 +1355,9 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); - pcp->count -= pcp->batch; + unsigned long batch = ACCESS_ONCE(pcp->batch); + free_pcppages_bulk(zone, batch, pcp); + pcp->count -= batch; } out: -- cgit v1.2.3 From 6a73cbc3673d3bcc7890907588b299fa3c80b58d Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:12 +1000 Subject: mm/page_alloc: convert zone_pcp_update() to rely on memory barriers instead of stop_machine() zone_pcp_update()'s goal is to adjust the ->high and ->mark members of a percpu pageset based on a zone's ->managed_pages. We don't need to drain the entire percpu pageset just to modify these fields. This lets us avoid calling setup_pageset() (and the draining required to call it) and instead allows simply setting the fields' values (with some attention paid to memory barriers to prevent the relationship between ->batch and ->high from being thrown off). This does change the behavior of zone_pcp_update() as the percpu pagesets will not be drained when zone_pcp_update() is called (they will end up being shrunk, not completely drained, later when a 0-order page is freed in free_hot_cold_page()). Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97b8f861e63d..8125263be60f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6085,33 +6085,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) #endif #ifdef CONFIG_MEMORY_HOTPLUG -static int __meminit __zone_pcp_update(void *data) -{ - struct zone *zone = data; - int cpu; - unsigned long batch = zone_batchsize(zone), flags; - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; - - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; - - local_irq_save(flags); - if (pcp->count > 0) - free_pcppages_bulk(zone, pcp->count, pcp); - drain_zonestat(zone, pset); - setup_pageset(pset, batch); - local_irq_restore(flags); - } - return 0; -} - +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalulated. + */ void __meminit zone_pcp_update(struct zone *zone) { + unsigned cpu; + unsigned long batch; mutex_lock(&pcp_batch_high_lock); - stop_machine(__zone_pcp_update, zone, NULL); + batch = zone_batchsize(zone); + for_each_possible_cpu(cpu) + pageset_set_batch(per_cpu_ptr(zone->pageset, cpu), batch); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From 0605ac73399d54a6f3a0469194456eab29cbe7ea Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:13 +1000 Subject: mm/page_alloc: when handling percpu_pagelist_fraction, don't unneedly recalulate high Simply moves calculation of the new 'high' value outside the for_each_possible_cpu() loop, as it does not depend on the cpu. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8125263be60f..386de0f11bea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5575,7 +5575,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist * can have before it gets flushed back to buddy allocator. */ - int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5589,12 +5588,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - for_each_possible_cpu(cpu) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned long high; + high = zone->managed_pages / percpu_pagelist_fraction; + for_each_possible_cpu(cpu) setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); - } + per_cpu_ptr(zone->pageset, cpu), high); } mutex_unlock(&pcp_batch_high_lock); return 0; -- cgit v1.2.3 From 82f4005f461066a1d3fb2510298f793ecbbe0800 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:13 +1000 Subject: mm/page_alloc: factor setup_pageset() into pageset_init() and pageset_set_batch() Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 386de0f11bea..a235149d9406 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4071,7 +4071,7 @@ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +static void pageset_init(struct per_cpu_pageset *p) { struct per_cpu_pages *pcp; int migratetype; @@ -4080,11 +4080,16 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pageset_set_batch(p, batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_init(p); + pageset_set_batch(p, batch); +} + /* * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. -- cgit v1.2.3 From 9bb0f8367c71d1df8b160d9201c1b7b18db76fba Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:13 +1000 Subject: mm/page_alloc: relocate comment to be directly above code it refers to. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a235149d9406..2793ce50f316 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3711,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { - /* we have to stop all cpus to guarantee there is no user - of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG if (zone) setup_zone_pageset(zone); #endif + /* we have to stop all cpus to guarantee there is no user + of zonelist */ stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } -- cgit v1.2.3 From 573f01a456f4fb62cf68bac063a29ba8bfddc155 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:14 +1000 Subject: mm/page_alloc: factor zone_pageset_init() out of setup_zone_pageset() Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2793ce50f316..ee6fe7faabad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4104,22 +4104,25 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + + pageset_init(pcp); + if (percpu_pagelist_fraction) + setup_pagelist_highmark(pcp, + (zone->managed_pages / + percpu_pagelist_fraction)); + else + pageset_set_batch(pcp, zone_batchsize(zone)); +} + static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; - zone->pageset = alloc_percpu(struct per_cpu_pageset); - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - - setup_pageset(pcp, zone_batchsize(zone)); - - if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, - (zone->managed_pages / - percpu_pagelist_fraction)); - } + for_each_possible_cpu(cpu) + zone_pageset_init(zone, cpu); } /* -- cgit v1.2.3 From a527e01fb27eead6839bae1bc9bb39d74fdab49f Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:14 +1000 Subject: mm/page_alloc: in zone_pcp_update(), uze zone_pageset_init() Previously, zone_pcp_update() called pageset_set_batch() directly, essentially assuming that percpu_pagelist_fraction == 0. Correct this by calling zone_pageset_init(), which chooses the appropriate ->batch and ->high calculations. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ee6fe7faabad..c7344d17660b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6098,11 +6098,9 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) void __meminit zone_pcp_update(struct zone *zone) { unsigned cpu; - unsigned long batch; mutex_lock(&pcp_batch_high_lock); - batch = zone_batchsize(zone); for_each_possible_cpu(cpu) - pageset_set_batch(per_cpu_ptr(zone->pageset, cpu), batch); + zone_pageset_init(zone, cpu); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From b7ea1e6d88e559b692f52a1e3fe338aeb27f3775 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:14 +1000 Subject: mm/page_alloc: rename setup_pagelist_highmark() to match naming of pageset_set_batch() Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c7344d17660b..03a3f943d98e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4065,7 +4065,7 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, pcp->batch = batch; } -/* a companion to setup_pagelist_highmark() */ +/* a companion to pageset_set_high() */ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) { pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); @@ -4091,10 +4091,10 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) } /* - * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * pageset_set_high() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ -static void setup_pagelist_highmark(struct per_cpu_pageset *p, +static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high) { unsigned long batch = max(1UL, high / 4); @@ -4110,7 +4110,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_init(pcp); if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, + pageset_set_high(pcp, (zone->managed_pages / percpu_pagelist_fraction)); else @@ -5599,8 +5599,8 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, unsigned long high; high = zone->managed_pages / percpu_pagelist_fraction; for_each_possible_cpu(cpu) - setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); + pageset_set_high(per_cpu_ptr(zone->pageset, cpu), + high); } mutex_unlock(&pcp_batch_high_lock); return 0; -- cgit v1.2.3 From fd518a26e031b68d30ed0d91ede8fcf6ed2f50de Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 23 May 2013 10:37:15 +1000 Subject: mm/THP: add pmd args to pgtable deposit and withdraw APIs This will be later used by powerpc THP support. In powerpc we want to use pgtable for storing the hash index values. So instead of adding them to mm_context list, we would like to store them in the second half of pmd Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrea Arcangeli Reviewed-by: David Gibson Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgtable.h | 5 +++-- arch/s390/mm/pgtable.c | 5 +++-- arch/sparc/include/asm/pgtable_64.h | 5 +++-- arch/sparc/mm/tlb.c | 5 +++-- include/asm-generic/pgtable.h | 5 +++-- mm/huge_memory.c | 18 +++++++++--------- mm/pgtable-generic.c | 5 +++-- 7 files changed, 27 insertions(+), 21 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c192e74250bf..e53ce161df74 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1355,10 +1355,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, , pmd_t *pmdp, + pgtable_t pgtable); #define __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); static inline int pmd_trans_splitting(pmd_t pmd) { diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 17bf4d3d303a..a8154a1a2c94 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -1165,7 +1165,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, } } -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { struct list_head *lh = (struct list_head *) pgtable; @@ -1179,7 +1180,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) mm->pmd_huge_pte = pgtable; } -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { struct list_head *lh; pgtable_t pgtable; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 7619f2f792af..d22b92d67844 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -853,10 +853,11 @@ extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); #define __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); #define __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif /* Encode and de-code a swap entry */ diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 83d89bcb44af..f828dd33551c 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -188,7 +188,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, } } -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { struct list_head *lh = (struct list_head *) pgtable; @@ -202,7 +203,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) mm->pmd_huge_pte = pgtable; } -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { struct list_head *lh; pgtable_t pgtable; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 347fb7bfcbe2..185cf318080a 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -173,11 +173,12 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4347eb3a00ff..76dc3bb52269 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -730,7 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma); page_add_new_anon_rmap(page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - pgtable_trans_huge_deposit(mm, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -772,7 +772,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); set_pmd_at(mm, haddr, pmd, entry); - pgtable_trans_huge_deposit(mm, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); mm->nr_ptes++; return true; } @@ -917,7 +917,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - pgtable_trans_huge_deposit(dst_mm, pgtable); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); dst_mm->nr_ptes++; ret = 0; @@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1360,7 +1360,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; pgtable_t pgtable; pmd_t orig_pmd; - pgtable = pgtable_trans_huge_withdraw(tlb->mm); + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (is_huge_zero_pmd(orig_pmd)) { @@ -1693,7 +1693,7 @@ static int __split_huge_page_map(struct page *page, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); if (pmd) { - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); haddr = address; @@ -2363,7 +2363,7 @@ static void collapse_huge_page(struct mm_struct *mm, page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - pgtable_trans_huge_deposit(mm, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); spin_unlock(&mm->page_table_lock); *hpage = NULL; @@ -2669,7 +2669,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0c8323fe6c8f..e1a6e4fab016 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT #ifdef CONFIG_TRANSPARENT_HUGEPAGE -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { assert_spin_locked(&mm->page_table_lock); @@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* no "address" argument so destroys page coloring of some arch */ -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; -- cgit v1.2.3 From 3193193b2accab9d82e972d28ee2956e6a223088 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 23 May 2013 10:37:15 +1000 Subject: mm/THP: withdraw the pgtable after pmdp related operations For architectures like ppc64 we look at deposited pgtable when calling pmdp_get_and_clear. So do the pgtable_trans_huge_withdraw after finishing pmdp related operations. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrea Arcangeli Cc: Andrea Arcangeli Cc: David Gibson Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton --- mm/huge_memory.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76dc3bb52269..c4369924f4c4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1360,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; pgtable_t pgtable; pmd_t orig_pmd; - pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. + */ orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); -- cgit v1.2.3 From e6b75a115f4136ace7d8c9d3e0c9750260181c06 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 23 May 2013 10:37:15 +1000 Subject: mm/THP: don't use HPAGE_SHIFT in transparent hugepage code For architectures like powerpc that support multiple explicit hugepage sizes, HPAGE_SHIFT indicate the default explicit hugepage shift. For THP to work the hugepage size should be same as PMD_SIZE. So use PMD_SHIFT directly. So move the define outside CONFIG_TRANSPARENT_HUGEPAGE #ifdef because we want to use these defines in generic code with if (pmd_trans_huge()) conditional. Signed-off-by: Aneesh Kumar K.V Cc: Andrea Arcangeli Cc: David Gibson Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 528454c2caa9..cc276d2c3a40 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -58,12 +58,11 @@ extern pmd_t *page_check_address_pmd(struct page *page, #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1< Date: Thu, 23 May 2013 10:37:16 +1000 Subject: mm/THP: deposit the transpare huge pgtable before set_pmd Architectures like powerpc use the deposited pgtable to store hash index values. We need to make the deposted pgtable is visible to other cpus before we are ready to take a hash fault. Signed-off-by: Aneesh Kumar K.V Cc: Andrea Arcangeli Cc: David Gibson Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton --- mm/huge_memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c4369924f4c4..243e710c6039 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pmd_t entry; entry = mk_huge_pmd(page, vma); page_add_new_anon_rmap(page, vma, haddr); - set_pmd_at(mm, haddr, pmd, entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); - set_pmd_at(mm, haddr, pmd, entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, haddr, pmd, entry); mm->nr_ptes++; return true; } @@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); - set_pmd_at(dst_mm, addr, dst_pmd, pmd); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); dst_mm->nr_ptes++; ret = 0; @@ -2367,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - pgtable_trans_huge_deposit(mm, pmd, pgtable); spin_unlock(&mm->page_table_lock); *hpage = NULL; -- cgit v1.2.3 From d93e66a0cd91d39aacabd561d3055e1fda500781 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 May 2013 10:37:16 +1000 Subject: mm: vmscan: limit the number of pages kswapd reclaims at each priority This series does not fix all the current known problems with reclaim but it addresses one important swapping bug when there is background IO. Changelog since V3 o Drop the slab shrink changes in light of Glaubers series and discussions highlighted that there were a number of potential problems with the patch. (mel) o Rebased to 3.10-rc1 Changelog since V2 o Preserve ratio properly for proportional scanning (kamezawa) Changelog since V1 o Rename ZONE_DIRTY to ZONE_TAIL_LRU_DIRTY (andi) o Reformat comment in shrink_page_list (andi) o Clarify some comments (dhillf) o Rework how the proportional scanning is preserved o Add PageReclaim check before kswapd starts writeback o Reset sc.nr_reclaimed on every full zone scan Kswapd and page reclaim behaviour has been screwy in one way or the other for a long time. Very broadly speaking it worked in the far past because machines were limited in memory so it did not have that many pages to scan and it stalled congestion_wait() frequently to prevent it going completely nuts. In recent times it has behaved very unsatisfactorily with some of the problems compounded by the removal of stall logic and the introduction of transparent hugepage support with high-order reclaims. There are many variations of bugs that are rooted in this area. One example is reports of a large copy operations or backup causing the machine to grind to a halt or applications pushed to swap. Sometimes in low memory situations a large percentage of memory suddenly gets reclaimed. In other cases an application starts and kswapd hits 100% CPU usage for prolonged periods of time and so on. There is now talk of introducing features like an extra free kbytes tunable to work around aspects of the problem instead of trying to deal with it. It's compounded by the problem that it can be very workload and machine specific. This series aims at addressing some of the worst of these problems without attempting to fundmentally alter how page reclaim works. Patches 1-2 limits the number of pages kswapd reclaims while still obeying the anon/file proportion of the LRUs it should be scanning. Patches 3-4 control how and when kswapd raises its scanning priority and deletes the scanning restart logic which is tricky to follow. Patch 5 notes that it is too easy for kswapd to reach priority 0 when scanning and then reclaim the world. Down with that sort of thing. Patch 6 notes that kswapd starts writeback based on scanning priority which is not necessarily related to dirty pages. It will have kswapd writeback pages if a number of unqueued dirty pages have been recently encountered at the tail of the LRU. Patch 7 notes that sometimes kswapd should stall waiting on IO to complete to reduce LRU churn and the likelihood that it'll reclaim young clean pages or push applications to swap. It will cause kswapd to block on IO if it detects that pages being reclaimed under writeback are recycling through the LRU before the IO completes. Patchies 8-9 are cosmetic but balance_pgdat() is easier to follow after they are applied. This was tested using memcached+memcachetest while some background IO was in progress as implemented by the parallel IO tests implement in MM Tests. memcachetest benchmarks how many operations/second memcached can service and it is run multiple times. It starts with no background IO and then re-runs the test with larger amounts of IO in the background to roughly simulate a large copy in progress. The expectation is that the IO should have little or no impact on memcachetest which is running entirely in memory. 3.10.0-rc1 3.10.0-rc1 vanilla lessdisrupt-v4 Ops memcachetest-0M 22155.00 ( 0.00%) 22180.00 ( 0.11%) Ops memcachetest-715M 22720.00 ( 0.00%) 22355.00 ( -1.61%) Ops memcachetest-2385M 3939.00 ( 0.00%) 23450.00 (495.33%) Ops memcachetest-4055M 3628.00 ( 0.00%) 24341.00 (570.92%) Ops io-duration-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops io-duration-715M 12.00 ( 0.00%) 7.00 ( 41.67%) Ops io-duration-2385M 118.00 ( 0.00%) 21.00 ( 82.20%) Ops io-duration-4055M 162.00 ( 0.00%) 36.00 ( 77.78%) Ops swaptotal-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-715M 140134.00 ( 0.00%) 18.00 ( 99.99%) Ops swaptotal-2385M 392438.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-4055M 449037.00 ( 0.00%) 27864.00 ( 93.79%) Ops swapin-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-715M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-2385M 148031.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-4055M 135109.00 ( 0.00%) 0.00 ( 0.00%) Ops minorfaults-0M 1529984.00 ( 0.00%) 1530235.00 ( -0.02%) Ops minorfaults-715M 1794168.00 ( 0.00%) 1613750.00 ( 10.06%) Ops minorfaults-2385M 1739813.00 ( 0.00%) 1609396.00 ( 7.50%) Ops minorfaults-4055M 1754460.00 ( 0.00%) 1614810.00 ( 7.96%) Ops majorfaults-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops majorfaults-715M 185.00 ( 0.00%) 180.00 ( 2.70%) Ops majorfaults-2385M 24472.00 ( 0.00%) 101.00 ( 99.59%) Ops majorfaults-4055M 22302.00 ( 0.00%) 229.00 ( 98.97%) Note how the vanilla kernels performance collapses when there is enough IO taking place in the background. This drop in performance is part of what users complain of when they start backups. Note how the swapin and major fault figures indicate that processes were being pushed to swap prematurely. With the series applied, there is no noticable performance drop and while there is still some swap activity, it's tiny. 20 iterations of this test were run in total and averaged. Every 5 iterations, additional IO was generated in the background using dd to measure how the workload was impacted. The 0M, 715M, 2385M and 4055M subblock refer to the amount of IO going on in the background at each iteration. So memcachetest-2385M is reporting how many transactions/second memcachetest recorded on average over 5 iterations while there was 2385M of IO going on in the ground. There are six blocks of information reported here memcachetest is the transactions/second reported by memcachetest. In the vanilla kernel note that performance drops from around 22K/sec to just under 4K/second when there is 2385M of IO going on in the background. This is one type of performance collapse users complain about if a large cp or backup starts in the background io-duration refers to how long it takes for the background IO to complete. It's showing that with the patched kernel that the IO completes faster while not interfering with the memcache workload swaptotal is the total amount of swap traffic. With the patched kernel, the total amount of swapping is much reduced although it is still not zero. swapin in this case is an indication as to whether we are swap trashing. The closer the swapin/swapout ratio is to 1, the worse the trashing is. Note with the patched kernel that there is no swapin activity indicating that all the pages swapped were really inactive unused pages. minorfaults are just minor faults. An increased number of minor faults can indicate that page reclaim is unmapping the pages but not swapping them out before they are faulted back in. With the patched kernel, there is only a small change in minor faults majorfaults are just major faults in the target workload and a high number can indicate that a workload is being prematurely swapped. With the patched kernel, major faults are much reduced. As there are no swapin's recorded so it's not being swapped. The likely explanation is that that libraries or configuration files used by the workload during startup get paged out by the background IO. Overall with the series applied, there is no noticable performance drop due to background IO and while there is still some swap activity, it's tiny and the lack of swapins imply that the swapped pages were inactive and unused. 3.10.0-rc1 3.10.0-rc1 vanilla lessdisrupt-v4 Page Ins 1234608 101892 Page Outs 12446272 11810468 Swap Ins 283406 0 Swap Outs 698469 27882 Direct pages scanned 0 136480 Kswapd pages scanned 6266537 5369364 Kswapd pages reclaimed 1088989 930832 Direct pages reclaimed 0 120901 Kswapd efficiency 17% 17% Kswapd velocity 5398.371 4635.115 Direct efficiency 100% 88% Direct velocity 0.000 117.817 Percentage direct scans 0% 2% Page writes by reclaim 1655843 4009929 Page writes file 957374 3982047 Page writes anon 698469 27882 Page reclaim immediate 5245 1745 Page rescued immediate 0 0 Slabs scanned 33664 25216 Direct inode steals 0 0 Kswapd inode steals 19409 778 Kswapd skipped wait 0 0 THP fault alloc 35 30 THP collapse alloc 472 401 THP splits 27 22 THP fault fallback 0 0 THP collapse fail 0 1 Compaction stalls 0 4 Compaction success 0 0 Compaction failures 0 4 Page migrate success 0 0 Page migrate failure 0 0 Compaction pages isolated 0 0 Compaction migrate scanned 0 0 Compaction free scanned 0 0 Compaction cost 0 0 NUMA PTE updates 0 0 NUMA hint faults 0 0 NUMA hint local faults 0 0 NUMA pages migrated 0 0 AutoNUMA cost 0 0 Unfortunately, note that there is a small amount of direct reclaim due to kswapd no longer reclaiming the world. ftrace indicates that the direct reclaim stalls are mostly harmless with the vast bulk of the stalls incurred by dd 23 tclsh-3367 38 memcachetest-13733 49 memcachetest-12443 57 tee-3368 1541 dd-13826 1981 dd-12539 A consequence of the direct reclaim for dd is that the processes for the IO workload may show a higher system CPU usage. There is also a risk that kswapd not reclaiming the world may mean that it stays awake balancing zones, does not stall on the appropriate events and continually scans pages it cannot reclaim consuming CPU. This will be visible as continued high CPU usage but in my own tests I only saw a single spike lasting less than a second and I did not observe any problems related to reclaim while running the series on my desktop. This patch: The number of pages kswapd can reclaim is bound by the number of pages it scans which is related to the size of the zone and the scanning priority. In many cases the priority remains low because it's reset every SWAP_CLUSTER_MAX reclaimed pages but in the event kswapd scans a large number of pages it cannot reclaim, it will raise the priority and potentially discard a large percentage of the zone as sc->nr_to_reclaim is ULONG_MAX. The user-visible effect is a reclaim "spike" where a large percentage of memory is suddenly freed. It would be bad enough if this was just unused memory but because of how anon/file pages are balanced it is possible that applications get pushed to swap unnecessarily. This patch limits the number of pages kswapd will reclaim to the high watermark. Reclaim will still overshoot due to it not being a hard limit as shrink_lruvec() will ignore the sc.nr_to_reclaim at DEF_PRIORITY but it prevents kswapd reclaiming the world at higher priorities. The number of pages it reclaims is not adjusted for high-order allocations as kswapd will reclaim excessively if it is to balance zones for high-order allocations. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Signed-off-by: Andrew Morton --- mm/vmscan.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index fa6a85378ee4..cdbc0699ea21 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2600,6 +2600,32 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, return pgdat_balanced(pgdat, order, classzone_idx); } +/* + * kswapd shrinks the zone by the number of pages required to reach + * the high watermark. + */ +static void kswapd_shrink_zone(struct zone *zone, + struct scan_control *sc, + unsigned long lru_pages) +{ + unsigned long nr_slab; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + + /* Reclaim above the high watermark. */ + sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + shrink_zone(zone, sc); + + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + + if (nr_slab == 0 && !zone_reclaimable(zone)) + zone->all_unreclaimable = 1; +} + /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). @@ -2627,24 +2653,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, bool pgdat_is_balanced = false; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, .may_swap = 1, - /* - * kswapd doesn't want to be bailed out while reclaim. because - * we want to put equal scanning pressure on each zone. - */ - .nr_to_reclaim = ULONG_MAX, .order = order, .target_mem_cgroup = NULL, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; loop_again: sc.priority = DEF_PRIORITY; sc.nr_reclaimed = 0; @@ -2716,7 +2733,7 @@ loop_again: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int nr_slab, testorder; + int testorder; unsigned long balance_gap; if (!populated_zone(zone)) @@ -2764,16 +2781,8 @@ loop_again: if ((buffer_heads_over_limit && is_highmem_idx(i)) || !zone_balanced(zone, testorder, - balance_gap, end_zone)) { - shrink_zone(zone, &sc); - - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - } + balance_gap, end_zone)) + kswapd_shrink_zone(zone, &sc, lru_pages); /* * If we're getting trouble reclaiming, start doing -- cgit v1.2.3 From 56410bd8e07b1f4a37d2b83af2865b0cfb211a37 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 May 2013 10:37:16 +1000 Subject: mm: vmscan: obey proportional scanning requirements for kswapd Simplistically, the anon and file LRU lists are scanned proportionally depending on the value of vm.swappiness although there are other factors taken into account by get_scan_count(). The patch "mm: vmscan: Limit the number of pages kswapd reclaims" limits the number of pages kswapd reclaims but it breaks this proportional scanning and may evenly shrink anon/file LRUs regardless of vm.swappiness. This patch preserves the proportional scanning and reclaim. It does mean that kswapd will reclaim more than requested but the number of pages will be related to the high watermark. [mhocko@suse.cz: Correct proportional reclaim for memcg and simplify] [kamezawa.hiroyu@jp.fujitsu.com: Recalculate scan based on target] [hannes@cmpxchg.org: Account for already scanned pages properly] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Signed-off-by: Andrew Morton --- mm/vmscan.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cdbc0699ea21..26ad67f1962c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1822,17 +1822,25 @@ out: static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; + bool scan_adjusted = false; get_scan_count(lruvec, sc, nr); + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + for_each_evictable_lru(lru) { if (nr[lru]) { nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); @@ -1842,17 +1850,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) lruvec, sc); } } + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + /* - * On large memory systems, scan >> priority can become - * really large. This is fine for the starting priority; - * we want to put equal scanning pressure on each zone. - * However, if the VM has a harder time of freeing pages, - * with multiple processes reclaiming pages, the total - * freeing target can get unreasonably large. + * For global direct reclaim, reclaim only the number of pages + * requested. Less care is taken to scan proportionally as it + * is more important to minimise direct reclaim stall latency + * than it is to properly age the LRU lists. */ - if (nr_reclaimed >= nr_to_reclaim && - sc->priority < DEF_PRIORITY) + if (global_reclaim(sc) && !current_is_kswapd()) break; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs shrink + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; -- cgit v1.2.3 From 7ec319447a477bd7d290ab9dd080b35e5b65b534 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 May 2013 10:37:17 +1000 Subject: mm: vmscan: flatten kswapd priority loop kswapd stops raising the scanning priority when at least SWAP_CLUSTER_MAX pages have been reclaimed or the pgdat is considered balanced. It then rechecks if it needs to restart at DEF_PRIORITY and whether high-order reclaim needs to be reset. This is not wrong per-se but it is confusing to follow and forcing kswapd to stay at DEF_PRIORITY may require several restarts before it has scanned enough pages to meet the high watermark even at 100% efficiency. This patch irons out the logic a bit by controlling when priority is raised and removing the "goto loop_again". This patch has kswapd raise the scanning priority until it is scanning enough pages that it could meet the high watermark in one shrink of the LRU lists if it is able to reclaim at 100% efficiency. It will not raise the scanning prioirty higher unless it is failing to reclaim any pages. To avoid infinite looping for high-order allocation requests kswapd will not reclaim for high-order allocations when it has reclaimed at least twice the number of pages as the allocation request. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Signed-off-by: Andrew Morton --- mm/vmscan.c | 86 +++++++++++++++++++++++++++++-------------------------------- 1 file changed, 41 insertions(+), 45 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 26ad67f1962c..1c10ee512215 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2654,8 +2654,12 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, /* * kswapd shrinks the zone by the number of pages required to reach * the high watermark. + * + * Returns true if kswapd scanned at least the requested number of pages to + * reclaim. This is used to determine if the scanning priority needs to be + * raised. */ -static void kswapd_shrink_zone(struct zone *zone, +static bool kswapd_shrink_zone(struct zone *zone, struct scan_control *sc, unsigned long lru_pages) { @@ -2675,6 +2679,8 @@ static void kswapd_shrink_zone(struct zone *zone, if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; + + return sc->nr_scanned >= sc->nr_to_reclaim; } /* @@ -2701,26 +2707,26 @@ static void kswapd_shrink_zone(struct zone *zone, static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - bool pgdat_is_balanced = false; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, .may_unmap = 1, .may_swap = 1, + .may_writepage = !laptop_mode, .order = order, .target_mem_cgroup = NULL, }; -loop_again: - sc.priority = DEF_PRIORITY; - sc.nr_reclaimed = 0; - sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); do { unsigned long lru_pages = 0; + bool raise_priority = true; + + sc.nr_reclaimed = 0; /* * Scan in the highmem->dma direction for the highest @@ -2762,10 +2768,8 @@ loop_again: } } - if (i < 0) { - pgdat_is_balanced = true; + if (i < 0) goto out; - } for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; @@ -2832,8 +2836,16 @@ loop_again: if ((buffer_heads_over_limit && is_highmem_idx(i)) || !zone_balanced(zone, testorder, - balance_gap, end_zone)) - kswapd_shrink_zone(zone, &sc, lru_pages); + balance_gap, end_zone)) { + /* + * There should be no need to raise the + * scanning priority if enough pages are + * already being scanned that high + * watermark would be met at 100% efficiency. + */ + if (kswapd_shrink_zone(zone, &sc, lru_pages)) + raise_priority = false; + } /* * If we're getting trouble reclaiming, start doing @@ -2868,46 +2880,29 @@ loop_again: pfmemalloc_watermark_ok(pgdat)) wake_up(&pgdat->pfmemalloc_wait); - if (pgdat_balanced(pgdat, order, *classzone_idx)) { - pgdat_is_balanced = true; - break; /* kswapd: all done */ - } - /* - * We do this so kswapd doesn't build up large priorities for - * example when it is freeing in parallel with allocators. It - * matches the direct reclaim path behaviour in terms of impact - * on zone->*_priority. + * Fragmentation may mean that the system cannot be rebalanced + * for high-order allocations in all zones. If twice the + * allocation size has been reclaimed and the zones are still + * not balanced then recheck the watermarks at order-0 to + * prevent kswapd reclaiming excessively. Assume that a + * process requested a high-order can direct reclaim/compact. */ - if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) - break; - } while (--sc.priority >= 0); - -out: - if (!pgdat_is_balanced) { - cond_resched(); + if (order && sc.nr_reclaimed >= 2UL << order) + order = sc.order = 0; - try_to_freeze(); + /* Check if kswapd should be suspending */ + if (try_to_freeze() || kthread_should_stop()) + break; /* - * Fragmentation may mean that the system cannot be - * rebalanced for high-order allocations in all zones. - * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, - * it means the zones have been fully scanned and are still - * not balanced. For high-order allocations, there is - * little point trying all over again as kswapd may - * infinite loop. - * - * Instead, recheck all watermarks at order-0 as they - * are the most important. If watermarks are ok, kswapd will go - * back to sleep. High-order users can still perform direct - * reclaim if they wish. + * Raise priority if scanning rate is too low or there was no + * progress in reclaiming pages */ - if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) - order = sc.order = 0; - - goto loop_again; - } + if (raise_priority || !sc.nr_reclaimed) + sc.priority--; + } while (sc.priority >= 0 && + !pgdat_balanced(pgdat, order, *classzone_idx)); /* * If kswapd was reclaiming at a higher order, it has the option of @@ -2936,6 +2931,7 @@ out: compact_pgdat(pgdat, order); } +out: /* * Return the order we were reclaiming at so prepare_kswapd_sleep() * makes a decision on the order we were last reclaiming at. However, -- cgit v1.2.3 From 3defe36b60baeab8e5756a005d2696067b9462c9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 May 2013 10:37:17 +1000 Subject: mm: vmscan: decide whether to compact the pgdat based on reclaim progress In the past, kswapd makes a decision on whether to compact memory after the pgdat was considered balanced. This more or less worked but it is late to make such a decision and does not fit well now that kswapd makes a decision whether to exit the zone scanning loop depending on reclaim progress. This patch will compact a pgdat if at least the requested number of pages were reclaimed from unbalanced zones for a given priority. If any zone is currently balanced, kswapd will not call compaction as it is expected the necessary pages are already available. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Signed-off-by: Andrew Morton --- mm/vmscan.c | 59 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1c10ee512215..cd0980393bac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2661,7 +2661,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, */ static bool kswapd_shrink_zone(struct zone *zone, struct scan_control *sc, - unsigned long lru_pages) + unsigned long lru_pages, + unsigned long *nr_attempted) { unsigned long nr_slab; struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2677,6 +2678,9 @@ static bool kswapd_shrink_zone(struct zone *zone, nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); sc->nr_reclaimed += reclaim_state->reclaimed_slab; + /* Account for the number of pages attempted to reclaim */ + *nr_attempted += sc->nr_to_reclaim; + if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; @@ -2724,7 +2728,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, do { unsigned long lru_pages = 0; + unsigned long nr_attempted = 0; bool raise_priority = true; + bool pgdat_needs_compaction = (order > 0); sc.nr_reclaimed = 0; @@ -2774,7 +2780,21 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; + lru_pages += zone_reclaimable_pages(zone); + + /* + * If any zone is currently balanced then kswapd will + * not call compaction as it is expected that the + * necessary pages are already available. + */ + if (pgdat_needs_compaction && + zone_watermark_ok(zone, order, + low_wmark_pages(zone), + *classzone_idx, 0)) + pgdat_needs_compaction = false; } /* @@ -2843,7 +2863,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * already being scanned that high * watermark would be met at 100% efficiency. */ - if (kswapd_shrink_zone(zone, &sc, lru_pages)) + if (kswapd_shrink_zone(zone, &sc, lru_pages, + &nr_attempted)) raise_priority = false; } @@ -2895,6 +2916,13 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (try_to_freeze() || kthread_should_stop()) break; + /* + * Compact if necessary and kswapd is reclaiming at least the + * high watermark number of pages as requsted + */ + if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) + compact_pgdat(pgdat, order); + /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages @@ -2904,33 +2932,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, } while (sc.priority >= 0 && !pgdat_balanced(pgdat, order, *classzone_idx)); - /* - * If kswapd was reclaiming at a higher order, it has the option of - * sleeping without all zones being balanced. Before it does, it must - * ensure that the watermarks for order-0 on *all* zones are met and - * that the congestion flags are cleared. The congestion flag must - * be cleared as kswapd is the only mechanism that clears the flag - * and it is potentially going to sleep here. - */ - if (order) { - int zones_need_compaction = 1; - - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* Check if the memory needs to be defragmented. */ - if (zone_watermark_ok(zone, order, - low_wmark_pages(zone), *classzone_idx, 0)) - zones_need_compaction = 0; - } - - if (zones_need_compaction) - compact_pgdat(pgdat, order); - } - out: /* * Return the order we were reclaiming at so prepare_kswapd_sleep() -- cgit v1.2.3 From bb222c199db19cd1fd4387ad44e80b78eee69eac Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 May 2013 10:37:17 +1000 Subject: mm: vmscan: do not allow kswapd to scan at maximum priority Page reclaim at priority 0 will scan the entire LRU as priority 0 is considered to be a near OOM condition. Kswapd can reach priority 0 quite easily if it is encountering a large number of pages it cannot reclaim such as pages under writeback. When this happens, kswapd reclaims very aggressively even though there may be no real risk of allocation failure or OOM. This patch prevents kswapd reaching priority 0 and trying to reclaim the world. Direct reclaimers will still reach priority 0 in the event of an OOM situation. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Johannes Weiner Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cd0980393bac..1505c573719d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2929,7 +2929,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, */ if (raise_priority || !sc.nr_reclaimed) sc.priority--; - } while (sc.priority >= 0 && + } while (sc.priority >= 1 && !pgdat_balanced(pgdat, order, *classzone_idx)); out: -- cgit v1.2.3 From 52f6064672b030457ca38c8afbf0dce446d399fd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 May 2013 10:37:18 +1000 Subject: mm: vmscan: have kswapd writeback pages based on dirty pages encountered, not priority Currently kswapd queues dirty pages for writeback if scanning at an elevated priority but the priority kswapd scans at is not related to the number of unqueued dirty encountered. Since commit "mm: vmscan: Flatten kswapd priority loop", the priority is related to the size of the LRU and the zone watermark which is no indication as to whether kswapd should write pages or not. This patch tracks if an excessive number of unqueued dirty pages are being encountered at the end of the LRU. If so, it indicates that dirty pages are being recycled before flusher threads can clean them and flags the zone so that kswapd will start writing pages until the zone is balanced. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 9 +++++++++ mm/vmscan.c | 31 +++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5c76737d836b..2aaf72f7e345 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -495,6 +495,10 @@ typedef enum { ZONE_CONGESTED, /* zone has many dirty pages backed by * a congested BDI */ + ZONE_TAIL_LRU_DIRTY, /* reclaim scanning has recently found + * many dirty file pages at the tail + * of the LRU. + */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -517,6 +521,11 @@ static inline int zone_is_reclaim_congested(const struct zone *zone) return test_bit(ZONE_CONGESTED, &zone->flags); } +static inline int zone_is_reclaim_dirty(const struct zone *zone) +{ + return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags); +} + static inline int zone_is_reclaim_locked(const struct zone *zone) { return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1505c573719d..d6c916d808ba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -676,13 +676,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, struct scan_control *sc, enum ttu_flags ttu_flags, - unsigned long *ret_nr_dirty, + unsigned long *ret_nr_unqueued_dirty, unsigned long *ret_nr_writeback, bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; @@ -808,14 +809,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { nr_dirty++; + if (!PageWriteback(page)) + nr_unqueued_dirty++; + /* * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but do not writeback - * unless under significant pressure. + * avoid risk of stack overflow but only writeback + * if many dirty pages have been encountered. */ if (page_is_file_cache(page) && (!current_is_kswapd() || - sc->priority >= DEF_PRIORITY - 2)) { + !zone_is_reclaim_dirty(zone))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -960,7 +964,7 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); mem_cgroup_uncharge_end(); - *ret_nr_dirty += nr_dirty; + *ret_nr_unqueued_dirty += nr_unqueued_dirty; *ret_nr_writeback += nr_writeback; return nr_reclaimed; } @@ -1373,6 +1377,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, (nr_taken >> (DEF_PRIORITY - sc->priority))) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + /* + * Similarly, if many dirty pages are encountered that are not + * currently being written then flag that kswapd should start + * writing back pages. + */ + if (global_reclaim(sc) && nr_dirty && + nr_dirty >= (nr_taken >> (DEF_PRIORITY - sc->priority))) + zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), nr_scanned, nr_reclaimed, @@ -2769,8 +2782,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, end_zone = i; break; } else { - /* If balanced, clear the congested flag */ + /* + * If balanced, clear the dirty and congested + * flags + */ zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); } } @@ -2888,8 +2905,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * possible there are dirty pages backed by * congested BDIs but as pressure is relieved, * speculatively avoid congestion waits + * or writing pages from kswapd context. */ zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); } /* -- cgit v1.2.3 From 61c1829aa14fb77853aa2625bf3e8412251b5d7d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 May 2013 10:37:18 +1000 Subject: mm: vmscan: block kswapd if it is encountering pages under writeback Historically, kswapd used to congestion_wait() at higher priorities if it was not making forward progress. This made no sense as the failure to make progress could be completely independent of IO. It was later replaced by wait_iff_congested() and removed entirely by commit 258401a6 (mm: don't wait on congested zones in balance_pgdat()) as it was duplicating logic in shrink_inactive_list(). This is problematic. If kswapd encounters many pages under writeback and it continues to scan until it reaches the high watermark then it will quickly skip over the pages under writeback and reclaim clean young pages or push applications out to swap. The use of wait_iff_congested() is not suited to kswapd as it will only stall if the underlying BDI is really congested or a direct reclaimer was unable to write to the underlying BDI. kswapd bypasses the BDI congestion as it sets PF_SWAPWRITE but even if this was taken into account then it would cause direct reclaimers to stall on writeback which is not desirable. This patch sets a ZONE_WRITEBACK flag if direct reclaim or kswapd is encountering too many pages under writeback. If this flag is set and kswapd encounters a PageReclaim page under writeback then it'll assume that the LRU lists are being recycled too quickly before IO can complete and block waiting for some IO to complete. Signed-off-by: Mel Gorman Reviewed-by: Michal Hocko Acked-by: Rik van Riel Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 8 ++++++ mm/vmscan.c | 78 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 64 insertions(+), 22 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2aaf72f7e345..fce64afba042 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -499,6 +499,9 @@ typedef enum { * many dirty file pages at the tail * of the LRU. */ + ZONE_WRITEBACK, /* reclaim scanning has recently found + * many pages under writeback + */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -526,6 +529,11 @@ static inline int zone_is_reclaim_dirty(const struct zone *zone) return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags); } +static inline int zone_is_reclaim_writeback(const struct zone *zone) +{ + return test_bit(ZONE_WRITEBACK, &zone->flags); +} + static inline int zone_is_reclaim_locked(const struct zone *zone) { return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index d6c916d808ba..911c9cd3d81a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -724,25 +724,51 @@ static unsigned long shrink_page_list(struct list_head *page_list, may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. In this case, wait on the IO to complete + * and then clear the ZONE_WRITEBACK flag to recheck if the + * condition exists. + * + * 2) Global reclaim encounters a page, memcg encounters a + * page that is not marked for immediate reclaim or + * the caller does not have __GFP_IO. In this case mark + * the page for immediate reclaim and continue scanning. + * + * __GFP_IO is checked because a loop driver thread might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. + * + * Don't require __GFP_FS, since we're not going into the + * FS, just waiting on its writeback completion. Worryingly, + * ext4 gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing + * may_enter_fs here is liable to OOM on them. + * + * 3) memcg encounters a page that is not already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ if (PageWriteback(page)) { - /* - * memcg doesn't have any dirty pages throttling so we - * could easily OOM just because too many pages are in - * writeback and there is nothing else to reclaim. - * - * Check __GFP_IO, certainly because a loop driver - * thread might enter reclaim, and deadlock if it waits - * on a page for which it is needed to do the write - * (loop masks off __GFP_IO|__GFP_FS for this reason); - * but more thought would probably show more reasons. - * - * Don't require __GFP_FS, since we're not going into - * the FS, just waiting on its writeback completion. - * Worryingly, ext4 gfs2 and xfs allocate pages with - * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so - * testing may_enter_fs here is liable to OOM on them. - */ - if (global_reclaim(sc) || + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + zone_is_reclaim_writeback(zone)) { + wait_on_page_writeback(page); + zone_clear_flag(zone, ZONE_WRITEBACK); + + /* Case 2 above */ + } else if (global_reclaim(sc) || !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { /* * This is slightly racy - end_page_writeback() @@ -757,9 +783,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; + goto keep_locked; + + /* Case 3 above */ + } else { + wait_on_page_writeback(page); } - wait_on_page_writeback(page); } if (!force_reclaim) @@ -1374,8 +1404,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * isolated page is PageWriteback */ if (nr_writeback && nr_writeback >= - (nr_taken >> (DEF_PRIORITY - sc->priority))) + (nr_taken >> (DEF_PRIORITY - sc->priority))) { wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + zone_set_flag(zone, ZONE_WRITEBACK); + } /* * Similarly, if many dirty pages are encountered that are not @@ -2669,8 +2701,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, * the high watermark. * * Returns true if kswapd scanned at least the requested number of pages to - * reclaim. This is used to determine if the scanning priority needs to be - * raised. + * reclaim or if the lack of progress was due to pages under writeback. + * This is used to determine if the scanning priority needs to be raised. */ static bool kswapd_shrink_zone(struct zone *zone, struct scan_control *sc, @@ -2697,6 +2729,8 @@ static bool kswapd_shrink_zone(struct zone *zone, if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; + zone_clear_flag(zone, ZONE_WRITEBACK); + return sc->nr_scanned >= sc->nr_to_reclaim; } -- cgit v1.2.3 From d407d1bdeee5b2436d507527481733a0ab515e2b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 May 2013 10:37:18 +1000 Subject: mm-vmscan-block-kswapd-if-it-is-encountering-pages-under-writeback-fix Historically, kswapd used to congestion_wait() at higher priorities if it was not making forward progress. This made no sense as the failure to make progress could be completely independent of IO. It was later replaced by wait_iff_congested() and removed entirely by commit 258401a6 (mm: don't wait on congested zones in balance_pgdat()) as it was duplicating logic in shrink_inactive_list(). This is problematic. If kswapd encounters many pages under writeback and it continues to scan until it reaches the high watermark then it will quickly skip over the pages under writeback and reclaim clean young pages or push applications out to swap. The use of wait_iff_congested() is not suited to kswapd as it will only stall if the underlying BDI is really congested or a direct reclaimer was unable to write to the underlying BDI. kswapd bypasses the BDI congestion as it sets PF_SWAPWRITE but even if this was taken into account then it would cause direct reclaimers to stall on writeback which is not desirable. This patch sets a ZONE_WRITEBACK flag if direct reclaim or kswapd is encountering too many pages under writeback. If this flag is set and kswapd encounters a PageReclaim page under writeback then it'll assume that the LRU lists are being recycled too quickly before IO can complete and block waiting for some IO to complete. Signed-off-by: Mel Gorman Cc: Mel Gorman Cc: Michal Hocko Cc: Rik van Riel Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Signed-off-by: Andrew Morton --- mm/vmscan.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 911c9cd3d81a..45aee36d9b93 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -732,9 +732,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, * under writeback and this page is both under writeback and * PageReclaim then it indicates that pages are being queued * for IO but are being recycled through the LRU before the - * IO can complete. In this case, wait on the IO to complete - * and then clear the ZONE_WRITEBACK flag to recheck if the - * condition exists. + * IO can complete. Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * block for HZ/10 or until some IO completes then clear the + * ZONE_WRITEBACK flag to recheck if the condition exists. * * 2) Global reclaim encounters a page, memcg encounters a * page that is not marked for immediate reclaim or @@ -764,7 +766,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (current_is_kswapd() && PageReclaim(page) && zone_is_reclaim_writeback(zone)) { - wait_on_page_writeback(page); + congestion_wait(BLK_RW_ASYNC, HZ/10); zone_clear_flag(zone, ZONE_WRITEBACK); /* Case 2 above */ -- cgit v1.2.3 From 1904f51bdf33c5ee8845b8da03bb2a0444efaff9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 May 2013 10:37:19 +1000 Subject: mm: vmscan: check if kswapd should writepage once per pgdat scan Currently kswapd checks if it should start writepage as it shrinks each zone without taking into consideration if the zone is balanced or not. This is not wrong as such but it does not make much sense either. This patch checks once per pgdat scan if kswapd should be writing pages. Signed-off-by: Mel Gorman Reviewed-by: Michal Hocko Acked-by: Rik van Riel Acked-by: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Signed-off-by: Andrew Morton --- mm/vmscan.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 45aee36d9b93..c223af21d251 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2850,6 +2850,13 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, pgdat_needs_compaction = false; } + /* + * If we're getting trouble reclaiming, start doing writepage + * even in laptop mode. + */ + if (sc.priority < DEF_PRIORITY - 2) + sc.may_writepage = 1; + /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. @@ -2921,13 +2928,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, raise_priority = false; } - /* - * If we're getting trouble reclaiming, start doing - * writepage even in laptop mode. - */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - if (zone->all_unreclaimable) { if (end_zone && end_zone == i) end_zone--; -- cgit v1.2.3 From 9a579c78a075d26dce7c9060f3100c47dc4ad1e0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 May 2013 10:37:19 +1000 Subject: mm: vmscan: move logic from balance_pgdat() to kswapd_shrink_zone() balance_pgdat() is very long and some of the logic can and should be internal to kswapd_shrink_zone(). Move it so the flow of balance_pgdat() is marginally easier to follow. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Michal Hocko Acked-by: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Signed-off-by: Andrew Morton --- mm/vmscan.c | 110 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 54 insertions(+), 56 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c223af21d251..b1b38adbcb5c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2707,18 +2707,53 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, * This is used to determine if the scanning priority needs to be raised. */ static bool kswapd_shrink_zone(struct zone *zone, + int classzone_idx, struct scan_control *sc, unsigned long lru_pages, unsigned long *nr_attempted) { unsigned long nr_slab; + int testorder = sc->order; + unsigned long balance_gap; struct reclaim_state *reclaim_state = current->reclaim_state; struct shrink_control shrink = { .gfp_mask = sc->gfp_mask, }; + bool lowmem_pressure; /* Reclaim above the high watermark. */ sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + + /* + * Kswapd reclaims only single pages with compaction enabled. Trying + * too hard to reclaim until contiguous free pages have become + * available can hurt performance by evicting too much useful data + * from memory. Do not reclaim more than needed for compaction. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + compaction_suitable(zone, sc->order) != + COMPACT_SKIPPED) + testorder = 0; + + /* + * We put equal pressure on every zone, unless one zone has way too + * many pages free already. The "too many pages" is defined as the + * high wmark plus a "gap" where the gap is either the low + * watermark or 1% of the zone, whichever is smaller. + */ + balance_gap = min(low_wmark_pages(zone), + (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + + /* + * If there is no low memory pressure or the zone is balanced then no + * reclaim is necessary + */ + lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); + if (!lowmem_pressure && zone_balanced(zone, testorder, + balance_gap, classzone_idx)) + return true; + shrink_zone(zone, sc); reclaim_state->reclaimed_slab = 0; @@ -2733,6 +2768,18 @@ static bool kswapd_shrink_zone(struct zone *zone, zone_clear_flag(zone, ZONE_WRITEBACK); + /* + * If a zone reaches its high watermark, consider it to be no longer + * congested. It's possible there are dirty pages backed by congested + * BDIs but as pressure is relieved, speculatively avoid congestion + * waits. + */ + if (!zone->all_unreclaimable && + zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + } + return sc->nr_scanned >= sc->nr_to_reclaim; } @@ -2868,8 +2915,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int testorder; - unsigned long balance_gap; if (!populated_zone(zone)) continue; @@ -2890,61 +2935,14 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, sc.nr_reclaimed += nr_soft_reclaimed; /* - * We put equal pressure on every zone, unless - * one zone has way too many pages free - * already. The "too many pages" is defined - * as the high wmark plus a "gap" where the - * gap is either the low watermark or 1% - * of the zone, whichever is smaller. - */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + - KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); - /* - * Kswapd reclaims only single pages with compaction - * enabled. Trying too hard to reclaim until contiguous - * free pages have become available can hurt performance - * by evicting too much useful data from memory. - * Do not reclaim more than needed for compaction. + * There should be no need to raise the scanning + * priority if enough pages are already being scanned + * that that high watermark would be met at 100% + * efficiency. */ - testorder = order; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - compaction_suitable(zone, order) != - COMPACT_SKIPPED) - testorder = 0; - - if ((buffer_heads_over_limit && is_highmem_idx(i)) || - !zone_balanced(zone, testorder, - balance_gap, end_zone)) { - /* - * There should be no need to raise the - * scanning priority if enough pages are - * already being scanned that high - * watermark would be met at 100% efficiency. - */ - if (kswapd_shrink_zone(zone, &sc, lru_pages, - &nr_attempted)) - raise_priority = false; - } - - if (zone->all_unreclaimable) { - if (end_zone && end_zone == i) - end_zone--; - continue; - } - - if (zone_balanced(zone, testorder, 0, end_zone)) - /* - * If a zone reaches its high watermark, - * consider it to be no longer congested. It's - * possible there are dirty pages backed by - * congested BDIs but as pressure is relieved, - * speculatively avoid congestion waits - * or writing pages from kswapd context. - */ - zone_clear_flag(zone, ZONE_CONGESTED); - zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + if (kswapd_shrink_zone(zone, end_zone, &sc, + lru_pages, &nr_attempted)) + raise_priority = false; } /* -- cgit v1.2.3 From 4d42be87d74e2aa6802defd6db785316440dac8c Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:19 +1000 Subject: mm: fix comment referring to non-existent size_seqlock, change to span_seqlock Signed-off-by: Cody P Schafer Acked-by: David Rientjes Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fce64afba042..9ec7cffcae21 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -733,7 +733,7 @@ typedef struct pglist_data { * or node_spanned_pages stay constant. Holding this will also * guarantee that any pfn_valid() stays that way. * - * Nests above zone->lock and zone->size_seqlock. + * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif -- cgit v1.2.3 From bf84dcefef39ec32b3c543844112e4884116c0dc Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:20 +1000 Subject: mmzone: note that node_size_lock should be manipulated via pgdat_resize_lock() Signed-off-by: Cody P Schafer Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9ec7cffcae21..e511f9429f1e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -733,6 +733,9 @@ typedef struct pglist_data { * or node_spanned_pages stay constant. Holding this will also * guarantee that any pfn_valid() stays that way. * + * pgdat_resize_lock() and pgdat_resize_unlock() are provided to + * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG. + * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; -- cgit v1.2.3 From 08b509cb9cae2b867637d1db912a6e3dd22425fd Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:20 +1000 Subject: memory_hotplug: use pgdat_resize_lock() in online_pages() mmzone.h documents node_size_lock (which pgdat_resize_lock() locks) as follows: * Must be held any time you expect node_start_pfn, node_present_pages * or node_spanned_pages stay constant. [...] So actually hold it when we update node_present_pages in online_pages(). Signed-off-by: Cody P Schafer Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 081b4d654ed6..a870d8071a3c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -918,6 +918,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { + unsigned long flags; unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; @@ -996,7 +997,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone->managed_pages += onlined_pages; zone->present_pages += onlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages += onlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + if (onlined_pages) { node_states_set_node(zone_to_nid(zone), &arg); if (need_zonelists_rebuild) -- cgit v1.2.3 From 3eb277581cc6872ce954a13caa8af240fab87478 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 May 2013 10:37:20 +1000 Subject: memory_hotplug: use pgdat_resize_lock() in __offline_pages() mmzone.h documents node_size_lock (which pgdat_resize_lock() locks) as follows: * Must be held any time you expect node_start_pfn, node_present_pages * or node_spanned_pages stay constant. [...] So actually hold it when we update node_present_pages in __offline_pages(). Signed-off-by: Cody P Schafer Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a870d8071a3c..330b2c982633 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1585,7 +1585,11 @@ repeat: /* removal success */ zone->managed_pages -= offlined_pages; zone->present_pages -= offlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages -= offlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + totalram_pages -= offlined_pages; init_per_zone_wmark_min(); -- cgit v1.2.3 From cacbf61ba5b83263af2cebd88e44e1fb9bb70c4c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:37:21 +1000 Subject: memory_hotplug-use-pgdat_resize_lock-in-__offline_pages-fix fix build Cc: Cody P Schafer Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 330b2c982633..bef133edbe49 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1492,6 +1492,7 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long pfn, nr_pages, expire; long offlined_pages; int ret, drain, retry_max, node; + unsigned long flags; struct zone *zone; struct memory_notify arg; -- cgit v1.2.3 From 3e8987f747e69d7af93a9f4e86201d452210b667 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:37:21 +1000 Subject: include/linux/mmzone.h: cleanups - implement zone_idx() in C to fix its references-args-twice macro bug - use zone_idx() in is_highmem() to remove large amounts of silly fluff. Cc: Lin Feng Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e511f9429f1e..a9296055976b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -835,7 +835,10 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ -#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +static inline enum zone_type zone_idx(struct zone *zone) +{ + return zone - zone->zone_pgdat->node_zones; +} static inline int populated_zone(struct zone *zone) { @@ -877,10 +880,10 @@ static inline int is_normal_idx(enum zone_type idx) static inline int is_highmem(struct zone *zone) { #ifdef CONFIG_HIGHMEM - int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones; - return zone_off == ZONE_HIGHMEM * sizeof(*zone) || - (zone_off == ZONE_MOVABLE * sizeof(*zone) && - zone_movable_is_highmem()); + enum zone_type idx = zone_idx(zone); + + return idx == ZONE_HIGHMEM || + (idx == ZONE_MOVABLE && zone_movable_is_highmem()); #else return 0; #endif -- cgit v1.2.3 From c1fb124205e1f5a097b451667329938296e02594 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:37:21 +1000 Subject: include-linux-mmzoneh-cleanups-fix use zone_idx() some more, further simplify is_highmem() Cc: Lin Feng Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a9296055976b..131989a8f574 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -879,25 +879,18 @@ static inline int is_normal_idx(enum zone_type idx) */ static inline int is_highmem(struct zone *zone) { -#ifdef CONFIG_HIGHMEM - enum zone_type idx = zone_idx(zone); - - return idx == ZONE_HIGHMEM || - (idx == ZONE_MOVABLE && zone_movable_is_highmem()); -#else - return 0; -#endif + return is_highmem_idx(zone_idx(zone)); } static inline int is_normal(struct zone *zone) { - return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; + return zone_idx(zone) == ZONE_NORMAL; } static inline int is_dma32(struct zone *zone) { #ifdef CONFIG_ZONE_DMA32 - return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; + return zone_idx(zone) == ZONE_DMA32; #else return 0; #endif @@ -906,7 +899,7 @@ static inline int is_dma32(struct zone *zone) static inline int is_dma(struct zone *zone) { #ifdef CONFIG_ZONE_DMA - return zone == zone->zone_pgdat->node_zones + ZONE_DMA; + return zone_idx(zone) == ZONE_DMA; #else return 0; #endif -- cgit v1.2.3 From 8e505b62d398fa161b18b2a4d147a1e3827d4fcd Mon Sep 17 00:00:00 2001 From: Mike Yoknis Date: Thu, 23 May 2013 10:37:22 +1000 Subject: mm: memmap_init_zone() performance improvement We have what we call an "architectural simulator". It is a computer program that pretends that it is a computer system. We use it to test the firmware before real hardware is available. We have booted Linux on our simulator. As you would expect it takes longer to boot on the simulator than it does on real hardware. With my patch - boot time 41 minutes Without patch - boot time 94 minutes These numbers do not scale linearly to real hardware. But indicate to me a place where Linux can be improved. memmap_init_zone() loops through every Page Frame Number (pfn), including pfn values that are within the gaps between existing memory sections. The unneeded looping will become a boot performance issue when machines configure larger memory ranges that will contain larger and more numerous gaps. The code will skip across invalid pfn values to reduce the number of loops executed. Signed-off-by: Mike Yoknis Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03a3f943d98e..ed06c1dffc9b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3934,8 +3934,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * exist on hotplugged memory. */ if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { + pfn = ALIGN(pfn + MAX_ORDER_NR_PAGES, + MAX_ORDER_NR_PAGES) - 1; continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; } -- cgit v1.2.3 From 2e99b590f62802888e9c2a22188ffc246e0334b3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 23 May 2013 10:37:22 +1000 Subject: drop_caches: add some documentation and info message I would like to resurrect Dave's patch. The last time it was posted was here https://lkml.org/lkml/2010/9/16/250 and there didn't seem to be any strong opposition. Kosaki was worried about possible excessive logging when somebody drops caches too often (but then he claimed he didn't have a strong opinion on that) but I would say opposite. If somebody does that then I would really like to know that from the log when supporting a system because it almost for sure means that there is something fishy going on. It is also worth mentioning that only root can write drop caches so this is not an flooding attack vector. I am bringing that up again because this can be really helpful when chasing strange performance issues which (surprise surprise) turn out to be related to artificially dropped caches done because the admin thinks this would help... I have just refreshed the original patch on top of the current mm tree but I could live with KERN_INFO as well if people think that KERN_NOTICE is too hysterical. : From: Dave Hansen : Date: Fri, 12 Oct 2012 14:30:54 +0200 : : There is plenty of anecdotal evidence and a load of blog posts : suggesting that using "drop_caches" periodically keeps your system : running in "tip top shape". Perhaps adding some kernel : documentation will increase the amount of accurate data on its use. : : If we are not shrinking caches effectively, then we have real bugs. : Using drop_caches will simply mask the bugs and make them harder : to find, but certainly does not fix them, nor is it an appropriate : "workaround" to limit the size of the caches. : : It's a great debugging tool, and is really handy for doing things : like repeatable benchmark runs. So, add a bit more documentation : about it, and add a little KERN_NOTICE. It should help developers : who are chasing down reclaim-related bugs. [mhocko@suse.cz: refreshed to current -mm tree] [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Dave Hansen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Michal Hocko Acked-by: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton --- Documentation/sysctl/vm.txt | 33 +++++++++++++++++++++++++++------ fs/drop_caches.c | 2 ++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index dcc75a9ed919..a5717c38834a 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -169,18 +169,39 @@ Setting this to zero disables periodic writeback altogether. drop_caches -Writing to this will cause the kernel to drop clean caches, dentries and -inodes from memory, causing that memory to become free. +Writing to this will cause the kernel to drop clean caches, as well as +reclaimable slab objects like dentries and inodes. Once dropped, their +memory becomes free. To free pagecache: echo 1 > /proc/sys/vm/drop_caches -To free dentries and inodes: +To free reclaimable slab objects (includes dentries and inodes): echo 2 > /proc/sys/vm/drop_caches -To free pagecache, dentries and inodes: +To free slab objects and pagecache: echo 3 > /proc/sys/vm/drop_caches -As this is a non-destructive operation and dirty objects are not freeable, the -user should run `sync' first. +This is a non-destructive operation and will not free any dirty objects. +To increase the number of objects freed by this operation, the user may run +`sync' prior to writing to /proc/sys/vm/drop_caches. This will minimize the +number of dirty objects on the system and create more candidates to be +dropped. + +This file is not a means to control the growth of the various kernel caches +(inodes, dentries, pagecache, etc...) These objects are automatically +reclaimed by the kernel when memory is needed elsewhere on the system. + +Use of this file can cause performance problems. Since it discards cached +objects, it may cost a significant amount of I/O and CPU to recreate the +dropped objects, especially if they were under heavy use. Because of this, +use outside of a testing or debugging environment is not recommended. + +You may see informational messages in your kernel log when this file is +used: + + cat (1234): dropped kernel caches: 3 + +These are informational only. They do not mean that anything is wrong +with your system. ============================================================== diff --git a/fs/drop_caches.c b/fs/drop_caches.c index c00e055b6282..f23d2a7ed438 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -58,6 +58,8 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, if (ret) return ret; if (write) { + printk(KERN_NOTICE "%s (%d): dropped kernel caches: %d\n", + current->comm, task_pid_nr(current), sysctl_drop_caches); if (sysctl_drop_caches & 1) iterate_supers(drop_pagecache_sb, NULL); if (sysctl_drop_caches & 2) -- cgit v1.2.3 From ece28cfeb88d744420da684cfb7cff28e50298e7 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Thu, 23 May 2013 10:37:22 +1000 Subject: drivers/usb/gadget/amd5536udc.c: avoid calling dma_pool_create() with NULL dev Calling dma_pool_create() with dev==NULL will oops on a NUMA machine. Rather than changing dma_pool_create() we wish to disallow passing dev==NULL. This requires fixing up the small number of drivers which are passing in dev==NULL. Use &dev->pdev->dev instead of NULL. Signed-off-by: Xi Wang Cc: Felipe Balbi Cc: Greg KH Signed-off-by: Andrew Morton --- drivers/usb/gadget/amd5536udc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index f52dcfe8f545..24bd363ca351 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ -3099,7 +3099,7 @@ static int init_dma_pools(struct udc *dev) } /* DMA setup */ - dev->data_requests = dma_pool_create("data_requests", NULL, + dev->data_requests = dma_pool_create("data_requests", &dev->pdev->dev, sizeof(struct udc_data_dma), 0, 0); if (!dev->data_requests) { DBG(dev, "can't get request data pool\n"); @@ -3111,7 +3111,7 @@ static int init_dma_pools(struct udc *dev) dev->ep[UDC_EP0IN_IX].dma = &dev->regs->ctl; /* dma desc for setup data */ - dev->stp_requests = dma_pool_create("setup requests", NULL, + dev->stp_requests = dma_pool_create("setup requests", &dev->pdev->dev, sizeof(struct udc_stp_dma), 0, 0); if (!dev->stp_requests) { DBG(dev, "can't get stp request pool\n"); -- cgit v1.2.3 From 795b68f158b6e33915779baaa1a2dfab5f260cc3 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Thu, 23 May 2013 10:37:23 +1000 Subject: mm/dmapool.c: fix null dev in dma_pool_create() A few drivers invoke dma_pool_create() with a null dev. Note that dev is dereferenced in dev_to_node(dev), causing a null pointer dereference. A long term solution is to disallow null dev. Once the drivers are fixed, we can simplify the core code here. For now we add WARN_ON(!dev) to notify the driver maintainers and avoid the null pointer dereference. Signed-off-by: Xi Wang Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/dmapool.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index c69781e97cf9..668f26316e2e 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -132,6 +132,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, { struct dma_pool *retval; size_t allocation; + int node; if (align == 0) { align = 1; @@ -156,7 +157,9 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, return NULL; } - retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + node = WARN_ON(!dev) ? -1 : dev_to_node(dev); + + retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, node); if (!retval) return retval; -- cgit v1.2.3 From f1c8af9ee833cd8c796ab70900d04b3dff040cab Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 23 May 2013 10:37:23 +1000 Subject: memcg: debugging facility to access dangling memcgs If memcg is tracking anything other than plain user memory (swap, tcp buf mem, or slab memory), it is possible - and normal - that a reference will be held by the group after it is dead. Still, for developers, it would be extremely useful to be able to query about those states during debugging. This patch provides a debugging facility in the root memcg, so we can inspect which memcgs still have pending objects, and what is the cause of this state. Signed-off-by: Glauber Costa Acked-by: Michal Hocko Cc: Kamezawa Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton --- Documentation/cgroups/memory.txt | 16 ++++ init/Kconfig | 17 ++++ mm/memcontrol.c | 192 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 218 insertions(+), 7 deletions(-) diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 3aaf7870a93e..5403b8cb1d09 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -72,6 +72,7 @@ Brief summary of control files. memory.move_charge_at_immigrate # set/show controls of moving charges memory.oom_control # set/show oom controls. memory.numa_stat # show the number of memory usage per numa node + memory.dangling_memcgs # show debugging information about dangling groups memory.kmem.limit_in_bytes # set/show hard limit for kernel memory memory.kmem.usage_in_bytes # show current kernel memory allocation @@ -581,6 +582,21 @@ unevictable= N0= N1= ... And we have total = file + anon + unevictable. +5.7 dangling_memcgs + +This file will only be ever present in the root cgroup, if the option +CONFIG_MEMCG_DEBUG_ASYNC_DESTROY is set. When a memcg is destroyed, the memory +consumed by it may not be immediately freed. This is because when some +extensions are used, such as swap or kernel memory, objects can outlive the +group and hold a reference to it. + +If this is the case, the dangling_memcgs file will show information about what +are the memcgs still alive, and which references are still preventing it to be +freed. There is nothing wrong with that, but it is very useful when debugging, +to know where this memory is being held. This is a developer-oriented debugging +facility only, and no guarantees of interface stability will be given. The file +is read-only, and has the sole purpose of displaying information. + 6. Hierarchy support The memory controller supports a deep hierarchy and hierarchical accounting. diff --git a/init/Kconfig b/init/Kconfig index 3909a39075e1..9790328a3806 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -940,6 +940,23 @@ config MEMCG_KMEM the kmem extension can use it to guarantee that no group of processes will ever exhaust kernel resources alone. +config MEMCG_DEBUG_ASYNC_DESTROY + bool "Memory Resource Controller Debug assynchronous object destruction" + depends on MEMCG_KMEM || MEMCG_SWAP + default n + help + When a memcg is destroyed, the memory + consumed by it may not be immediately freed. This is because when some + extensions are used, such as swap or kernel memory, objects can + outlive the group and hold a reference to it. + + If this is the case, the dangling_memcgs file will show information + about what are the memcgs still alive, and which references are still + preventing it to be freed. There is nothing wrong with that, but it is + very useful when debugging, to know where this memory is being held. + This is a developer-oriented debugging facility only, and no + guarantees of interface stability will be given. + config CGROUP_HUGETLB bool "HugeTLB Resource Controller for Control Groups" depends on RESOURCE_COUNTERS && HUGETLB_PAGE diff --git a/mm/memcontrol.c b/mm/memcontrol.c index caff46388129..e34da3c07a85 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -321,14 +321,31 @@ struct mem_cgroup { /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; - /* For oom notifier event fd */ - struct list_head oom_notify; + union { + /* For oom notifier event fd */ + struct list_head oom_notify; + /* + * we can only trigger an oom event if the memcg is alive. + * so we will reuse this field to hook the memcg in the list + * of dead memcgs. + */ + struct list_head dead; + }; - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? - */ - unsigned long move_charge_at_immigrate; + union { + /* + * Should we move charges of a task when a task is moved into + * this mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; + + /* + * We are no longer concerned about moving charges after memcg + * is dead. So we will fill this up with its name, to aid + * debugging. + */ + char *memcg_name; + }; /* * set > 0 if pages under this cgroup are moving to other cgroup. */ @@ -382,6 +399,55 @@ static size_t memcg_size(void) nr_node_ids * sizeof(struct mem_cgroup_per_node); } +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY +static LIST_HEAD(dangling_memcgs); +static DEFINE_MUTEX(dangling_memcgs_mutex); + +static inline void memcg_dangling_free(struct mem_cgroup *memcg) +{ + mutex_lock(&dangling_memcgs_mutex); + list_del(&memcg->dead); + mutex_unlock(&dangling_memcgs_mutex); + free_pages((unsigned long)memcg->memcg_name, 0); +} + +static inline void memcg_dangling_add(struct mem_cgroup *memcg) +{ + /* + * cgroup.c will do page-sized allocations most of the time, + * so we'll just follow the pattern. Also, __get_free_pages + * is a better interface than kmalloc for us here, because + * we'd like this memory to be always billed to the root cgroup, + * not to the process removing the memcg. While kmalloc would + * require us to wrap it into memcg_stop/resume_kmem_account, + * with __get_free_pages we just don't pass the memcg flag. + */ + memcg->memcg_name = (char *)__get_free_pages(GFP_KERNEL, 0); + + /* + * we will, in general, just ignore failures. No need to go crazy, + * being this just a debugging interface. It is nice to copy a memcg + * name over, but if we (unlikely) can't, just the address will do + */ + if (!memcg->memcg_name) + goto add_list; + + if (cgroup_path(memcg->css.cgroup, memcg->memcg_name, PAGE_SIZE) < 0) { + free_pages((unsigned long)memcg->memcg_name, 0); + memcg->memcg_name = NULL; + } + +add_list: + INIT_LIST_HEAD(&memcg->dead); + mutex_lock(&dangling_memcgs_mutex); + list_add(&memcg->dead, &dangling_memcgs); + mutex_unlock(&dangling_memcgs_mutex); +} +#else +static inline void memcg_dangling_free(struct mem_cgroup *memcg) {} +static inline void memcg_dangling_add(struct mem_cgroup *memcg) {} +#endif + /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ @@ -5103,6 +5169,107 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY +static void +mem_cgroup_dangling_swap(struct mem_cgroup *memcg, struct seq_file *m) +{ +#ifdef CONFIG_MEMCG_SWAP + u64 kmem; + u64 memsw; + + /* + * kmem will also propagate here, so we are only interested in the + * difference. See comment in mem_cgroup_reparent_charges for details. + * + * We could save this value for later consumption by kmem reports, but + * there is not a lot of problem if the figures differ slightly. + */ + kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE); + memsw = res_counter_read_u64(&memcg->memsw, RES_USAGE) - kmem; + seq_printf(m, "\t%llu swap bytes\n", memsw); +#endif +} + + +static void +mem_cgroup_dangling_tcp(struct mem_cgroup *memcg, struct seq_file *m) +{ +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) + struct tcp_memcontrol *tcp = &memcg->tcp_mem; + s64 tcp_socks; + u64 tcp_bytes; + + tcp_socks = percpu_counter_sum_positive(&tcp->tcp_sockets_allocated); + tcp_bytes = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); + seq_printf(m, "\t%llu tcp bytes", tcp_bytes); + /* + * if tcp_bytes == 0, tcp_socks != 0 is a bug. One more reason to print + * it! + */ + if (tcp_bytes || tcp_socks) + seq_printf(m, ", in %lld sockets", tcp_socks); + seq_printf(m, "\n"); + +#endif +} + +static void +mem_cgroup_dangling_kmem(struct mem_cgroup *memcg, struct seq_file *m) +{ +#ifdef CONFIG_MEMCG_KMEM + u64 kmem; + struct memcg_cache_params *params; + + kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE); + seq_printf(m, "\t%llu kmem bytes", kmem); + + /* list below may not be initialized, so not even try */ + if (!kmem) + return; + + seq_printf(m, " in caches"); + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + struct kmem_cache *s = memcg_params_to_cache(params); + + seq_printf(m, " %s", s->name); + } + mutex_unlock(&memcg->slab_caches_mutex); + seq_printf(m, "\n"); +#endif +} + +/* + * After a memcg is destroyed, it may still be kept around in memory. + * Currently, the two main reasons for it are swap entries, and kernel memory. + * Because they will be freed assynchronously, they will pin the memcg structure + * and its resources until the last reference goes away. + * + * This root-only file will show information about which users + */ +static int mem_cgroup_dangling_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) +{ + struct mem_cgroup *memcg; + + mutex_lock(&dangling_memcgs_mutex); + + list_for_each_entry(memcg, &dangling_memcgs, dead) { + if (memcg->memcg_name) + seq_printf(m, "%s:\n", memcg->memcg_name); + else + seq_printf(m, "%p (name lost):\n", memcg); + + mem_cgroup_dangling_swap(memcg, m); + mem_cgroup_dangling_tcp(memcg, m); + mem_cgroup_dangling_kmem(memcg, m); + } + + mutex_unlock(&dangling_memcgs_mutex); + return 0; +} +#endif + static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) { int ret = -EINVAL; @@ -6003,6 +6170,14 @@ static struct cftype mem_cgroup_files[] = { .read_seq_string = mem_cgroup_slabinfo_read, }, #endif +#endif + +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY + { + .name = "dangling_memcgs", + .read_seq_string = mem_cgroup_dangling_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, #endif { }, /* terminate */ }; @@ -6153,6 +6328,8 @@ static void free_work(struct work_struct *work) struct mem_cgroup *memcg; memcg = container_of(work, struct mem_cgroup, work_freeing); + + memcg_dangling_free(memcg); __mem_cgroup_free(memcg); } @@ -6347,6 +6524,7 @@ static void mem_cgroup_css_free(struct cgroup *cont) kmem_cgroup_destroy(memcg); + memcg_dangling_add(memcg); mem_cgroup_put(memcg); } -- cgit v1.2.3 From ab441f0fd6c0da30e0213e347450db7878a5527e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:37:23 +1000 Subject: memcg-debugging-facility-to-access-dangling-memcgs-fix fix up Kconfig text Cc: Glauber Costa Signed-off-by: Andrew Morton --- init/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 9790328a3806..9ddf124ee40d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -941,14 +941,14 @@ config MEMCG_KMEM will ever exhaust kernel resources alone. config MEMCG_DEBUG_ASYNC_DESTROY - bool "Memory Resource Controller Debug assynchronous object destruction" + bool "Memory Resource Controller Debug asynchronous object destruction" depends on MEMCG_KMEM || MEMCG_SWAP default n help - When a memcg is destroyed, the memory - consumed by it may not be immediately freed. This is because when some - extensions are used, such as swap or kernel memory, objects can - outlive the group and hold a reference to it. + When a memcg is destroyed, the memory consumed by it may not be + immediately freed. This is because when some extensions are used, such + as swap or kernel memory, objects can outlive the group and hold a + reference to it. If this is the case, the dangling_memcgs file will show information about what are the memcgs still alive, and which references are still -- cgit v1.2.3 From 797f51cf740d7397d5fa99caddb662cfe02534d0 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Thu, 23 May 2013 10:37:24 +1000 Subject: mm: add vm event counters for balloon pages compaction Introduce a new set of vm event counters to keep track of ballooned pages compaction activity. Signed-off-by: Rafael Aquini Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/balloon_compaction.h | 7 +++++++ include/linux/vm_event_item.h | 7 ++++++- mm/balloon_compaction.c | 2 ++ mm/migrate.c | 1 + mm/vmstat.c | 9 ++++++++- 5 files changed, 24 insertions(+), 2 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index f7f1d7169b11..6fd5cc80f62f 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -213,8 +213,15 @@ static inline bool balloon_compaction_check(void) return true; } +static inline void balloon_event_count(enum vm_event_item item) +{ + count_vm_event(item); +} #else /* !CONFIG_BALLOON_COMPACTION */ +/* A macro, to avoid generating references to the undefined COMPACTBALLOON* */ +#define balloon_event_count(item) do { } while (0) + static inline void *balloon_mapping_alloc(void *balloon_device, const struct address_space_operations *a_ops) { diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index bd6cf61142be..d4b7a184f08c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -50,7 +50,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, -#endif +#ifdef CONFIG_BALLOON_COMPACTION + COMPACTBALLOONISOLATED, /* isolated from balloon pagelist */ + COMPACTBALLOONMIGRATED, /* balloon page sucessfully migrated */ + COMPACTBALLOONRETURNED, /* putback to pagelist, not-migrated */ +#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_COMPACTION */ #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 07dbc8ec46cf..2c8ce496804f 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -242,6 +242,7 @@ bool balloon_page_isolate(struct page *page) if (__is_movable_balloon_page(page) && page_count(page) == 2) { __isolate_balloon_page(page); + balloon_event_count(COMPACTBALLOONISOLATED); unlock_page(page); return true; } @@ -265,6 +266,7 @@ void balloon_page_putback(struct page *page) __putback_balloon_page(page); /* drop the extra ref count taken for page isolation */ put_page(page); + balloon_event_count(COMPACTBALLOONRETURNED); } else { WARN_ON(1); dump_page(page); diff --git a/mm/migrate.c b/mm/migrate.c index b1f57501de9c..0f348946e288 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -876,6 +876,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); balloon_page_free(page); + balloon_event_count(COMPACTBALLOONMIGRATED); return MIGRATEPAGE_SUCCESS; } out: diff --git a/mm/vmstat.c b/mm/vmstat.c index f42745e65780..7a35116462a7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -794,7 +794,14 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", -#endif + +#ifdef CONFIG_BALLOON_COMPACTION + "compact_balloon_isolated", + "compact_balloon_migrated", + "compact_balloon_returned", +#endif /* CONFIG_BALLOON_COMPACTION */ + +#endif /* CONFIG_COMPACTION */ #ifdef CONFIG_HUGETLB_PAGE "htlb_buddy_alloc_success", -- cgit v1.2.3 From c739cbb102ae981e90975370a92f399e01a8008e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 May 2013 10:37:24 +1000 Subject: err.h: IS_ERR() can accept __user pointers Sparse generates a false positive when you pass a __user or __iomem pointer to the IS_ERR() functions. drivers/rtc/rtc-ds1286.c:344:36: sparse: incorrect type in argument 1 (different address spaces) drivers/rtc/rtc-ds1286.c:344:36: expected void const *ptr drivers/rtc/rtc-ds1286.c:344:36: got unsigned int [noderef] [usertype] *rtcregs We can silence these by adding a __force here and upgrading to the latest git release of Sparse. This change has no effect when using current Sparse releases. Signed-off-by: Dan Carpenter Acked-by: Christopher Li Signed-off-by: Andrew Morton --- include/linux/err.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/err.h b/include/linux/err.h index f2edce25a76b..221fcfb676c4 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -24,17 +24,17 @@ static inline void * __must_check ERR_PTR(long error) return (void *) error; } -static inline long __must_check PTR_ERR(const void *ptr) +static inline long __must_check PTR_ERR(__force const void *ptr) { return (long) ptr; } -static inline long __must_check IS_ERR(const void *ptr) +static inline long __must_check IS_ERR(__force const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } -static inline long __must_check IS_ERR_OR_NULL(const void *ptr) +static inline long __must_check IS_ERR_OR_NULL(__force const void *ptr) { return !ptr || IS_ERR_VALUE((unsigned long)ptr); } @@ -46,13 +46,13 @@ static inline long __must_check IS_ERR_OR_NULL(const void *ptr) * Explicitly cast an error-valued pointer to another pointer type in such a * way as to make it clear that's what's going on. */ -static inline void * __must_check ERR_CAST(const void *ptr) +static inline void * __must_check ERR_CAST(__force const void *ptr) { /* cast away the const */ return (void *) ptr; } -static inline int __must_check PTR_RET(const void *ptr) +static inline int __must_check PTR_RET(__force const void *ptr) { if (IS_ERR(ptr)) return PTR_ERR(ptr); -- cgit v1.2.3 From f1bffc8200cb5b3d7648c1c124e55cab59184042 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Thu, 23 May 2013 10:37:24 +1000 Subject: dump_stack: serialize the output from dump_stack() tAdd adds functionality to serialize the output from dump_stack() to avoid mangling of the output when dump_stack is called simultaneously from multiple cpus. Signed-off-by: Alex Thorlton Reported-by: Russ Anderson Reviewed-by: Robin Holt Cc: Vineet Gupta Cc: David S. Miller Cc: Richard Kuo Cc: Jesper Nilsson Signed-off-by: Andrew Morton --- lib/dump_stack.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 53bad099ebd6..710884b6a989 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -6,15 +6,62 @@ #include #include #include +#include +#include + +#ifdef CONFIG_SMP +#include +#endif /** * dump_stack - dump the current task information and its stack trace * * Architectures can override this implementation by implementing its own. */ +#ifdef CONFIG_SMP +static atomic_t dump_lock = ATOMIC_INIT(-1); + void dump_stack(void) +{ + int was_locked; + int old; + int cpu; + + /* + * Permit this cpu to perform nested stack dumps while serialising + * against other CPUs + */ + preempt_disable(); + +retry: + cpu = smp_processor_id(); + old = atomic_cmpxchg(&dump_lock, -1, cpu); + if (old == -1) { + was_locked = 0; + } else if (old == cpu) { + was_locked = 1; + } else { + cpu_relax(); + goto retry; + } + + __dump_stack(); + + if (!was_locked) + atomic_set(&dump_lock, -1); + + preempt_enable(); +} +#else +void dump_stack(void) +{ + __dump_stack(); +} +#endif +EXPORT_SYMBOL(dump_stack); + +static void __dump_stack(void) { dump_stack_print_info(KERN_DEFAULT); show_stack(NULL, NULL); } -EXPORT_SYMBOL(dump_stack); -- cgit v1.2.3 From a6d47090ca1d6071867de21783eeb28c01112631 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:37:25 +1000 Subject: dump_stack-serialize-the-output-from-dump_stack-fix - fix comment indenting - avoid inclusion of files - use where possible - fix uniprocessor build (__dump_stack undefined) - remove unneeded ifdef around smp.h inclusion Cc: Alex Thorlton Cc: David S. Miller Cc: Jesper Nilsson Cc: Richard Kuo Cc: Robin Holt Cc: Russ Anderson Cc: Vineet Gupta Cc: athorlton@sgi.com Signed-off-by: Andrew Morton --- lib/dump_stack.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 710884b6a989..c03154173cc7 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -6,12 +6,14 @@ #include #include #include -#include -#include - -#ifdef CONFIG_SMP #include -#endif +#include + +static void __dump_stack(void) +{ + dump_stack_print_info(KERN_DEFAULT); + show_stack(NULL, NULL); +} /** * dump_stack - dump the current task information and its stack trace @@ -27,10 +29,10 @@ void dump_stack(void) int old; int cpu; - /* - * Permit this cpu to perform nested stack dumps while serialising - * against other CPUs - */ + /* + * Permit this cpu to perform nested stack dumps while serialising + * against other CPUs + */ preempt_disable(); retry: @@ -59,9 +61,3 @@ void dump_stack(void) } #endif EXPORT_SYMBOL(dump_stack); - -static void __dump_stack(void) -{ - dump_stack_print_info(KERN_DEFAULT); - show_stack(NULL, NULL); -} -- cgit v1.2.3 From f971b812b9960898554154c2e31feff4c762ef2e Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Thu, 23 May 2013 10:37:25 +1000 Subject: panic: add cpu/pid to warn_slowpath_common in WARNING printk()s Add the cpu/pid that called WARN() so that the stack traces can be matched up with the WARNING messages. Signed-off-by: Alex Thorlton Reviewed-by: Robin Holt Cc: Stephen Boyd Cc: Vikram Mulukutla Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton --- kernel/panic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 167ec097ce8b..aaa0b7c7f51b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -399,8 +399,9 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); + pr_warn("------------[ cut here ]------------\n"); + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", ' + raw_smp_processor_id(), current->pid, file, line, caller); if (args) vprintk(args->fmt, args->args); -- cgit v1.2.3 From a1528bb7490bc33bf2da8e4c4cc51ed62893ae8e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:37:25 +1000 Subject: panic-add-cpu-pid-to-warn_slowpath_common-in-warning-printks-fix remove stray quote Cc: Alex Thorlton Signed-off-by: Andrew Morton --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/panic.c b/kernel/panic.c index aaa0b7c7f51b..97712319f128 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -400,7 +400,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { pr_warn("------------[ cut here ]------------\n"); - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", ' + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", raw_smp_processor_id(), current->pid, file, line, caller); if (args) -- cgit v1.2.3 From 6fed324471e282bc677322df4441f7e0b05357bf Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 23 May 2013 10:37:26 +1000 Subject: smp: Give WARN()ing when calling smp_call_function_many()/single() in serving irq Currently the functions smp_call_function_many()/single() will give a WARN()ing only in the case of irqs_disabled(), but that check is not enough to guarantee execution of the SMP cross-calls. In many other cases such as softirq handling/interrupt handling, the two APIs still can not be called, just as the smp_call_function_many() comments say: * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. There is a real case for softirq DEADLOCK case: CPUA CPUB spin_lock(&spinlock) Any irq coming, call the irq handler irq_exit() spin_lock_irq(&spinlock) <== Blocking here due to CPUB hold it __do_softirq() run_timer_softirq() timer_cb() call smp_call_function_many() send IPI interrupt to CPUA wait_csd() Then both CPUA and CPUB will be deadlocked here. So we should give a warning in the nmi, hardirq or softirq context as well. Moreover, adding one new macro in_serving_irq() which indicates we are processing nmi, hardirq or sofirq. Signed-off-by: liu chuansheng Cc: Ingo Molnar Cc: Peter Zijlstra Tested-by: Fengguang Wu Cc: Lai Jiangshan Signed-off-by: Andrew Morton --- include/linux/hardirq.h | 5 +++++ kernel/smp.c | 11 +++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 05bcc0903766..ecc8e01f492a 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -94,6 +94,11 @@ */ #define in_nmi() (preempt_count() & NMI_MASK) +/* + * Are we in nmi,irq context, or softirq context? + */ +#define in_serving_irq() (in_nmi() || in_irq() || in_serving_softirq()) + #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_CHECK_OFFSET 1 #else diff --git a/kernel/smp.c b/kernel/smp.c index 4dba0f7b72ad..97084cfc61ca 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "smpboot.h" @@ -240,8 +241,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress); + WARN_ON_ONCE(cpu_online(this_cpu) + && (irqs_disabled() || in_serving_irq()) + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -378,8 +380,9 @@ void smp_call_function_many(const struct cpumask *mask, * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress && !early_boot_irqs_disabled); + WARN_ON_ONCE(cpu_online(this_cpu) + && (irqs_disabled() || in_serving_irq()) + && !oops_in_progress && !early_boot_irqs_disabled); /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); -- cgit v1.2.3 From 78f0e597b13eb0315555948065a20415ce669fdc Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:26 +1000 Subject: backlight: atmel-pwm-bl: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d06310 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/video/backlight/atmel-pwm-bl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/video/backlight/atmel-pwm-bl.c b/drivers/video/backlight/atmel-pwm-bl.c index a60d6afca97c..0393d827dd44 100644 --- a/drivers/video/backlight/atmel-pwm-bl.c +++ b/drivers/video/backlight/atmel-pwm-bl.c @@ -195,7 +195,6 @@ static int __init atmel_pwm_bl_probe(struct platform_device *pdev) return 0; err_free_bl_dev: - platform_set_drvdata(pdev, NULL); backlight_device_unregister(bldev); err_free_pwm: pwm_channel_free(&pwmbl->pwmc); @@ -212,7 +211,6 @@ static int __exit atmel_pwm_bl_remove(struct platform_device *pdev) pwm_channel_disable(&pwmbl->pwmc); pwm_channel_free(&pwmbl->pwmc); backlight_device_unregister(pwmbl->bldev); - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3 From 6a2b1d30ce1f5cdfc72c65c4a008d38a8b1c3fb6 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:27 +1000 Subject: backlight: ep93xx: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/video/backlight/ep93xx_bl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/backlight/ep93xx_bl.c b/drivers/video/backlight/ep93xx_bl.c index 33455821dd31..018368ba4124 100644 --- a/drivers/video/backlight/ep93xx_bl.c +++ b/drivers/video/backlight/ep93xx_bl.c @@ -111,7 +111,6 @@ static int ep93xxbl_remove(struct platform_device *dev) struct backlight_device *bl = platform_get_drvdata(dev); backlight_device_unregister(bl); - platform_set_drvdata(dev, NULL); return 0; } -- cgit v1.2.3 From 9ab315d94a4b75bc6f5066b742b6313c2c075105 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:27 +1000 Subject: backlight: lp8788: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/video/backlight/lp8788_bl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/backlight/lp8788_bl.c b/drivers/video/backlight/lp8788_bl.c index 4bb8b4f140cf..980855ec9bb1 100644 --- a/drivers/video/backlight/lp8788_bl.c +++ b/drivers/video/backlight/lp8788_bl.c @@ -312,7 +312,6 @@ static int lp8788_backlight_remove(struct platform_device *pdev) backlight_update_status(bl_dev); sysfs_remove_group(&pdev->dev.kobj, &lp8788_attr_group); lp8788_backlight_unregister(bl); - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3 From b69097318af44c1389fbc57fe0bc321d5fdeacac Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:27 +1000 Subject: backlight: pcf50633: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/video/backlight/pcf50633-backlight.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/video/backlight/pcf50633-backlight.c b/drivers/video/backlight/pcf50633-backlight.c index e87c7a3394f3..6ed76be18f19 100644 --- a/drivers/video/backlight/pcf50633-backlight.c +++ b/drivers/video/backlight/pcf50633-backlight.c @@ -153,8 +153,6 @@ static int pcf50633_bl_remove(struct platform_device *pdev) backlight_device_unregister(pcf_bl->bl); - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 9b54d680491403bbc94d05d068d92020d49fd9a1 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 23 May 2013 10:37:28 +1000 Subject: rbtree: remove unneeded include Remove an unnecessary include to kernel.h Signed-off-by: Nathan Zimmer Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/rbtree.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 0022c1bb1e26..d2fdc6d1bdd2 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -29,7 +29,6 @@ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H -#include #include struct rb_node { -- cgit v1.2.3 From 3764319741be9e41c504dae71f66126d6a3358a2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:37:29 +1000 Subject: rbtree-remove-unneeded-include-fix fix lib/interval_tree.c build Cc: Dan Carpenter Cc: Nathan Zimmer Signed-off-by: Andrew Morton --- lib/interval_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/interval_tree.c b/lib/interval_tree.c index e6eb406f2d65..1efe7ab86dc1 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -1,3 +1,4 @@ +#include #include #include #include -- cgit v1.2.3 From bb0946f98890910638b234c7de5ba9b40e958030 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 23 May 2013 10:37:29 +1000 Subject: checkpatch: change CamelCase test and make it --strict Do not bleat a message on nominally acceptable CamelCase uses that are separated by an _ like drm_core_has_MTRR. CamelCase tests are also a bit noisy against certain types of code acceptable to some kernel developers. Make the test applicable only with --strict. Signed-off-by: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b954de58304f..f1aad19b475c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2938,12 +2938,12 @@ sub process { while ($line =~ m{($Constant|$Lval)}g) { my $var = $1; if ($var !~ /$Constant/ && - $var =~ /[A-Z]\w*[a-z]|[a-z]\w*[A-Z]/ && + $var =~ /[A-Z][a-z]|[a-z][A-Z]/ && $var !~ /"^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && !defined $camelcase{$var}) { $camelcase{$var} = 1; - WARN("CAMELCASE", - "Avoid CamelCase: <$var>\n" . $herecurr); + CHK("CAMELCASE", + "Avoid CamelCase: <$var>\n" . $herecurr); } } -- cgit v1.2.3 From 5a9f0a0451523f58fdbb2481d68f649d34d79ea4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 23 May 2013 10:37:29 +1000 Subject: checkpatch: warn when using gcc's binary constant ("0b") extension The gcc extension for binary constants that start with 0b is only supported with gcc version 4.3 or higher. The kernel can still be compiled with earlier versions of gcc, so have checkpatch emit a warning for these constants. Restructure checkpatch's constant finding code a bit to support finding these binary constants. Signed-off-by: Joe Perches Suggested-by: Andrew Morton Cc: Andy Whitcroft Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f1aad19b475c..517da26d9a2d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -230,11 +230,15 @@ our $Inline = qr{inline|__always_inline|noinline}; our $Member = qr{->$Ident|\.$Ident|\[[^]]*\]}; our $Lval = qr{$Ident(?:$Member)*}; +our $Int_type = qr{(?i)llu|ull|ll|lu|ul|l|u}; +our $Binary = qr{(?i)0b[01]+$Int_type?}; +our $Hex = qr{(?i)0x[0-9a-f]+$Int_type?}; +our $Int = qr{[0-9]+$Int_type?}; our $Float_hex = qr{(?i)0x[0-9a-f]+p-?[0-9]+[fl]?}; our $Float_dec = qr{(?i)(?:[0-9]+\.[0-9]*|[0-9]*\.[0-9]+)(?:e-?[0-9]+)?[fl]?}; our $Float_int = qr{(?i)[0-9]+e-?[0-9]+[fl]?}; our $Float = qr{$Float_hex|$Float_dec|$Float_int}; -our $Constant = qr{$Float|(?i)(?:0x[0-9a-f]+|[0-9]+)[ul]*}; +our $Constant = qr{$Float|$Binary|$Hex|$Int}; our $Assignment = qr{\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=}; our $Compare = qr{<=|>=|==|!=|<|>}; our $Operators = qr{ @@ -2934,9 +2938,17 @@ sub process { } } -#CamelCase +#Specific variable tests while ($line =~ m{($Constant|$Lval)}g) { my $var = $1; + +#gcc binary extension + if ($var =~ /^$Binary$/) { + WARN("GCC_BINARY_CONSTANT", + "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr); + } + +#CamelCase if ($var !~ /$Constant/ && $var =~ /[A-Z][a-z]|[a-z][A-Z]/ && $var !~ /"^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && -- cgit v1.2.3 From 2321967cf97a44e8ee2bd68808a0e066fe074583 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Thu, 23 May 2013 10:37:30 +1000 Subject: binfmt_elf.c: use get_random_int() to fix entropy depleting Entropy is quickly depleted under normal operations like ls(1), cat(1), etc... between 2.6.30 to current mainline, for instance: $ cat /proc/sys/kernel/random/entropy_avail 3428 $ cat /proc/sys/kernel/random/entropy_avail 2911 $cat /proc/sys/kernel/random/entropy_avail 2620 We observed this problem has been occurring since 2.6.30 with fs/binfmt_elf.c: create_elf_tables()->get_random_bytes(), introduced by f06295b44c296c8f ("ELF: implement AT_RANDOM for glibc PRNG seeding"). /* * Generate 16 random bytes for userspace PRNG seeding. */ get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); The patch introduces a wrapper around get_random_int() which has lower overhead than calling get_random_bytes() directly. With this patch applied: $ cat /proc/sys/kernel/random/entropy_avail 2731 $ cat /proc/sys/kernel/random/entropy_avail 2802 $ cat /proc/sys/kernel/random/entropy_avail 2878 Analyzed by John Sobecki. Signed-off-by: Jie Liu Cc: Andrew Morton Cc: Al Viro Cc: Andreas Dilger Cc: Alan Cox Cc: Arnd Bergmann Cc: John Sobecki Cc: James Morris Cc: Jakub Jelinek Cc: Ted Ts'o Cc: Greg Kroah-Hartman Acked-by: Kees Cook Cc: Ulrich Drepper Signed-off-by: Andrew Morton --- fs/binfmt_elf.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 100edcc5e312..53b6d624c7a9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -140,6 +140,25 @@ static int padzero(unsigned long elf_bss) #define ELF_BASE_PLATFORM NULL #endif +/* + * Use get_random_int() to implement AT_RANDOM while avoiding depletion + * of the entropy pool. + */ +static void get_atrandom_bytes(unsigned char *buf, size_t nbytes) +{ + unsigned char *p = buf; + + while (nbytes) { + unsigned int random_variable; + size_t chunk = min(nbytes, sizeof(random_variable)); + + random_variable = get_random_int(); + memcpy(p, &random_variable, chunk); + p += chunk; + nbytes -= chunk; + } +} + static int create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned long load_addr, unsigned long interp_load_addr) @@ -201,7 +220,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, /* * Generate 16 random bytes for userspace PRNG seeding. */ - get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes)); u_rand_bytes = (elf_addr_t __user *) STACK_ALLOC(p, sizeof(k_rand_bytes)); if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) -- cgit v1.2.3 From 58fc11e711e1c2cc48d6ac73f81889cae96b9c87 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 May 2013 10:37:30 +1000 Subject: init: remove permanent string buffer from do_one_initcall() do_one_initcall() uses a 64 byte string buffer to save a message. This buffer is declared static and is only used at boot up and when a module is loaded. As 64 bytes is very small, and this function has very limited scope, there's no reason to waste permanent memory with this string and not just simply put it on the stack. Signed-off-by: Steven Rostedt Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton --- init/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index 9484f4ba88d0..1b9e7f1b8008 100644 --- a/init/main.c +++ b/init/main.c @@ -655,8 +655,6 @@ static void __init do_ctors(void) bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); -static char msgbuf[64]; - static int __init_or_module do_one_initcall_debug(initcall_t fn) { ktime_t calltime, delta, rettime; @@ -679,6 +677,7 @@ int __init_or_module do_one_initcall(initcall_t fn) { int count = preempt_count(); int ret; + char msgbuf[64]; if (initcall_debug) ret = do_one_initcall_debug(fn); -- cgit v1.2.3 From 9bffe2e91fedaa6a78ba377a49e93c603ac722b9 Mon Sep 17 00:00:00 2001 From: Toralf Förster Date: Thu, 23 May 2013 10:37:30 +1000 Subject: insert missing space in printk line of root_delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trivial, but it really looks better. Signed-off-by: Toralf Förster Signed-off-by: Andrew Morton --- init/do_mounts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 33d060e91fcd..c84fedb12c24 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -536,7 +536,7 @@ void __init prepare_namespace(void) int is_floppy; if (root_delay) { - printk(KERN_INFO "Waiting %dsec before mounting root device...\n", + printk(KERN_INFO "Waiting %d sec before mounting root device...\n", root_delay); ssleep(root_delay); } -- cgit v1.2.3 From ac9da5aa4615bc0d5a499f3a62f48dc814620c57 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 23 May 2013 10:37:31 +1000 Subject: autofs4: allow autofs to work outside the initial PID namespace Enable autofs4 to work in a "container". oz_pgrp is converted from pid_t to struct pid and this is stored at mount time based on the "pgrp=" option or if the option is missing then the current pgrp. The "pgrp=" option is interpreted in the PID namespace of the current process. This option is flawed in that it doesn't carry the namespace information, so it should be deprecated. AFAICS the autofs daemon always sends the current pgrp, which is the default anyway. The oz_pgrp is also set from the AUTOFS_DEV_IOCTL_SETPIPEFD_CMD ioctl. This ioctl sets oz_pgrp to the current pgrp. It is not allowed to change the pid namespace. oz_pgrp is used mainly to determine whether the process traversing the autofs mount tree is the autofs daemon itself or not. This function now compares the pid pointers instead of the pid_t values. One other use of oz_pgrp is in autofs4_show_options. There is shows the virtual pid number (i.e. the one that is valid inside the PID namespace of the calling process) For debugging printk convert oz_pgrp to the value in the initial pid namespace. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Miklos Szeredi Acked-by: Serge Hallyn Cc: Eric Biederman Cc: Ian Kent Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- fs/autofs4/autofs_i.h | 4 ++-- fs/autofs4/dev-ioctl.c | 16 ++++++++++++++-- fs/autofs4/inode.c | 33 +++++++++++++++++++++++++-------- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 3f1128b37e46..16d3288c808d 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -104,7 +104,7 @@ struct autofs_sb_info { u32 magic; int pipefd; struct file *pipe; - pid_t oz_pgrp; + struct pid *oz_pgrp; int catatonic; int version; int sub_version; @@ -139,7 +139,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; + return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 743c7c2c949d..91838211b66d 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -346,6 +346,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, { int pipefd; int err = 0; + struct pid *new_pid = NULL; if (param->setpipefd.pipefd == -1) return -EINVAL; @@ -357,7 +358,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { - struct file *pipe = fget(pipefd); + struct file *pipe; + + new_pid = get_task_pid(current, PIDTYPE_PGID); + + if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { + AUTOFS_WARN("Not allowed to change PID namespace"); + err = -EINVAL; + goto out; + } + + pipe = fget(pipefd); if (!pipe) { err = -EBADF; goto out; @@ -367,12 +378,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, fput(pipe); goto out; } - sbi->oz_pgrp = task_pgrp_nr(current); + swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; } out: + put_pid(new_pid); mutex_unlock(&sbi->wq_mutex); return err; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index b104726e2d0a..1b045ecfcea2 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -62,6 +62,8 @@ void autofs4_kill_sb(struct super_block *sb) /* Free wait queues, close pipe */ autofs4_catatonic_mode(sbi); + put_pid(sbi->oz_pgrp); + sb->s_fs_info = NULL; kfree(sbi); @@ -85,7 +87,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, root_inode->i_gid)); - seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); @@ -129,7 +131,8 @@ static const match_table_t tokens = { }; static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, - pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) + int *pgrp, bool *pgrp_set, unsigned int *type, + int *minproto, int *maxproto) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -137,7 +140,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, *uid = current_uid(); *gid = current_gid(); - *pgrp = task_pgrp_nr(current); *minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -176,6 +178,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, if (match_int(args, &option)) return 1; *pgrp = option; + *pgrp_set = true; break; case Opt_minproto: if (match_int(args, &option)) @@ -211,6 +214,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; + int pgrp; + bool pgrp_set = false; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -223,7 +228,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->pipe = NULL; sbi->catatonic = 1; sbi->exp_timeout = 0; - sbi->oz_pgrp = task_pgrp_nr(current); + sbi->oz_pgrp = NULL; sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; @@ -260,12 +265,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) /* Can this call block? */ if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, - &sbi->max_proto)) { + &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, + &sbi->max_proto)) { printk("autofs: called with bogus options\n"); goto fail_dput; } + if (pgrp_set) { + sbi->oz_pgrp = find_get_pid(pgrp); + if (!sbi->oz_pgrp) { + pr_warn("autofs: could not find process group %d\n", + pgrp); + goto fail_dput; + } + } else { + sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + } + if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); @@ -289,9 +305,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); - + if (!pipe) { printk("autofs: could not open pipe file descriptor\n"); goto fail_dput; @@ -321,6 +337,7 @@ fail_dput: fail_ino: kfree(ino); fail_free: + put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; fail_unlock: -- cgit v1.2.3 From b5df5210f5406eb292c9fe2a075772451aa812a3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 23 May 2013 10:37:31 +1000 Subject: autofs4: translate pids to the right namespace for the daemon The PID and the TGID of the process triggering the mount are sent to the daemon. Currently the global pid values are sent (ones valid in the initial pid namespace) but this is wrong if the autofs daemon itself is not running in the initial pid namespace. So send the pid values that are valid in the namespace of the autofs daemon. The namespace to use is taken from the oz_pgrp pid pointer, which was set at mount time to the mounting process' pid namespace. If the pid translation fails (the triggering process is in an unrelated pid namespace) then the automount fails with ENOENT. Signed-off-by: Miklos Szeredi Acked-by: Serge Hallyn Cc: Eric Biederman Cc: Ian Kent Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- fs/autofs4/waitq.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 3db70dae40d3..309ca6bcbb09 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -353,11 +353,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, struct qstr qstr; char *name; int status, ret, type; + pid_t pid; + pid_t tgid; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) return -ENOENT; + /* + * Try translating pids to the namespace of the daemon. + * + * Zero means failure: we are in an unrelated pid namespace. + */ + pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + if (pid == 0 || tgid == 0) + return -ENOENT; + if (!dentry->d_inode) { /* * A wait for a negative dentry is invalid for certain @@ -423,8 +435,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->ino = autofs4_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); - wq->pid = current->pid; - wq->tgid = current->tgid; + wq->pid = pid; + wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ wq->wait_ctr = 2; mutex_unlock(&sbi->wq_mutex); -- cgit v1.2.3 From cc9a85ad0d71c47822ef1591479d2a598c9a15a2 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:31 +1000 Subject: rtc: rtc-88pm80x: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-88pm80x.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-88pm80x.c b/drivers/rtc/rtc-88pm80x.c index f3742f364eb8..354c937a5866 100644 --- a/drivers/rtc/rtc-88pm80x.c +++ b/drivers/rtc/rtc-88pm80x.c @@ -345,7 +345,6 @@ out: static int pm80x_rtc_remove(struct platform_device *pdev) { struct pm80x_rtc_info *info = platform_get_drvdata(pdev); - platform_set_drvdata(pdev, NULL); pm80x_free_irq(info->chip, info->irq, info); return 0; } -- cgit v1.2.3 From 9986f288f16e7735dc8e81e2653a679bcb4f94e7 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:32 +1000 Subject: drivers/rtc/rtc-v3020.c: remove redundant goto Remove a redundant goto statement left over during the conversion of this driver to use devm_* APIs. Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-v3020.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index 6e0cba8f47d5..4ee0e63ebba7 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -320,7 +320,7 @@ static int rtc_probe(struct platform_device *pdev) retval = chip->ops->map_io(chip, pdev, pdata); if (retval) - goto err_chip; + return retval; /* Make sure the v3020 expects a communication cycle * by reading 8 times */ @@ -364,7 +364,7 @@ static int rtc_probe(struct platform_device *pdev) err_io: chip->ops->unmap_io(chip); -err_chip: + return retval; } -- cgit v1.2.3 From 063feda69e3bf8a89122f315b199d8e379b35cdf Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:32 +1000 Subject: drivers/rtc/interface.c: fix checkpatch errors Fixes the following types of errors: ERROR: "foo* bar" should be "foo *bar" ERROR: else should follow close brace '}' WARNING: braces {} are not necessary for single statement blocks Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/interface.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 42bd57da239d..14c1efdd0757 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -109,9 +109,9 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs) err = rtc->ops->set_time(rtc->dev.parent, &new); } - } - else + } else { err = -EINVAL; + } mutex_unlock(&rtc->ops_lock); /* A timer might have just expired */ @@ -367,14 +367,14 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; - if (rtc->aie_timer.enabled) { + if (rtc->aie_timer.enabled) rtc_timer_remove(rtc, &rtc->aie_timer); - } + rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); rtc->aie_timer.period = ktime_set(0, 0); - if (alarm->enabled) { + if (alarm->enabled) err = rtc_timer_enqueue(rtc, &rtc->aie_timer); - } + mutex_unlock(&rtc->ops_lock); return err; } @@ -891,7 +891,7 @@ again: * * Kernel interface to initializing an rtc_timer. */ -void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data) +void rtc_timer_init(struct rtc_timer *timer, void (*f)(void *p), void *data) { timerqueue_init(&timer->node); timer->enabled = 0; @@ -907,7 +907,7 @@ void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data) * * Kernel interface to set an rtc_timer */ -int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, +int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer, ktime_t expires, ktime_t period) { int ret = 0; @@ -930,7 +930,7 @@ int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, * * Kernel interface to cancel an rtc_timer */ -int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer) +int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer) { int ret = 0; mutex_lock(&rtc->ops_lock); -- cgit v1.2.3 From 8961556681a8ecfb93ccc49f55d480b2d1409cc6 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:32 +1000 Subject: drivers/rtc/rtc-at32ap700x.c: fix checkpatch error Fixes the following error: ERROR: space required before the open parenthesis '(' Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-at32ap700x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-at32ap700x.c b/drivers/rtc/rtc-at32ap700x.c index f47fbb5eee8b..128896da1a14 100644 --- a/drivers/rtc/rtc-at32ap700x.c +++ b/drivers/rtc/rtc-at32ap700x.c @@ -141,7 +141,7 @@ static int at32_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) spin_lock_irq(&rtc->lock); - if(enabled) { + if (enabled) { if (rtc_readl(rtc, VAL) > rtc->alarm_time) { ret = -EINVAL; goto out; -- cgit v1.2.3 From 81fedf39e9f8af69c1b928e6e7de9856b4d59cfe Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:33 +1000 Subject: drivers/rtc/rtc-at91rm9200.c: include Silences the following checkpatch warning: WARNING: Use #include instead of Signed-off-by: Sachin Kamat Cc: Andrew Victor Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-at91rm9200.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 0eab77b22340..0fea9d7a89d3 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -30,8 +30,7 @@ #include #include #include - -#include +#include #include "rtc-at91rm9200.h" -- cgit v1.2.3 From 5b6aa1ed147e384bcf994d7f3bbe20e5983ba7a8 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:33 +1000 Subject: drivers/rtc/rtc-cmos.c: fix whitespace related errors Fixes the following types of issues: ERROR: space required after that ',' (ctx:VxV) WARNING: please, no spaces at the start of a line Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-cmos.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index cc5bea9c4b1c..60b6fd4bee5f 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -326,7 +326,7 @@ static void cmos_irq_disable(struct cmos_rtc *cmos, unsigned char mask) static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) { struct cmos_rtc *cmos = dev_get_drvdata(dev); - unsigned char mon, mday, hrs, min, sec, rtc_control; + unsigned char mon, mday, hrs, min, sec, rtc_control; if (!is_valid_irq(cmos->irq)) return -EIO; @@ -691,7 +691,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) /* FIXME: * doesn't know 12-hour mode either. */ - if (is_valid_irq(rtc_irq) && !(rtc_control & RTC_24H)) { + if (is_valid_irq(rtc_irq) && !(rtc_control & RTC_24H)) { dev_warn(dev, "only 24-hr supported\n"); retval = -ENXIO; goto cleanup1; @@ -989,7 +989,7 @@ static int cmos_pnp_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) { cmos_wake_setup(&pnp->dev); - if (pnp_port_start(pnp,0) == 0x70 && !pnp_irq_valid(pnp,0)) + if (pnp_port_start(pnp, 0) == 0x70 && !pnp_irq_valid(pnp, 0)) /* Some machines contain a PNP entry for the RTC, but * don't define the IRQ. It should always be safe to * hardcode it in these cases -- cgit v1.2.3 From ecab1d6cbbb9c935fccf16a6c0a0484565966d05 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:33 +1000 Subject: drivers/rtc/rtc-davinci.c: fix whitespace warning Silences the following warning: WARNING: please, no space before tabs Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-davinci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index a55048c3e26f..be5fc32d3cd7 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -117,7 +117,7 @@ static DEFINE_SPINLOCK(davinci_rtc_lock); struct davinci_rtc { - struct rtc_device *rtc; + struct rtc_device *rtc; void __iomem *base; resource_size_t pbase; size_t base_size; -- cgit v1.2.3 From 3ab39ac081b0044e402c7424182c758dfe7eef5b Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:34 +1000 Subject: drivers/rtc/rtc-ds1305.c: add missing braces around sizeof Silences the following type of warnings: WARNING: sizeof buf should be sizeof(buf) Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ds1305.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index bb5f13f63630..dd6170acde95 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -158,7 +158,7 @@ static int ds1305_alarm_irq_enable(struct device *dev, unsigned int enabled) goto done; buf[1] &= ~DS1305_AEI0; } - err = spi_write_then_read(ds1305->spi, buf, sizeof buf, NULL, 0); + err = spi_write_then_read(ds1305->spi, buf, sizeof(buf), NULL, 0); if (err >= 0) ds1305->ctrl[0] = buf[1]; done: @@ -181,8 +181,8 @@ static int ds1305_get_time(struct device *dev, struct rtc_time *time) /* Use write-then-read to get all the date/time registers * since dma from stack is nonportable */ - status = spi_write_then_read(ds1305->spi, &addr, sizeof addr, - buf, sizeof buf); + status = spi_write_then_read(ds1305->spi, &addr, sizeof(addr), + buf, sizeof(buf)); if (status < 0) return status; @@ -237,7 +237,7 @@ static int ds1305_set_time(struct device *dev, struct rtc_time *time) buf[4], buf[5], buf[6], buf[7]); /* use write-then-read since dma from stack is nonportable */ - return spi_write_then_read(ds1305->spi, buf, sizeof buf, + return spi_write_then_read(ds1305->spi, buf, sizeof(buf), NULL, 0); } @@ -286,8 +286,8 @@ static int ds1305_get_alarm(struct device *dev, struct rtc_wkalrm *alm) * of EFI status is at best fragile anyway (given IRQ handlers). */ addr = DS1305_CONTROL; - status = spi_write_then_read(spi, &addr, sizeof addr, - ds1305->ctrl, sizeof ds1305->ctrl); + status = spi_write_then_read(spi, &addr, sizeof(addr), + ds1305->ctrl, sizeof(ds1305->ctrl)); if (status < 0) return status; @@ -296,8 +296,8 @@ static int ds1305_get_alarm(struct device *dev, struct rtc_wkalrm *alm) /* get and check ALM0 registers */ addr = DS1305_ALM0(DS1305_SEC); - status = spi_write_then_read(spi, &addr, sizeof addr, - buf, sizeof buf); + status = spi_write_then_read(spi, &addr, sizeof(addr), + buf, sizeof(buf)); if (status < 0) return status; @@ -381,7 +381,7 @@ static int ds1305_set_alarm(struct device *dev, struct rtc_wkalrm *alm) "alm0 write", buf[1 + DS1305_SEC], buf[1 + DS1305_MIN], buf[1 + DS1305_HOUR], buf[1 + DS1305_WDAY]); - status = spi_write_then_read(spi, buf, sizeof buf, NULL, 0); + status = spi_write_then_read(spi, buf, sizeof(buf), NULL, 0); if (status < 0) return status; @@ -474,7 +474,7 @@ static void ds1305_work(struct work_struct *work) buf[1] = ds1305->ctrl[0]; buf[2] = 0; - status = spi_write_then_read(spi, buf, sizeof buf, + status = spi_write_then_read(spi, buf, sizeof(buf), NULL, 0); if (status < 0) dev_dbg(&spi->dev, "clear irq --> %d\n", status); @@ -627,8 +627,8 @@ static int ds1305_probe(struct spi_device *spi) /* read and cache control registers */ addr = DS1305_CONTROL; - status = spi_write_then_read(spi, &addr, sizeof addr, - ds1305->ctrl, sizeof ds1305->ctrl); + status = spi_write_then_read(spi, &addr, sizeof(addr), + ds1305->ctrl, sizeof(ds1305->ctrl)); if (status < 0) { dev_dbg(&spi->dev, "can't %s, %d\n", "read", status); @@ -659,7 +659,7 @@ static int ds1305_probe(struct spi_device *spi) buf[0] = DS1305_WRITE | DS1305_CONTROL; buf[1] = ds1305->ctrl[0]; - status = spi_write_then_read(spi, buf, sizeof buf, NULL, 0); + status = spi_write_then_read(spi, buf, sizeof(buf), NULL, 0); dev_dbg(&spi->dev, "clear WP --> %d\n", status); if (status < 0) @@ -713,7 +713,7 @@ static int ds1305_probe(struct spi_device *spi) buf[1] = ds1305->ctrl[0]; buf[2] = ds1305->ctrl[1]; buf[3] = ds1305->ctrl[2]; - status = spi_write_then_read(spi, buf, sizeof buf, NULL, 0); + status = spi_write_then_read(spi, buf, sizeof(buf), NULL, 0); if (status < 0) { dev_dbg(&spi->dev, "can't %s, %d\n", "write", status); @@ -725,8 +725,8 @@ static int ds1305_probe(struct spi_device *spi) /* see if non-Linux software set up AM/PM mode */ addr = DS1305_HOUR; - status = spi_write_then_read(spi, &addr, sizeof addr, - &value, sizeof value); + status = spi_write_then_read(spi, &addr, sizeof(addr), + &value, sizeof(value)); if (status < 0) { dev_dbg(&spi->dev, "read HOUR --> %d\n", status); return status; -- cgit v1.2.3 From c3680d20982ede4cf9d92e4b9fb9c40a042834ba Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:34 +1000 Subject: drivers/rtc/rtc-ds1374.c: fix spacing related issues Fixes the following types of issues: ERROR: code indent should use tabs where possible WARNING: please, no spaces at the start of a line Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ds1374.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c index 94366e12f40f..9e6e14fb53d7 100644 --- a/drivers/rtc/rtc-ds1374.c +++ b/drivers/rtc/rtc-ds1374.c @@ -65,7 +65,7 @@ struct ds1374 { static struct i2c_driver ds1374_driver; static int ds1374_read_rtc(struct i2c_client *client, u32 *time, - int reg, int nbytes) + int reg, int nbytes) { u8 buf[4]; int ret; @@ -90,7 +90,7 @@ static int ds1374_read_rtc(struct i2c_client *client, u32 *time, } static int ds1374_write_rtc(struct i2c_client *client, u32 time, - int reg, int nbytes) + int reg, int nbytes) { u8 buf[4]; int i; @@ -119,8 +119,7 @@ static int ds1374_check_rtc_status(struct i2c_client *client) if (stat & DS1374_REG_SR_OSF) dev_warn(&client->dev, - "oscillator discontinuity flagged, " - "time unreliable\n"); + "oscillator discontinuity flagged, time unreliable\n"); stat &= ~(DS1374_REG_SR_OSF | DS1374_REG_SR_AF); @@ -363,7 +362,7 @@ static int ds1374_probe(struct i2c_client *client, if (client->irq > 0) { ret = devm_request_irq(&client->dev, client->irq, ds1374_irq, 0, - "ds1374", client); + "ds1374", client); if (ret) { dev_err(&client->dev, "unable to request IRQ\n"); return ret; @@ -373,7 +372,7 @@ static int ds1374_probe(struct i2c_client *client, } ds1374->rtc = devm_rtc_device_register(&client->dev, client->name, - &ds1374_rtc_ops, THIS_MODULE); + &ds1374_rtc_ops, THIS_MODULE); if (IS_ERR(ds1374->rtc)) { dev_err(&client->dev, "unable to register the class device\n"); return PTR_ERR(ds1374->rtc); -- cgit v1.2.3 From 797c05048209929dc028cb933ad17e6644c7a8d9 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:34 +1000 Subject: drivers/rtc/rtc-ds1511.c: fix issues related to spaces and braces Fixes the following types of issues: WARNING: please, no spaces at the start of a line WARNING: braces {} are not necessary for single statement blocks Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ds1511.c | 93 +++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 49 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 6ce8a997cf51..308a8fefe76f 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -104,31 +104,31 @@ static DEFINE_SPINLOCK(ds1511_lock); static __iomem char *ds1511_base; static u32 reg_spacing = 1; - static noinline void +static noinline void rtc_write(uint8_t val, uint32_t reg) { writeb(val, ds1511_base + (reg * reg_spacing)); } - static inline void +static inline void rtc_write_alarm(uint8_t val, enum ds1511reg reg) { rtc_write((val | 0x80), reg); } - static noinline uint8_t +static noinline uint8_t rtc_read(enum ds1511reg reg) { return readb(ds1511_base + (reg * reg_spacing)); } - static inline void +static inline void rtc_disable_update(void) { rtc_write((rtc_read(RTC_CMD) & ~RTC_TE), RTC_CMD); } - static void +static void rtc_enable_update(void) { rtc_write((rtc_read(RTC_CMD) | RTC_TE), RTC_CMD); @@ -145,7 +145,7 @@ rtc_enable_update(void) * just enough code to set the watchdog timer so that it * will reboot the system */ - void +void ds1511_wdog_set(unsigned long deciseconds) { /* @@ -163,7 +163,7 @@ ds1511_wdog_set(unsigned long deciseconds) rtc_write(DS1511_WDE | DS1511_WDS, RTC_CMD); } - void +void ds1511_wdog_disable(void) { /* @@ -191,13 +191,12 @@ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) /* * won't have to change this for a while */ - if (rtc_tm->tm_year < 1900) { + if (rtc_tm->tm_year < 1900) rtc_tm->tm_year += 1900; - } - if (rtc_tm->tm_year < 1970) { + if (rtc_tm->tm_year < 1970) return -EINVAL; - } + yrs = rtc_tm->tm_year % 100; cen = rtc_tm->tm_year / 100; mon = rtc_tm->tm_mon + 1; /* tm_mon starts at zero */ @@ -207,17 +206,14 @@ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) min = rtc_tm->tm_min; sec = rtc_tm->tm_sec; - if ((mon > 12) || (day == 0)) { + if ((mon > 12) || (day == 0)) return -EINVAL; - } - if (day > rtc_month_days(rtc_tm->tm_mon, rtc_tm->tm_year)) { + if (day > rtc_month_days(rtc_tm->tm_mon, rtc_tm->tm_year)) return -EINVAL; - } - if ((hrs >= 24) || (min >= 60) || (sec >= 60)) { + if ((hrs >= 24) || (min >= 60) || (sec >= 60)) return -EINVAL; - } /* * each register is a different number of valid bits @@ -299,7 +295,7 @@ static int ds1511_rtc_read_time(struct device *dev, struct rtc_time *rtc_tm) * date/hours/mins/secs matches. the ds1511 has many more * permutations, but the kernel doesn't. */ - static void +static void ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) { unsigned long flags; @@ -322,7 +318,7 @@ ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) spin_unlock_irqrestore(&pdata->lock, flags); } - static int +static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct platform_device *pdev = to_platform_device(dev); @@ -335,14 +331,14 @@ ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) pdata->alrm_hour = alrm->time.tm_hour; pdata->alrm_min = alrm->time.tm_min; pdata->alrm_sec = alrm->time.tm_sec; - if (alrm->enabled) { + if (alrm->enabled) pdata->irqen |= RTC_AF; - } + ds1511_rtc_update_alarm(pdata); return 0; } - static int +static int ds1511_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct platform_device *pdev = to_platform_device(dev); @@ -359,7 +355,7 @@ ds1511_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) return 0; } - static irqreturn_t +static irqreturn_t ds1511_interrupt(int irq, void *dev_id) { struct platform_device *pdev = dev_id; @@ -406,7 +402,7 @@ static const struct rtc_class_ops ds1511_rtc_ops = { .alarm_irq_enable = ds1511_rtc_alarm_irq_enable, }; - static ssize_t +static ssize_t ds1511_nvram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *ba, char *buf, loff_t pos, size_t size) @@ -417,26 +413,26 @@ ds1511_nvram_read(struct file *filp, struct kobject *kobj, * if count is more than one, turn on "burst" mode * turn it off when you're done */ - if (size > 1) { + if (size > 1) rtc_write((rtc_read(RTC_CMD) | DS1511_BME), RTC_CMD); - } - if (pos > DS1511_RAM_MAX) { + + if (pos > DS1511_RAM_MAX) pos = DS1511_RAM_MAX; - } - if (size + pos > DS1511_RAM_MAX + 1) { + + if (size + pos > DS1511_RAM_MAX + 1) size = DS1511_RAM_MAX - pos + 1; - } + rtc_write(pos, DS1511_RAMADDR_LSB); - for (count = 0; size > 0; count++, size--) { + for (count = 0; size > 0; count++, size--) *buf++ = rtc_read(DS1511_RAMDATA); - } - if (count > 1) { + + if (count > 1) rtc_write((rtc_read(RTC_CMD) & ~DS1511_BME), RTC_CMD); - } + return count; } - static ssize_t +static ssize_t ds1511_nvram_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) @@ -447,22 +443,22 @@ ds1511_nvram_write(struct file *filp, struct kobject *kobj, * if count is more than one, turn on "burst" mode * turn it off when you're done */ - if (size > 1) { + if (size > 1) rtc_write((rtc_read(RTC_CMD) | DS1511_BME), RTC_CMD); - } - if (pos > DS1511_RAM_MAX) { + + if (pos > DS1511_RAM_MAX) pos = DS1511_RAM_MAX; - } - if (size + pos > DS1511_RAM_MAX + 1) { + + if (size + pos > DS1511_RAM_MAX + 1) size = DS1511_RAM_MAX - pos + 1; - } + rtc_write(pos, DS1511_RAMADDR_LSB); - for (count = 0; size > 0; count++, size--) { + for (count = 0; size > 0; count++, size--) rtc_write(*buf++, DS1511_RAMDATA); - } - if (count > 1) { + + if (count > 1) rtc_write((rtc_read(RTC_CMD) & ~DS1511_BME), RTC_CMD); - } + return count; } @@ -484,9 +480,9 @@ static int ds1511_rtc_probe(struct platform_device *pdev) int ret = 0; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { + if (!res) return -ENODEV; - } + pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; @@ -518,9 +514,8 @@ static int ds1511_rtc_probe(struct platform_device *pdev) /* * check for a dying bat-tree */ - if (rtc_read(RTC_CMD1) & DS1511_BLF1) { + if (rtc_read(RTC_CMD1) & DS1511_BLF1) dev_warn(&pdev->dev, "voltage-low detected.\n"); - } spin_lock_init(&pdata->lock); platform_set_drvdata(pdev, pdata); -- cgit v1.2.3 From 032a57cad72a77c4297c8039bee0a33fbed352aa Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:35 +1000 Subject: drivers/rtc/rtc-ds3234.c: fix whitespace issue Fixes the following warning: WARNING: please, no space before tabs Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ds3234.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-ds3234.c b/drivers/rtc/rtc-ds3234.c index ba98c0e9580d..67f1a80112e2 100644 --- a/drivers/rtc/rtc-ds3234.c +++ b/drivers/rtc/rtc-ds3234.c @@ -73,7 +73,7 @@ static int ds3234_read_time(struct device *dev, struct rtc_time *dt) dt->tm_wday = bcd2bin(buf[3]) - 1; /* 0 = Sun */ dt->tm_mday = bcd2bin(buf[4]); dt->tm_mon = bcd2bin(buf[5] & 0x1f) - 1; /* 0 = Jan */ - dt->tm_year = bcd2bin(buf[6] & 0xff) + 100; /* Assume 20YY */ + dt->tm_year = bcd2bin(buf[6] & 0xff) + 100; /* Assume 20YY */ return rtc_valid_tm(dt); } -- cgit v1.2.3 From 29761fd6fe6d9e11738d602d381da5c9e2e449e0 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:35 +1000 Subject: drivers/rtc/rtc-fm3130.c: fix whitespace related issue Silences the following checkpatch warning: WARNING: please, no space before tabs Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-fm3130.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-fm3130.c b/drivers/rtc/rtc-fm3130.c index 2835fb6c1965..8f053daf0f0c 100644 --- a/drivers/rtc/rtc-fm3130.c +++ b/drivers/rtc/rtc-fm3130.c @@ -47,7 +47,7 @@ struct fm3130 { u8 reg_addr_time; - u8 reg_addr_alarm; + u8 reg_addr_alarm; u8 regs[15]; struct i2c_msg msg[4]; struct i2c_client *client; -- cgit v1.2.3 From d5a9bce34d732a9cbd71d021f52fd7ca7f83b2ef Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:35 +1000 Subject: drivers/rtc/rtc-m41t80.c: fix spacing related issue Silences the following checkpatch warning: WARNING: space prohibited before semicolon Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-m41t80.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index 89674b5e6efd..a5248aa1abf1 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -168,7 +168,7 @@ static int m41t80_set_datetime(struct i2c_client *client, struct rtc_time *tm) buf[M41T80_REG_MIN] = bin2bcd(tm->tm_min) | (buf[M41T80_REG_MIN] & ~0x7f); buf[M41T80_REG_HOUR] = - bin2bcd(tm->tm_hour) | (buf[M41T80_REG_HOUR] & ~0x3f) ; + bin2bcd(tm->tm_hour) | (buf[M41T80_REG_HOUR] & ~0x3f); buf[M41T80_REG_WDAY] = (tm->tm_wday & 0x07) | (buf[M41T80_REG_WDAY] & ~0x07); buf[M41T80_REG_DAY] = -- cgit v1.2.3 From a09f61e8cb382a6934e1ae9b08a901378f535a28 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:36 +1000 Subject: drivers/rtc/rtc-max6902.c: remove unwanted spaces Silences the following type of warnings: WARNING: space prohibited between function name and open parenthesis '(' Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-max6902.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-max6902.c b/drivers/rtc/rtc-max6902.c index e3aea00c3145..e9dd56c8ffe7 100644 --- a/drivers/rtc/rtc-max6902.c +++ b/drivers/rtc/rtc-max6902.c @@ -159,7 +159,7 @@ static struct spi_driver max6902_driver = { module_spi_driver(max6902_driver); -MODULE_DESCRIPTION ("max6902 spi RTC driver"); -MODULE_AUTHOR ("Raphael Assenat"); -MODULE_LICENSE ("GPL"); +MODULE_DESCRIPTION("max6902 spi RTC driver"); +MODULE_AUTHOR("Raphael Assenat"); +MODULE_LICENSE("GPL"); MODULE_ALIAS("spi:rtc-max6902"); -- cgit v1.2.3 From 9664178c6d97175dd726b469294271aee638b347 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:36 +1000 Subject: drivers/rtc/rtc-max77686.c: remove space before semicolon Fixes the following warning: WARNING: space prohibited before semicolon Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-max77686.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-max77686.c b/drivers/rtc/rtc-max77686.c index 771812d62e6b..441c5c2e0bfc 100644 --- a/drivers/rtc/rtc-max77686.c +++ b/drivers/rtc/rtc-max77686.c @@ -119,7 +119,7 @@ static int max77686_rtc_tm_to_data(struct rtc_time *tm, u8 *data) data[RTC_WEEKDAY] = 1 << tm->tm_wday; data[RTC_DATE] = tm->tm_mday; data[RTC_MONTH] = tm->tm_mon + 1; - data[RTC_YEAR] = tm->tm_year > 100 ? (tm->tm_year - 100) : 0 ; + data[RTC_YEAR] = tm->tm_year > 100 ? (tm->tm_year - 100) : 0; if (tm->tm_year < 100) { pr_warn("%s: MAX77686 RTC cannot handle the year %d." -- cgit v1.2.3 From 32e1202ad75f77b2601e7fd3418c96e8200c31b3 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:36 +1000 Subject: drivers/rtc/rtc-max8997.c: remove space before semicolon Fixes the following warning: WARNING: space prohibited before semicolon Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-max8997.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-max8997.c b/drivers/rtc/rtc-max8997.c index dacf48db7925..168390428f86 100644 --- a/drivers/rtc/rtc-max8997.c +++ b/drivers/rtc/rtc-max8997.c @@ -104,7 +104,7 @@ static int max8997_rtc_tm_to_data(struct rtc_time *tm, u8 *data) data[RTC_WEEKDAY] = 1 << tm->tm_wday; data[RTC_DATE] = tm->tm_mday; data[RTC_MONTH] = tm->tm_mon + 1; - data[RTC_YEAR] = tm->tm_year > 100 ? (tm->tm_year - 100) : 0 ; + data[RTC_YEAR] = tm->tm_year > 100 ? (tm->tm_year - 100) : 0; if (tm->tm_year < 100) { pr_warn("%s: MAX8997 RTC cannot handle the year %d." -- cgit v1.2.3 From 6679d05f3f0d9f3f186a7b9d69735c73d44329a8 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:37 +1000 Subject: drivers/rtc/rtc-mpc5121.c: remove space before tab WARNING: please, no space before tabs Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-mpc5121.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c index bdcc60830aec..213006bfc69a 100644 --- a/drivers/rtc/rtc-mpc5121.c +++ b/drivers/rtc/rtc-mpc5121.c @@ -68,7 +68,7 @@ struct mpc5121_rtc_regs { u32 target_time; /* RTC + 0x20 */ /* * actual_time: - * readonly time since VBAT_RTC was last connected + * readonly time since VBAT_RTC was last connected */ u32 actual_time; /* RTC + 0x24 */ u32 keep_alive; /* RTC + 0x28 */ -- cgit v1.2.3 From 2a378411ceedb0b3519e1b3f39ff0bc3e042544b Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:37 +1000 Subject: drivers/rtc/rtc-msm6242.c: use pr_warn pr_warn is preferred to pr_warning. Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-msm6242.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-msm6242.c b/drivers/rtc/rtc-msm6242.c index 771f86a05d14..f0e9893ae7be 100644 --- a/drivers/rtc/rtc-msm6242.c +++ b/drivers/rtc/rtc-msm6242.c @@ -111,8 +111,8 @@ static void msm6242_lock(struct msm6242_priv *priv) } if (!cnt) - pr_warning("msm6242: timed out waiting for RTC (0x%x)\n", - msm6242_read(priv, MSM6242_CD)); + pr_warn("msm6242: timed out waiting for RTC (0x%x)\n", + msm6242_read(priv, MSM6242_CD)); } static void msm6242_unlock(struct msm6242_priv *priv) -- cgit v1.2.3 From be12ef9649a6bcc077cd96d901f1b06ec746514f Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:37 +1000 Subject: drivers/rtc/rtc-mxc.c: fix checkpatch error Fixes the following error: ERROR: spaces required around that '>=' (ctx:WxV) Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-mxc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 9a3895bc4f4d..bf00d6dbeadd 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -436,7 +436,7 @@ static int mxc_rtc_probe(struct platform_device *pdev) pdata->irq = -1; } - if (pdata->irq >=0) + if (pdata->irq >= 0) device_init_wakeup(&pdev->dev, 1); rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &mxc_rtc_ops, -- cgit v1.2.3 From 696bc66606a8548a5f8b242c9942da8072e5c967 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:38 +1000 Subject: drivers/rtc/rtc-omap.c: include instead of Use #include instead of as pointed out by checkpatch. Signed-off-by: Sachin Kamat Cc: George G. Davis Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-omap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index b0ba3fc991ea..6f6ac033d5d9 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -23,9 +23,7 @@ #include #include #include - -#include - +#include /* The OMAP1 RTC is a year/month/day/hours/minutes/seconds BCD clock * with century-range alarm matching, driven by the 32kHz clock. -- cgit v1.2.3 From 401772eabbc04b9518fbde0be047db3f22f9fd60 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:38 +1000 Subject: drivers/rtc/rtc-pcf2123.c: remove space before tabs Silences the following checkpatch warning: WARNING: please, no space before tabs Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-pcf2123.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-pcf2123.c b/drivers/rtc/rtc-pcf2123.c index 796a6c5067dd..b2a78a02cf16 100644 --- a/drivers/rtc/rtc-pcf2123.c +++ b/drivers/rtc/rtc-pcf2123.c @@ -18,11 +18,11 @@ * should look something like: * * static struct spi_board_info ek_spi_devices[] = { - * ... - * { - * .modalias = "rtc-pcf2123", - * .chip_select = 1, - * .controller_data = (void *)AT91_PIN_PA10, + * ... + * { + * .modalias = "rtc-pcf2123", + * .chip_select = 1, + * .controller_data = (void *)AT91_PIN_PA10, * .max_speed_hz = 1000 * 1000, * .mode = SPI_CS_HIGH, * .bus_num = 0, -- cgit v1.2.3 From 47d2efc298c17e59759c41bedf24fff7f3cb7b52 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:38 +1000 Subject: drivers/rtc/rtc-pcf8583.c: move assignment outside if condition Fixes the following checkpatch error: ERROR: do not use assignment in if condition Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-pcf8583.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-pcf8583.c b/drivers/rtc/rtc-pcf8583.c index 95886dcf4a39..9971f7f7cac8 100644 --- a/drivers/rtc/rtc-pcf8583.c +++ b/drivers/rtc/rtc-pcf8583.c @@ -188,7 +188,8 @@ static int pcf8583_rtc_read_time(struct device *dev, struct rtc_time *tm) dev_warn(dev, "resetting control %02x -> %02x\n", ctrl, new_ctrl); - if ((err = pcf8583_set_ctrl(client, &new_ctrl)) < 0) + err = pcf8583_set_ctrl(client, &new_ctrl); + if (err < 0) return err; } -- cgit v1.2.3 From a2ec1272869fdc73fdee4c765f1d81e7b8bfb71b Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:38 +1000 Subject: drivers/rtc/rtc-rs5c313.c: include instead of Use #include instead of as pointed out by checkpatch. Signed-off-by: Sachin Kamat Cc: Nobuhiro Iwamatsu Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-rs5c313.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-rs5c313.c b/drivers/rtc/rtc-rs5c313.c index 8089fc63e403..b603cc41ea77 100644 --- a/drivers/rtc/rtc-rs5c313.c +++ b/drivers/rtc/rtc-rs5c313.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include #define DRV_NAME "rs5c313" #define DRV_VERSION "1.13" -- cgit v1.2.3 From df2b2d409dab3b6c7b85a598af33b90f40caef54 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:39 +1000 Subject: drivers/rtc/rtc-rs5c313.c: fix spacing related issues Fixes the following types of checkpatch issues: WARNING: please, no space before tabs ERROR: space prohibited after that open parenthesis '(' ERROR: space prohibited before that close parenthesis ')' ERROR: need consistent spacing around '>>' (ctx:VxW) Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-rs5c313.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/rtc/rtc-rs5c313.c b/drivers/rtc/rtc-rs5c313.c index b603cc41ea77..6af0f1f95f63 100644 --- a/drivers/rtc/rtc-rs5c313.c +++ b/drivers/rtc/rtc-rs5c313.c @@ -50,7 +50,7 @@ #include #define DRV_NAME "rs5c313" -#define DRV_VERSION "1.13" +#define DRV_VERSION "1.13" #ifdef CONFIG_SH_LANDISK /*****************************************************/ @@ -301,7 +301,7 @@ static int rs5c313_rtc_set_time(struct device *dev, struct rtc_time *tm) rs5c313_write_reg(RS5C313_ADDR_SEC10, (data >> 4)); data = bin2bcd(tm->tm_min); - rs5c313_write_reg(RS5C313_ADDR_MIN, data ); + rs5c313_write_reg(RS5C313_ADDR_MIN, data); rs5c313_write_reg(RS5C313_ADDR_MIN10, (data >> 4)); data = bin2bcd(tm->tm_hour); @@ -310,7 +310,7 @@ static int rs5c313_rtc_set_time(struct device *dev, struct rtc_time *tm) data = bin2bcd(tm->tm_mday); rs5c313_write_reg(RS5C313_ADDR_DAY, data); - rs5c313_write_reg(RS5C313_ADDR_DAY10, (data>> 4)); + rs5c313_write_reg(RS5C313_ADDR_DAY10, (data >> 4)); data = bin2bcd(tm->tm_mon + 1); rs5c313_write_reg(RS5C313_ADDR_MON, data); @@ -349,9 +349,9 @@ static void rs5c313_check_xstp_bit(void) } memset(&tm, 0, sizeof(struct rtc_time)); - tm.tm_mday = 1; - tm.tm_mon = 1 - 1; - tm.tm_year = 2000 - 1900; + tm.tm_mday = 1; + tm.tm_mon = 1 - 1; + tm.tm_year = 2000 - 1900; rs5c313_rtc_set_time(NULL, &tm); pr_err("invalid value, resetting to 1 Jan 2000\n"); @@ -388,7 +388,7 @@ static struct platform_driver rs5c313_rtc_platform_driver = { .name = DRV_NAME, .owner = THIS_MODULE, }, - .probe = rs5c313_rtc_probe, + .probe = rs5c313_rtc_probe, .remove = rs5c313_rtc_remove, }; @@ -408,7 +408,7 @@ static int __init rs5c313_rtc_init(void) static void __exit rs5c313_rtc_exit(void) { - platform_driver_unregister( &rs5c313_rtc_platform_driver ); + platform_driver_unregister(&rs5c313_rtc_platform_driver); } module_init(rs5c313_rtc_init); -- cgit v1.2.3 From b0b3713ccb225249712c097e1c6ba6649d7cdd16 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:39 +1000 Subject: drivers/rtc/rtc-v3020.c: fix spacing issues Fixes the following type of issues: WARNING: please, no space before tabs Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-v3020.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index 4ee0e63ebba7..d07d89823020 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -16,7 +16,7 @@ * - Use the generic rtc class * * ??-???-2004: Someone at Compulab - * - Initial driver creation. + * - Initial driver creation. * */ #include @@ -278,13 +278,13 @@ static int v3020_set_time(struct device *dev, struct rtc_time *dt) dev_dbg(dev, "tm_year: %i\n", dt->tm_year); /* Write all the values to ram... */ - v3020_set_reg(chip, V3020_SECONDS, bin2bcd(dt->tm_sec)); - v3020_set_reg(chip, V3020_MINUTES, bin2bcd(dt->tm_min)); - v3020_set_reg(chip, V3020_HOURS, bin2bcd(dt->tm_hour)); + v3020_set_reg(chip, V3020_SECONDS, bin2bcd(dt->tm_sec)); + v3020_set_reg(chip, V3020_MINUTES, bin2bcd(dt->tm_min)); + v3020_set_reg(chip, V3020_HOURS, bin2bcd(dt->tm_hour)); v3020_set_reg(chip, V3020_MONTH_DAY, bin2bcd(dt->tm_mday)); - v3020_set_reg(chip, V3020_MONTH, bin2bcd(dt->tm_mon + 1)); - v3020_set_reg(chip, V3020_WEEK_DAY, bin2bcd(dt->tm_wday)); - v3020_set_reg(chip, V3020_YEAR, bin2bcd(dt->tm_year % 100)); + v3020_set_reg(chip, V3020_MONTH, bin2bcd(dt->tm_mon + 1)); + v3020_set_reg(chip, V3020_WEEK_DAY, bin2bcd(dt->tm_wday)); + v3020_set_reg(chip, V3020_YEAR, bin2bcd(dt->tm_year % 100)); /* ...and set the clock. */ v3020_set_reg(chip, V3020_CMD_RAM2CLOCK, 0); -- cgit v1.2.3 From a86773e416b3998aaca055531190771a0a66ac55 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:39 +1000 Subject: drivers/rtc/rtc-vr41xx.c: fix spacing issues Fixes the following types of issues: ERROR: code indent should use tabs where possible Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-vr41xx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index f91be04b9050..b940c2374904 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -103,7 +103,7 @@ static inline unsigned long read_elapsed_second(void) second_mid = rtc1_read(ETIMEMREG); second_high = rtc1_read(ETIMEHREG); } while (first_low != second_low || first_mid != second_mid || - first_high != second_high); + first_high != second_high); return (first_high << 17) | (first_mid << 1) | (first_low >> 15); } @@ -154,7 +154,7 @@ static int vr41xx_rtc_set_time(struct device *dev, struct rtc_time *time) epoch_sec = mktime(epoch, 1, 1, 0, 0, 0); current_sec = mktime(time->tm_year + 1900, time->tm_mon + 1, time->tm_mday, - time->tm_hour, time->tm_min, time->tm_sec); + time->tm_hour, time->tm_min, time->tm_sec); write_elapsed_second(current_sec - epoch_sec); @@ -186,7 +186,7 @@ static int vr41xx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) struct rtc_time *time = &wkalrm->time; alarm_sec = mktime(time->tm_year + 1900, time->tm_mon + 1, time->tm_mday, - time->tm_hour, time->tm_min, time->tm_sec); + time->tm_hour, time->tm_min, time->tm_sec); spin_lock_irq(&rtc_lock); @@ -334,7 +334,7 @@ static int rtc_probe(struct platform_device *pdev) } retval = request_irq(aie_irq, elapsedtime_interrupt, 0, - "elapsed_time", pdev); + "elapsed_time", pdev); if (retval < 0) goto err_device_unregister; @@ -343,7 +343,7 @@ static int rtc_probe(struct platform_device *pdev) goto err_free_irq; retval = request_irq(pie_irq, rtclong1_interrupt, 0, - "rtclong1", pdev); + "rtclong1", pdev); if (retval < 0) goto err_free_irq; -- cgit v1.2.3 From 454830a8f1d3c8e08ce2b1d73425d1ac6e655329 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:40 +1000 Subject: drivers/rtc/rtc-x1205.c: fix checkpatch issues Fixes the following types of issues: ERROR: do not use assignment in if condition ERROR: open brace '{' following struct go on the same line ERROR: else should follow close brace '}' WARNING: please, no space before tabs Signed-off-by: Sachin Kamat Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-x1205.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/drivers/rtc/rtc-x1205.c b/drivers/rtc/rtc-x1205.c index fa9b0679fb60..365dc6505148 100644 --- a/drivers/rtc/rtc-x1205.c +++ b/drivers/rtc/rtc-x1205.c @@ -4,7 +4,7 @@ * Copyright 2005 Alessandro Zummo * * please send all reports to: - * Karen Spearel + * Karen Spearel * Alessandro Zummo * * based on a lot of other RTC drivers. @@ -215,12 +215,14 @@ static int x1205_set_datetime(struct i2c_client *client, struct rtc_time *tm, buf[i] |= 0x80; /* this sequence is required to unlock the chip */ - if ((xfer = i2c_master_send(client, wel, 3)) != 3) { + xfer = i2c_master_send(client, wel, 3); + if (xfer != 3) { dev_err(&client->dev, "%s: wel - %d\n", __func__, xfer); return -EIO; } - if ((xfer = i2c_master_send(client, rwel, 3)) != 3) { + xfer = i2c_master_send(client, rwel, 3); + if (xfer != 3) { dev_err(&client->dev, "%s: rwel - %d\n", __func__, xfer); return -EIO; } @@ -269,7 +271,8 @@ static int x1205_set_datetime(struct i2c_client *client, struct rtc_time *tm, } /* disable further writes */ - if ((xfer = i2c_master_send(client, diswe, 3)) != 3) { + xfer = i2c_master_send(client, diswe, 3); + if (xfer != 3) { dev_err(&client->dev, "%s: diswe - %d\n", __func__, xfer); return -EIO; } @@ -375,8 +378,7 @@ static int x1205_get_atrim(struct i2c_client *client, int *trim) return 0; } -struct x1205_limit -{ +struct x1205_limit { unsigned char reg, mask, min, max; }; @@ -430,7 +432,8 @@ static int x1205_validate_client(struct i2c_client *client) }, }; - if ((xfer = i2c_transfer(client->adapter, msgs, 2)) != 2) { + xfer = i2c_transfer(client->adapter, msgs, 2); + if (xfer != 2) { dev_err(&client->dev, "%s: could not read register %x\n", __func__, probe_zero_pattern[i]); @@ -467,7 +470,8 @@ static int x1205_validate_client(struct i2c_client *client) }, }; - if ((xfer = i2c_transfer(client->adapter, msgs, 2)) != 2) { + xfer = i2c_transfer(client->adapter, msgs, 2); + if (xfer != 2) { dev_err(&client->dev, "%s: could not read register %x\n", __func__, probe_limits_pattern[i].reg); @@ -548,10 +552,12 @@ static int x1205_rtc_proc(struct device *dev, struct seq_file *seq) { int err, dtrim, atrim; - if ((err = x1205_get_dtrim(to_i2c_client(dev), &dtrim)) == 0) + err = x1205_get_dtrim(to_i2c_client(dev), &dtrim); + if (!err) seq_printf(seq, "digital_trim\t: %d ppm\n", dtrim); - if ((err = x1205_get_atrim(to_i2c_client(dev), &atrim)) == 0) + err = x1205_get_atrim(to_i2c_client(dev), &atrim); + if (!err) seq_printf(seq, "analog_trim\t: %d.%02d pF\n", atrim / 1000, atrim % 1000); return 0; @@ -639,7 +645,8 @@ static int x1205_probe(struct i2c_client *client, i2c_set_clientdata(client, rtc); /* Check for power failures and eventually enable the osc */ - if ((err = x1205_get_status(client, &sr)) == 0) { + err = x1205_get_status(client, &sr); + if (!err) { if (sr & X1205_SR_RTCF) { dev_err(&client->dev, "power failure detected, " @@ -647,9 +654,9 @@ static int x1205_probe(struct i2c_client *client, udelay(50); x1205_fix_osc(client); } - } - else + } else { dev_err(&client->dev, "couldn't read status\n"); + } err = x1205_sysfs_register(&client->dev); if (err) -- cgit v1.2.3 From 30dce8f48c18b06c977e252baed39e23ca5c9146 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:40 +1000 Subject: rtc: rtc-88pm860x: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-88pm860x.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-88pm860x.c b/drivers/rtc/rtc-88pm860x.c index 0f2b91bfee37..4e30c85728e5 100644 --- a/drivers/rtc/rtc-88pm860x.c +++ b/drivers/rtc/rtc-88pm860x.c @@ -418,7 +418,6 @@ static int pm860x_rtc_remove(struct platform_device *pdev) pm860x_set_bits(info->i2c, PM8607_MEAS_EN2, MEAS2_VRTC, 0); #endif /* VRTC_CALIBRATION */ - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3 From 30601d7f84deb92edf969a3d9f06e5c290456a60 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:40 +1000 Subject: rtc: rtc-ab3100: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Cc: Lars-Peter Clausen Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ab3100.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-ab3100.c b/drivers/rtc/rtc-ab3100.c index 47a4f2c4d30e..572208d67688 100644 --- a/drivers/rtc/rtc-ab3100.c +++ b/drivers/rtc/rtc-ab3100.c @@ -242,7 +242,6 @@ static int __init ab3100_rtc_probe(struct platform_device *pdev) static int __exit ab3100_rtc_remove(struct platform_device *pdev) { - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3 From 16f3e19731502a5a359705c7ad4f8c4f21fa5121 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:41 +1000 Subject: rtc: rtc-ab8500: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Cc: Srinidhi Kasagar Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ab8500.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index 63cfa314a39f..c5b62d4389ee 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -451,8 +451,6 @@ static int ab8500_rtc_remove(struct platform_device *pdev) { ab8500_sysfs_rtc_unregister(&pdev->dev); - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 313f61dcd16e74d8a51780502fbe3ea72766bc2d Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:41 +1000 Subject: rtc: rtc-at32ap700x: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-at32ap700x.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/rtc/rtc-at32ap700x.c b/drivers/rtc/rtc-at32ap700x.c index 128896da1a14..3161ab5263ed 100644 --- a/drivers/rtc/rtc-at32ap700x.c +++ b/drivers/rtc/rtc-at32ap700x.c @@ -212,23 +212,20 @@ static int __init at32_rtc_probe(struct platform_device *pdev) regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!regs) { dev_dbg(&pdev->dev, "no mmio resource defined\n"); - ret = -ENXIO; - goto out; + return -ENXIO; } irq = platform_get_irq(pdev, 0); if (irq <= 0) { dev_dbg(&pdev->dev, "could not get irq\n"); - ret = -ENXIO; - goto out; + return -ENXIO; } rtc->irq = irq; rtc->regs = devm_ioremap(&pdev->dev, regs->start, resource_size(regs)); if (!rtc->regs) { - ret = -ENOMEM; dev_dbg(&pdev->dev, "could not map I/O memory\n"); - goto out; + return -ENOMEM; } spin_lock_init(&rtc->lock); @@ -249,7 +246,7 @@ static int __init at32_rtc_probe(struct platform_device *pdev) "rtc", rtc); if (ret) { dev_dbg(&pdev->dev, "could not request irq %d\n", irq); - goto out; + return ret; } platform_set_drvdata(pdev, rtc); @@ -258,8 +255,7 @@ static int __init at32_rtc_probe(struct platform_device *pdev) &at32_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rtc)) { dev_dbg(&pdev->dev, "could not register rtc device\n"); - ret = PTR_ERR(rtc->rtc); - goto out; + return PTR_ERR(rtc->rtc); } device_init_wakeup(&pdev->dev, 1); @@ -268,18 +264,12 @@ static int __init at32_rtc_probe(struct platform_device *pdev) (unsigned long)rtc->regs, rtc->irq); return 0; - -out: - platform_set_drvdata(pdev, NULL); - return ret; } static int __exit at32_rtc_remove(struct platform_device *pdev) { device_init_wakeup(&pdev->dev, 0); - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 3ca7dccf69a80789f095a926a73eae2acbabea90 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:41 +1000 Subject: rtc: rtc-at91rm9200: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-at91rm9200.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 0fea9d7a89d3..38ef46a9094e 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -341,7 +341,6 @@ static int __exit at91_rtc_remove(struct platform_device *pdev) rtc_device_unregister(rtc); iounmap(at91_rtc_regs); - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3 From e6ad0d2a134366d61c18f921a0f037b7be04fef7 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:42 +1000 Subject: rtc: rtc-at91sam9: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-at91sam9.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/rtc/rtc-at91sam9.c b/drivers/rtc/rtc-at91sam9.c index b60a34cb145a..309b8b342d9c 100644 --- a/drivers/rtc/rtc-at91sam9.c +++ b/drivers/rtc/rtc-at91sam9.c @@ -324,16 +324,14 @@ static int at91_rtc_probe(struct platform_device *pdev) rtc->rtt = devm_ioremap(&pdev->dev, r->start, resource_size(r)); if (!rtc->rtt) { dev_err(&pdev->dev, "failed to map registers, aborting.\n"); - ret = -ENOMEM; - goto fail; + return -ENOMEM; } rtc->gpbr = devm_ioremap(&pdev->dev, r_gpbr->start, resource_size(r_gpbr)); if (!rtc->gpbr) { dev_err(&pdev->dev, "failed to map gpbr registers, aborting.\n"); - ret = -ENOMEM; - goto fail; + return -ENOMEM; } mr = rtt_readl(rtc, MR); @@ -350,17 +348,15 @@ static int at91_rtc_probe(struct platform_device *pdev) rtc->rtcdev = devm_rtc_device_register(&pdev->dev, pdev->name, &at91_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc->rtcdev)) { - ret = PTR_ERR(rtc->rtcdev); - goto fail; - } + if (IS_ERR(rtc->rtcdev)) + return PTR_ERR(rtc->rtcdev); /* register irq handler after we know what name we'll use */ ret = devm_request_irq(&pdev->dev, rtc->irq, at91_rtc_interrupt, IRQF_SHARED, dev_name(&rtc->rtcdev->dev), rtc); if (ret) { dev_dbg(&pdev->dev, "can't share IRQ %d?\n", rtc->irq); - goto fail; + return ret; } /* NOTE: sam9260 rev A silicon has a ROM bug which resets the @@ -374,10 +370,6 @@ static int at91_rtc_probe(struct platform_device *pdev) dev_name(&rtc->rtcdev->dev)); return 0; - -fail: - platform_set_drvdata(pdev, NULL); - return ret; } /* @@ -391,7 +383,6 @@ static int at91_rtc_remove(struct platform_device *pdev) /* disable all interrupts */ rtt_writel(rtc, MR, mr & ~(AT91_RTT_ALMIEN | AT91_RTT_RTTINCIEN)); - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3 From be546754626eb0561b7598835421ea0467a2e15d Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:42 +1000 Subject: rtc: rtc-au1xxx: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-au1xxx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-au1xxx.c b/drivers/rtc/rtc-au1xxx.c index 7995abc391fc..7c6e7bb4cf00 100644 --- a/drivers/rtc/rtc-au1xxx.c +++ b/drivers/rtc/rtc-au1xxx.c @@ -118,8 +118,6 @@ out_err: static int au1xtoy_rtc_remove(struct platform_device *pdev) { - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 3192f71fee5825e9b4a3bab629693a089c9b0f0c Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:42 +1000 Subject: rtc: rtc-bfin: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Acked-by: Mike Frysinger Signed-off-by: Andrew Morton --- drivers/rtc/rtc-bfin.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c index ad44ec5dc29a..0c53f452849d 100644 --- a/drivers/rtc/rtc-bfin.c +++ b/drivers/rtc/rtc-bfin.c @@ -391,7 +391,6 @@ static int bfin_rtc_remove(struct platform_device *pdev) struct device *dev = &pdev->dev; bfin_rtc_reset(dev, 0); - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3 From e2dc5c5cb955de43f402e6a436e97c2c01773041 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:43 +1000 Subject: rtc: rtc-bq4802: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-bq4802.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-bq4802.c b/drivers/rtc/rtc-bq4802.c index af2886784a7b..b919168465f2 100644 --- a/drivers/rtc/rtc-bq4802.c +++ b/drivers/rtc/rtc-bq4802.c @@ -188,8 +188,6 @@ out: static int bq4802_remove(struct platform_device *pdev) { - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 6838e3bb29c53fbd3c481eb3f428a5df73fe6039 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:43 +1000 Subject: rtc: rtc-coh901331: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Acked-by: Linus Walleij Signed-off-by: Andrew Morton --- drivers/rtc/rtc-coh901331.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-coh901331.c b/drivers/rtc/rtc-coh901331.c index 93c06588ddca..9ad58914601d 100644 --- a/drivers/rtc/rtc-coh901331.c +++ b/drivers/rtc/rtc-coh901331.c @@ -154,10 +154,8 @@ static int __exit coh901331_remove(struct platform_device *pdev) { struct coh901331_port *rtap = dev_get_drvdata(&pdev->dev); - if (rtap) { + if (rtap) clk_unprepare(rtap->clk); - platform_set_drvdata(pdev, NULL); - } return 0; } @@ -220,7 +218,6 @@ static int __init coh901331_probe(struct platform_device *pdev) return 0; out_no_rtc: - platform_set_drvdata(pdev, NULL); clk_unprepare(rtap->clk); return ret; } -- cgit v1.2.3 From 5df9615eef85cf62e83dcd22a646be78a07366cb Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:43 +1000 Subject: rtc: rtc-da9052: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-da9052.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c index 7286b279cf2d..a61ab7fa8a66 100644 --- a/drivers/rtc/rtc-da9052.c +++ b/drivers/rtc/rtc-da9052.c @@ -257,8 +257,6 @@ static int da9052_rtc_probe(struct platform_device *pdev) static int da9052_rtc_remove(struct platform_device *pdev) { - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 28ffd24dc6c6e34dbcf546bebe5e6b5b2b265547 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:44 +1000 Subject: rtc: rtc-da9055: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-da9055.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-da9055.c b/drivers/rtc/rtc-da9055.c index 73858ca9709a..df4cf16a967d 100644 --- a/drivers/rtc/rtc-da9055.c +++ b/drivers/rtc/rtc-da9055.c @@ -317,8 +317,6 @@ err_rtc: static int da9055_rtc_remove(struct platform_device *pdev) { - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 04a0e5a2cc656ac2e1d58bc9173cbb9a20f34eee Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:44 +1000 Subject: rtc: rtc-davinci: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-davinci.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index be5fc32d3cd7..24677ef8c39a 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -526,10 +526,9 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) davinci_rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &davinci_rtc_ops, THIS_MODULE); if (IS_ERR(davinci_rtc->rtc)) { - ret = PTR_ERR(davinci_rtc->rtc); dev_err(dev, "unable to register RTC device, err %d\n", ret); - goto fail1; + return PTR_ERR(davinci_rtc->rtc); } rtcif_write(davinci_rtc, PRTCIF_INTFLG_RTCSS, PRTCIF_INTFLG); @@ -543,7 +542,7 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) 0, "davinci_rtc", davinci_rtc); if (ret < 0) { dev_err(dev, "unable to register davinci RTC interrupt\n"); - goto fail1; + return ret; } /* Enable interrupts */ @@ -556,10 +555,6 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 0); return 0; - -fail1: - platform_set_drvdata(pdev, NULL); - return ret; } static int __exit davinci_rtc_remove(struct platform_device *pdev) @@ -570,8 +565,6 @@ static int __exit davinci_rtc_remove(struct platform_device *pdev) rtcif_write(davinci_rtc, 0, PRTCIF_INTEN); - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 8822ec9b5694e91283ed034efae3319e5fb9ce70 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:44 +1000 Subject: rtc: rtc-dm355evm: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-dm355evm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-dm355evm.c b/drivers/rtc/rtc-dm355evm.c index 1e1ca63d58a9..1855581014f7 100644 --- a/drivers/rtc/rtc-dm355evm.c +++ b/drivers/rtc/rtc-dm355evm.c @@ -141,7 +141,6 @@ static int dm355evm_rtc_probe(struct platform_device *pdev) static int dm355evm_rtc_remove(struct platform_device *pdev) { - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3 From 98ab3e920e8a173085abab63871bff0adfd4854c Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:45 +1000 Subject: rtc: rtc-ds1302: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ds1302.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-ds1302.c b/drivers/rtc/rtc-ds1302.c index d13954346286..1f8ee639757b 100644 --- a/drivers/rtc/rtc-ds1302.c +++ b/drivers/rtc/rtc-ds1302.c @@ -236,8 +236,6 @@ static int __init ds1302_rtc_probe(struct platform_device *pdev) static int __exit ds1302_rtc_remove(struct platform_device *pdev) { - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From f821d755b641eada5b5eed8406f63d1ee0f7e0ef Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:45 +1000 Subject: rtc: rtc-ep93xx: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ep93xx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c index 5807b77c444a..549b3c3792d2 100644 --- a/drivers/rtc/rtc-ep93xx.c +++ b/drivers/rtc/rtc-ep93xx.c @@ -167,7 +167,6 @@ static int ep93xx_rtc_probe(struct platform_device *pdev) return 0; exit: - platform_set_drvdata(pdev, NULL); pdev->dev.platform_data = NULL; return err; } @@ -175,7 +174,6 @@ exit: static int ep93xx_rtc_remove(struct platform_device *pdev) { sysfs_remove_group(&pdev->dev.kobj, &ep93xx_rtc_sysfs_files); - platform_set_drvdata(pdev, NULL); pdev->dev.platform_data = NULL; return 0; -- cgit v1.2.3 From 978eae7540ef0e7db90e254e6f41d805315c5d02 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:45 +1000 Subject: rtc: rtc-jz4740: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-jz4740.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index 1e48686ca6d2..c6573585d028 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -287,7 +287,6 @@ err_free_irq: err_unregister_rtc: rtc_device_unregister(rtc->rtc); err_iounmap: - platform_set_drvdata(pdev, NULL); iounmap(rtc->base); err_release_mem_region: release_mem_region(rtc->mem->start, resource_size(rtc->mem)); @@ -310,8 +309,6 @@ static int jz4740_rtc_remove(struct platform_device *pdev) kfree(rtc); - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From e4b2107d4ef16ff25bf282cb507a28f328f88896 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:46 +1000 Subject: rtc: rtc-lp8788: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-lp8788.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-lp8788.c b/drivers/rtc/rtc-lp8788.c index 9853ac15b296..76a82dc9e502 100644 --- a/drivers/rtc/rtc-lp8788.c +++ b/drivers/rtc/rtc-lp8788.c @@ -314,8 +314,6 @@ static int lp8788_rtc_probe(struct platform_device *pdev) static int lp8788_rtc_remove(struct platform_device *pdev) { - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 177a83c0b76d1247cae0d7d1f40136669eb30d61 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:46 +1000 Subject: rtc: rtc-lpc32xx: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-lpc32xx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index 787550d756e9..8276ae94a2a9 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -277,7 +277,6 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) &lpc32xx_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rtc)) { dev_err(&pdev->dev, "Can't get RTC\n"); - platform_set_drvdata(pdev, NULL); return PTR_ERR(rtc->rtc); } @@ -306,8 +305,6 @@ static int lpc32xx_rtc_remove(struct platform_device *pdev) if (rtc->irq >= 0) device_init_wakeup(&pdev->dev, 0); - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 80432a50a17945f880bbca544e0d1cceabf54266 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:46 +1000 Subject: rtc: rtc-ls1x: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ls1x.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-ls1x.c b/drivers/rtc/rtc-ls1x.c index db82f91f4562..e913f38bd33d 100644 --- a/drivers/rtc/rtc-ls1x.c +++ b/drivers/rtc/rtc-ls1x.c @@ -187,8 +187,6 @@ err: static int ls1x_rtc_remove(struct platform_device *pdev) { - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 75cffc52fef723963be9584e962fee52b17d3531 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:46 +1000 Subject: rtc: rtc-m48t59: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-m48t59.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index 130f29af3869..d4d31fa19244 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -513,7 +513,6 @@ static int m48t59_rtc_remove(struct platform_device *pdev) iounmap(m48t59->ioaddr); if (m48t59->irq != NO_IRQ) free_irq(m48t59->irq, &pdev->dev); - platform_set_drvdata(pdev, NULL); kfree(m48t59); return 0; } -- cgit v1.2.3 From 73b324220a1ce74e59db33146ab74a3a2288042a Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:47 +1000 Subject: rtc: rtc-max8925: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-max8925.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-max8925.c b/drivers/rtc/rtc-max8925.c index 7c90f4e45e27..981b6544cf7c 100644 --- a/drivers/rtc/rtc-max8925.c +++ b/drivers/rtc/rtc-max8925.c @@ -268,7 +268,7 @@ static int max8925_rtc_probe(struct platform_device *pdev) if (ret < 0) { dev_err(chip->dev, "Failed to request IRQ: #%d: %d\n", info->irq, ret); - goto err; + return ret; } dev_set_drvdata(&pdev->dev, info); @@ -282,13 +282,10 @@ static int max8925_rtc_probe(struct platform_device *pdev) ret = PTR_ERR(info->rtc_dev); if (IS_ERR(info->rtc_dev)) { dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret); - goto err; + return ret; } return 0; -err: - platform_set_drvdata(pdev, NULL); - return ret; } static int max8925_rtc_remove(struct platform_device *pdev) -- cgit v1.2.3 From a90a774ce19988c5b3639ad425d7666e83558551 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:47 +1000 Subject: rtc: rtc-max8998: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-max8998.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-max8998.c b/drivers/rtc/rtc-max8998.c index d5af7baa48b5..738eecef4d90 100644 --- a/drivers/rtc/rtc-max8998.c +++ b/drivers/rtc/rtc-max8998.c @@ -274,7 +274,7 @@ static int max8998_rtc_probe(struct platform_device *pdev) if (IS_ERR(info->rtc_dev)) { ret = PTR_ERR(info->rtc_dev); dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret); - goto out_rtc; + return ret; } ret = devm_request_threaded_irq(&pdev->dev, info->irq, NULL, @@ -292,10 +292,6 @@ static int max8998_rtc_probe(struct platform_device *pdev) } return 0; - -out_rtc: - platform_set_drvdata(pdev, NULL); - return ret; } static int max8998_rtc_remove(struct platform_device *pdev) -- cgit v1.2.3 From cca249254884c46d076cb006bf5655d5a1d45fb1 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:47 +1000 Subject: rtc: rtc-mc13xxx: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-mc13xxx.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 7a8ed27a5f2e..77ea9896b5ba 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -370,8 +370,6 @@ err_reset_irq_status: err_reset_irq_request: mc13xxx_unlock(mc13xxx); - - platform_set_drvdata(pdev, NULL); } return ret; @@ -389,8 +387,6 @@ static int __exit mc13xxx_rtc_remove(struct platform_device *pdev) mc13xxx_unlock(priv->mc13xxx); - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From fd46f94a137df1305eee25a9418ac2145478c782 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:48 +1000 Subject: rtc: rtc-msm6242: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-msm6242.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/rtc/rtc-msm6242.c b/drivers/rtc/rtc-msm6242.c index f0e9893ae7be..c014fa2ee5e9 100644 --- a/drivers/rtc/rtc-msm6242.c +++ b/drivers/rtc/rtc-msm6242.c @@ -199,7 +199,6 @@ static int __init msm6242_rtc_probe(struct platform_device *pdev) struct resource *res; struct msm6242_priv *priv; struct rtc_device *rtc; - int error; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) @@ -216,17 +215,11 @@ static int __init msm6242_rtc_probe(struct platform_device *pdev) rtc = devm_rtc_device_register(&pdev->dev, "rtc-msm6242", &msm6242_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc)) { - error = PTR_ERR(rtc); - goto out_unmap; - } + if (IS_ERR(rtc)) + return PTR_ERR(rtc); priv->rtc = rtc; return 0; - -out_unmap: - platform_set_drvdata(pdev, NULL); - return error; } static int __exit msm6242_rtc_remove(struct platform_device *pdev) -- cgit v1.2.3 From 235ef105491cb85ed6bed2c3247e6aff480edae5 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:48 +1000 Subject: rtc: rtc-mxc: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-mxc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index bf00d6dbeadd..ab87bacb8f88 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -443,15 +443,13 @@ static int mxc_rtc_probe(struct platform_device *pdev) THIS_MODULE); if (IS_ERR(rtc)) { ret = PTR_ERR(rtc); - goto exit_clr_drvdata; + goto exit_put_clk; } pdata->rtc = rtc; return 0; -exit_clr_drvdata: - platform_set_drvdata(pdev, NULL); exit_put_clk: clk_disable_unprepare(pdata->clk); @@ -465,7 +463,6 @@ static int mxc_rtc_remove(struct platform_device *pdev) struct rtc_plat_data *pdata = platform_get_drvdata(pdev); clk_disable_unprepare(pdata->clk); - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3 From 5716fca9a49bb370ee4bb7867e5371b14af9574f Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:48 +1000 Subject: rtc: rtc-nuc900: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Acked-by: Wan Zongshun Signed-off-by: Andrew Morton --- drivers/rtc/rtc-nuc900.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c index d592e2fe43f7..994659b0f879 100644 --- a/drivers/rtc/rtc-nuc900.c +++ b/drivers/rtc/rtc-nuc900.c @@ -262,8 +262,6 @@ static int __init nuc900_rtc_probe(struct platform_device *pdev) static int __exit nuc900_rtc_remove(struct platform_device *pdev) { - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 898e9ff094344ef2a54461c859bf4d3c84f19c88 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:49 +1000 Subject: rtc: rtc-pcap: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-pcap.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/rtc/rtc-pcap.c b/drivers/rtc/rtc-pcap.c index 539a90b98bc5..40b5c630bc7d 100644 --- a/drivers/rtc/rtc-pcap.c +++ b/drivers/rtc/rtc-pcap.c @@ -156,10 +156,8 @@ static int __init pcap_rtc_probe(struct platform_device *pdev) pcap_rtc->rtc = devm_rtc_device_register(&pdev->dev, "pcap", &pcap_rtc_ops, THIS_MODULE); - if (IS_ERR(pcap_rtc->rtc)) { - err = PTR_ERR(pcap_rtc->rtc); - goto fail; - } + if (IS_ERR(pcap_rtc->rtc)) + return PTR_ERR(pcap_rtc->rtc); timer_irq = pcap_to_irq(pcap_rtc->pcap, PCAP_IRQ_1HZ); alarm_irq = pcap_to_irq(pcap_rtc->pcap, PCAP_IRQ_TODA); @@ -167,17 +165,14 @@ static int __init pcap_rtc_probe(struct platform_device *pdev) err = devm_request_irq(&pdev->dev, timer_irq, pcap_rtc_irq, 0, "RTC Timer", pcap_rtc); if (err) - goto fail; + return err; err = devm_request_irq(&pdev->dev, alarm_irq, pcap_rtc_irq, 0, "RTC Alarm", pcap_rtc); if (err) - goto fail; + return err; return 0; -fail: - platform_set_drvdata(pdev, NULL); - return err; } static int __exit pcap_rtc_remove(struct platform_device *pdev) -- cgit v1.2.3 From 6ac1a82d1c15f7a7db903bb7b3c416bc68ca4c78 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:49 +1000 Subject: rtc: rtc-pm8xxx: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-pm8xxx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index f1a6557261f3..14ee860d5a3f 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -480,7 +480,6 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) fail_req_irq: rtc_device_unregister(rtc_dd->rtc); fail_rtc_enable: - platform_set_drvdata(pdev, NULL); kfree(rtc_dd); return rc; } @@ -492,7 +491,6 @@ static int pm8xxx_rtc_remove(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 0); free_irq(rtc_dd->rtc_alarm_irq, rtc_dd); rtc_device_unregister(rtc_dd->rtc); - platform_set_drvdata(pdev, NULL); kfree(rtc_dd); return 0; -- cgit v1.2.3 From 8dc2cf31cf5f85010901dafabb733979ede8183b Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:49 +1000 Subject: rtc: rtc-s3c: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-s3c.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index 0b495e8b8e66..7afd373b9595 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -421,8 +421,6 @@ static void s3c_rtc_enable(struct platform_device *pdev, int en) static int s3c_rtc_remove(struct platform_device *dev) { - platform_set_drvdata(dev, NULL); - s3c_rtc_setaie(&dev->dev, 0); clk_unprepare(rtc_clk); @@ -549,23 +547,20 @@ static int s3c_rtc_probe(struct platform_device *pdev) 0, "s3c2410-rtc alarm", rtc); if (ret) { dev_err(&pdev->dev, "IRQ%d error %d\n", s3c_rtc_alarmno, ret); - goto err_alarm_irq; + goto err_nortc; } ret = devm_request_irq(&pdev->dev, s3c_rtc_tickno, s3c_rtc_tickirq, 0, "s3c2410-rtc tick", rtc); if (ret) { dev_err(&pdev->dev, "IRQ%d error %d\n", s3c_rtc_tickno, ret); - goto err_alarm_irq; + goto err_nortc; } clk_disable(rtc_clk); return 0; - err_alarm_irq: - platform_set_drvdata(pdev, NULL); - err_nortc: s3c_rtc_enable(pdev, 0); clk_disable_unprepare(rtc_clk); -- cgit v1.2.3 From 93ac1b105e7a78e29e86adb608703371710b681f Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:50 +1000 Subject: rtc: rtc-sa1100: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-sa1100.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index 00605601dbf7..0f7adeb1944a 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -249,7 +249,7 @@ static int sa1100_rtc_probe(struct platform_device *pdev) ret = clk_prepare_enable(info->clk); if (ret) - goto err_enable_clk; + return ret; /* * According to the manual we should be able to let RTTR be zero * and then a default diviser for a 32.768KHz clock is used. @@ -303,8 +303,6 @@ static int sa1100_rtc_probe(struct platform_device *pdev) return 0; err_dev: clk_disable_unprepare(info->clk); -err_enable_clk: - platform_set_drvdata(pdev, NULL); return ret; } @@ -312,10 +310,8 @@ static int sa1100_rtc_remove(struct platform_device *pdev) { struct sa1100_rtc *info = platform_get_drvdata(pdev); - if (info) { + if (info) clk_disable_unprepare(info->clk); - platform_set_drvdata(pdev, NULL); - } return 0; } -- cgit v1.2.3 From ff6ea43d16ac696f3f592849c4a62a8b0c69a247 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:50 +1000 Subject: rtc: rtc-sh: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-sh.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index 8d5bd2e36776..cb2f8399d692 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -770,8 +770,6 @@ static int __exit sh_rtc_remove(struct platform_device *pdev) clk_disable(rtc->clk); clk_put(rtc->clk); - platform_set_drvdata(pdev, NULL); - kfree(rtc); return 0; -- cgit v1.2.3 From f11887cae502b0700e0bf9e4296e75c48c8795ed Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:50 +1000 Subject: rtc: rtc-spear: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Acked-by: Viresh Kumar Signed-off-by: Andrew Morton --- drivers/rtc/rtc-spear.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-spear.c b/drivers/rtc/rtc-spear.c index 574359c48f65..c492cf0ab8cd 100644 --- a/drivers/rtc/rtc-spear.c +++ b/drivers/rtc/rtc-spear.c @@ -417,7 +417,6 @@ static int spear_rtc_probe(struct platform_device *pdev) return 0; err_disable_clock: - platform_set_drvdata(pdev, NULL); clk_disable_unprepare(config->clk); return status; -- cgit v1.2.3 From a102b282decf216024b110fdf416a38e0bdd5e79 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:51 +1000 Subject: rtc: rtc-stmp3xxx: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-stmp3xxx.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 483ce086990b..90a3e864b8fe 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -225,7 +225,6 @@ static int stmp3xxx_rtc_remove(struct platform_device *pdev) writel(STMP3XXX_RTC_CTRL_ALARM_IRQ_EN, rtc_data->io + STMP3XXX_RTC_CTRL_CLR); - platform_set_drvdata(pdev, NULL); return 0; } @@ -274,25 +273,19 @@ static int stmp3xxx_rtc_probe(struct platform_device *pdev) rtc_data->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &stmp3xxx_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc_data->rtc)) { - err = PTR_ERR(rtc_data->rtc); - goto out; - } + if (IS_ERR(rtc_data->rtc)) + return PTR_ERR(rtc_data->rtc); err = devm_request_irq(&pdev->dev, rtc_data->irq_alarm, stmp3xxx_rtc_interrupt, 0, "RTC alarm", &pdev->dev); if (err) { dev_err(&pdev->dev, "Cannot claim IRQ%d\n", rtc_data->irq_alarm); - goto out; + return err; } stmp3xxx_wdt_register(pdev); return 0; - -out: - platform_set_drvdata(pdev, NULL); - return err; } #ifdef CONFIG_PM_SLEEP -- cgit v1.2.3 From 8b606bfc2c02ad61624ae2e8b3b2c33f5a60db00 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:51 +1000 Subject: rtc: rtc-twl: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-twl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c index 8751a5240c99..103fea21e73c 100644 --- a/drivers/rtc/rtc-twl.c +++ b/drivers/rtc/rtc-twl.c @@ -555,7 +555,6 @@ static int twl_rtc_remove(struct platform_device *pdev) free_irq(irq, rtc); rtc_device_unregister(rtc); - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3 From f153029f0899e7dd6260e9b241b27da7386a66f2 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:51 +1000 Subject: rtc: rtc-vr41xx: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-vr41xx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index b940c2374904..3b5b4fa9a6e9 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -381,8 +381,6 @@ static int rtc_remove(struct platform_device *pdev) if (rtc) rtc_device_unregister(rtc); - platform_set_drvdata(pdev, NULL); - free_irq(aie_irq, pdev); free_irq(pie_irq, pdev); if (rtc1_base) -- cgit v1.2.3 From dc969b2a0f110283b204a8a79e87fa4b3a198f7f Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:52 +1000 Subject: rtc: rtc-vt8500: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Acked-by: Tony Prisk Signed-off-by: Andrew Morton --- drivers/rtc/rtc-vt8500.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-vt8500.c b/drivers/rtc/rtc-vt8500.c index d89efee6d29e..c2d6331fc712 100644 --- a/drivers/rtc/rtc-vt8500.c +++ b/drivers/rtc/rtc-vt8500.c @@ -282,8 +282,6 @@ static int vt8500_rtc_remove(struct platform_device *pdev) /* Disable alarm matching */ writel(0, vt8500_rtc->regbase + VT8500_RTC_IS); - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3 From 8d78e88ec0ac573bcf709b8cf59f2fd82e40034d Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:52 +1000 Subject: rtc: rtc-m48t86: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-m48t86.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index 33a91c484533..d1fe7f5cd20a 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -168,8 +168,6 @@ static int m48t86_rtc_probe(struct platform_device *dev) static int m48t86_rtc_remove(struct platform_device *dev) { - platform_set_drvdata(dev, NULL); - return 0; } -- cgit v1.2.3 From 487a939623b265252bb894c6b0b1deb77f713b02 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:52 +1000 Subject: rtc: rtc-puv3: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Acked-by: Guan Xuetao Signed-off-by: Andrew Morton --- drivers/rtc/rtc-puv3.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-puv3.c b/drivers/rtc/rtc-puv3.c index 72f437170d2e..402732cfb32a 100644 --- a/drivers/rtc/rtc-puv3.c +++ b/drivers/rtc/rtc-puv3.c @@ -224,7 +224,6 @@ static int puv3_rtc_remove(struct platform_device *dev) { struct rtc_device *rtc = platform_get_drvdata(dev); - platform_set_drvdata(dev, NULL); rtc_device_unregister(rtc); puv3_rtc_setpie(&dev->dev, 0); -- cgit v1.2.3 From ef6620b0addf55dc120b24510e3c8f1b917190dc Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:53 +1000 Subject: rtc: rtc-rp5c01: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-rp5c01.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c index 873c689f01c3..89d073679267 100644 --- a/drivers/rtc/rtc-rp5c01.c +++ b/drivers/rtc/rtc-rp5c01.c @@ -251,21 +251,15 @@ static int __init rp5c01_rtc_probe(struct platform_device *dev) rtc = devm_rtc_device_register(&dev->dev, "rtc-rp5c01", &rp5c01_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc)) { - error = PTR_ERR(rtc); - goto out; - } + if (IS_ERR(rtc)) + return PTR_ERR(rtc); priv->rtc = rtc; error = sysfs_create_bin_file(&dev->dev.kobj, &priv->nvram_attr); if (error) - goto out; + return error; return 0; - -out: - platform_set_drvdata(dev, NULL); - return error; } static int __exit rp5c01_rtc_remove(struct platform_device *dev) -- cgit v1.2.3 From 771b0c99630c932a8acbcb76f1b8aa324f6bbf56 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 23 May 2013 10:37:53 +1000 Subject: rtc: rtc-tile: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d063100 ("device-core: Ensure drvdata = NULL when no driver is bound"). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-tile.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-tile.c b/drivers/rtc/rtc-tile.c index fc3dee95f166..b3bb32c43254 100644 --- a/drivers/rtc/rtc-tile.c +++ b/drivers/rtc/rtc-tile.c @@ -96,8 +96,6 @@ static int tile_rtc_probe(struct platform_device *dev) */ static int tile_rtc_remove(struct platform_device *dev) { - platform_set_drvdata(dev, NULL); - return 0; } -- cgit v1.2.3 From 17a95097deed52545d7491d70dd2797222855174 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Thu, 23 May 2013 10:37:53 +1000 Subject: drivers/rtc/rtc-rv3029c2.c: fix disabling AIE irq In the disable AIE irq code path, current code passes "1" to enable parameter of rv3029c2_rtc_i2c_alarm_set_irq(). Thus it does not disable AIE irq. Signed-off-by: Axel Lin Acked-by: Heiko Schocher Cc: Signed-off-by: Andrew Morton --- drivers/rtc/rtc-rv3029c2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-rv3029c2.c b/drivers/rtc/rtc-rv3029c2.c index 5032c24ec159..9100a3401de1 100644 --- a/drivers/rtc/rtc-rv3029c2.c +++ b/drivers/rtc/rtc-rv3029c2.c @@ -310,7 +310,7 @@ static int rv3029c2_rtc_i2c_set_alarm(struct i2c_client *client, dev_dbg(&client->dev, "alarm IRQ armed\n"); } else { /* disable AIE irq */ - ret = rv3029c2_rtc_i2c_alarm_set_irq(client, 1); + ret = rv3029c2_rtc_i2c_alarm_set_irq(client, 0); if (ret) return ret; -- cgit v1.2.3 From bd4829c79dea3dbf4c5d9594675d8ecf89d71bc0 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:54 +1000 Subject: drivers/rtc/rtc-m48t86.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton --- drivers/rtc/rtc-m48t86.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index d1fe7f5cd20a..2d30314fa07f 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -166,18 +166,12 @@ static int m48t86_rtc_probe(struct platform_device *dev) return 0; } -static int m48t86_rtc_remove(struct platform_device *dev) -{ - return 0; -} - static struct platform_driver m48t86_rtc_platform_driver = { .driver = { .name = "rtc-m48t86", .owner = THIS_MODULE, }, .probe = m48t86_rtc_probe, - .remove = m48t86_rtc_remove, }; module_platform_driver(m48t86_rtc_platform_driver); -- cgit v1.2.3 From 2328a67b578e6239114fa9dd9754e6cb9a651c90 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:54 +1000 Subject: drivers/rtc/rtc-tile.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton --- drivers/rtc/rtc-tile.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/rtc/rtc-tile.c b/drivers/rtc/rtc-tile.c index b3bb32c43254..ff9632eb79f2 100644 --- a/drivers/rtc/rtc-tile.c +++ b/drivers/rtc/rtc-tile.c @@ -91,21 +91,12 @@ static int tile_rtc_probe(struct platform_device *dev) return 0; } -/* - * Device cleanup routine. - */ -static int tile_rtc_remove(struct platform_device *dev) -{ - return 0; -} - static struct platform_driver tile_rtc_platform_driver = { .driver = { .name = "rtc-tile", .owner = THIS_MODULE, }, .probe = tile_rtc_probe, - .remove = tile_rtc_remove, }; /* -- cgit v1.2.3 From fe355cb8e5c67f3673e1184994b855b0eab19282 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:54 +1000 Subject: drivers/rtc/rtc-nuc900.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Cc: Wan ZongShun Signed-off-by: Andrew Morton --- drivers/rtc/rtc-nuc900.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c index 994659b0f879..22861c5e0c59 100644 --- a/drivers/rtc/rtc-nuc900.c +++ b/drivers/rtc/rtc-nuc900.c @@ -260,13 +260,7 @@ static int __init nuc900_rtc_probe(struct platform_device *pdev) return 0; } -static int __exit nuc900_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - static struct platform_driver nuc900_rtc_driver = { - .remove = __exit_p(nuc900_rtc_remove), .driver = { .name = "nuc900-rtc", .owner = THIS_MODULE, -- cgit v1.2.3 From f7b8c4b20a9e5f8661bbaec33b62623fb643e190 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:55 +1000 Subject: drivers/rtc/rtc-msm6242.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton --- drivers/rtc/rtc-msm6242.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-msm6242.c b/drivers/rtc/rtc-msm6242.c index c014fa2ee5e9..426cb5189daa 100644 --- a/drivers/rtc/rtc-msm6242.c +++ b/drivers/rtc/rtc-msm6242.c @@ -222,17 +222,11 @@ static int __init msm6242_rtc_probe(struct platform_device *pdev) return 0; } -static int __exit msm6242_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - static struct platform_driver msm6242_rtc_driver = { .driver = { .name = "rtc-msm6242", .owner = THIS_MODULE, }, - .remove = __exit_p(msm6242_rtc_remove), }; module_platform_driver_probe(msm6242_rtc_driver, msm6242_rtc_probe); -- cgit v1.2.3 From 4d3aabe0daf311a1712d9a1d8c3d0a1d2efdbe50 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:55 +1000 Subject: drivers/rtc/rtc-max8998.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Cc: Minkyu Kang Signed-off-by: Andrew Morton --- drivers/rtc/rtc-max8998.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-max8998.c b/drivers/rtc/rtc-max8998.c index 738eecef4d90..5388336a2c4c 100644 --- a/drivers/rtc/rtc-max8998.c +++ b/drivers/rtc/rtc-max8998.c @@ -294,11 +294,6 @@ static int max8998_rtc_probe(struct platform_device *pdev) return 0; } -static int max8998_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - static const struct platform_device_id max8998_rtc_id[] = { { "max8998-rtc", TYPE_MAX8998 }, { "lp3974-rtc", TYPE_LP3974 }, @@ -311,7 +306,6 @@ static struct platform_driver max8998_rtc_driver = { .owner = THIS_MODULE, }, .probe = max8998_rtc_probe, - .remove = max8998_rtc_remove, .id_table = max8998_rtc_id, }; -- cgit v1.2.3 From 21a9a60cd459bac7899fd643065f9fe4df725291 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:55 +1000 Subject: drivers/rtc/rtc-max8925.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Cc: Haojian Zhuang Signed-off-by: Andrew Morton --- drivers/rtc/rtc-max8925.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-max8925.c b/drivers/rtc/rtc-max8925.c index 981b6544cf7c..951d1a78e190 100644 --- a/drivers/rtc/rtc-max8925.c +++ b/drivers/rtc/rtc-max8925.c @@ -288,11 +288,6 @@ static int max8925_rtc_probe(struct platform_device *pdev) return 0; } -static int max8925_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - #ifdef CONFIG_PM_SLEEP static int max8925_rtc_suspend(struct device *dev) { @@ -323,7 +318,6 @@ static struct platform_driver max8925_rtc_driver = { .pm = &max8925_rtc_pm_ops, }, .probe = max8925_rtc_probe, - .remove = max8925_rtc_remove, }; module_platform_driver(max8925_rtc_driver); -- cgit v1.2.3 From 6bda4e4c726a89abdb6916a992b9af23949bc291 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:56 +1000 Subject: drivers/rtc/rtc-ls1x.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Cc: zhao zhang Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ls1x.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-ls1x.c b/drivers/rtc/rtc-ls1x.c index e913f38bd33d..682ecb094839 100644 --- a/drivers/rtc/rtc-ls1x.c +++ b/drivers/rtc/rtc-ls1x.c @@ -185,17 +185,11 @@ err: return ret; } -static int ls1x_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - static struct platform_driver ls1x_rtc_driver = { .driver = { .name = "ls1x-rtc", .owner = THIS_MODULE, }, - .remove = ls1x_rtc_remove, .probe = ls1x_rtc_probe, }; -- cgit v1.2.3 From 3c50b393f9075059c5cf1f06d2119179e750f0dd Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:56 +1000 Subject: drivers/rtc/rtc-lp8788.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Acked-by: Milo Kim Signed-off-by: Andrew Morton --- drivers/rtc/rtc-lp8788.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-lp8788.c b/drivers/rtc/rtc-lp8788.c index 76a82dc9e502..4ff6c73253b3 100644 --- a/drivers/rtc/rtc-lp8788.c +++ b/drivers/rtc/rtc-lp8788.c @@ -312,14 +312,8 @@ static int lp8788_rtc_probe(struct platform_device *pdev) return 0; } -static int lp8788_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - static struct platform_driver lp8788_rtc_driver = { .probe = lp8788_rtc_probe, - .remove = lp8788_rtc_remove, .driver = { .name = LP8788_DEV_RTC, .owner = THIS_MODULE, -- cgit v1.2.3 From 6fb31adcf80a8dfc632da227fae7888ff11a89b6 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:56 +1000 Subject: drivers/rtc/rtc-ds1302.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ds1302.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-ds1302.c b/drivers/rtc/rtc-ds1302.c index 1f8ee639757b..7533b723082d 100644 --- a/drivers/rtc/rtc-ds1302.c +++ b/drivers/rtc/rtc-ds1302.c @@ -234,17 +234,11 @@ static int __init ds1302_rtc_probe(struct platform_device *pdev) return 0; } -static int __exit ds1302_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - static struct platform_driver ds1302_platform_driver = { .driver = { .name = DRV_NAME, .owner = THIS_MODULE, }, - .remove = __exit_p(ds1302_rtc_remove), }; module_platform_driver_probe(ds1302_platform_driver, ds1302_rtc_probe); -- cgit v1.2.3 From 6e92360e0c89991b59a169b275fda0ba63100dab Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:56 +1000 Subject: drivers/rtc/rtc-dm355evm.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton --- drivers/rtc/rtc-dm355evm.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-dm355evm.c b/drivers/rtc/rtc-dm355evm.c index 1855581014f7..1aca08394c47 100644 --- a/drivers/rtc/rtc-dm355evm.c +++ b/drivers/rtc/rtc-dm355evm.c @@ -139,18 +139,12 @@ static int dm355evm_rtc_probe(struct platform_device *pdev) return 0; } -static int dm355evm_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - /* * I2C is used to talk to the MSP430, but this platform device is * exposed by an MFD driver that manages I2C communications. */ static struct platform_driver rtc_dm355evm_driver = { .probe = dm355evm_rtc_probe, - .remove = dm355evm_rtc_remove, .driver = { .owner = THIS_MODULE, .name = "rtc-dm355evm", -- cgit v1.2.3 From 9a06c070ea3f3b9211970f142de64cf9b12b6aef Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:57 +1000 Subject: drivers/rtc/rtc-da9055.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Cc: David Dajun Chen Signed-off-by: Andrew Morton --- drivers/rtc/rtc-da9055.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-da9055.c b/drivers/rtc/rtc-da9055.c index df4cf16a967d..e00642b61076 100644 --- a/drivers/rtc/rtc-da9055.c +++ b/drivers/rtc/rtc-da9055.c @@ -315,11 +315,6 @@ err_rtc: } -static int da9055_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - #ifdef CONFIG_PM /* Turn off the alarm if it should not be a wake source. */ static int da9055_rtc_suspend(struct device *dev) @@ -392,7 +387,6 @@ static const struct dev_pm_ops da9055_rtc_pm_ops = { static struct platform_driver da9055_rtc_driver = { .probe = da9055_rtc_probe, - .remove = da9055_rtc_remove, .driver = { .name = "da9055-rtc", .owner = THIS_MODULE, -- cgit v1.2.3 From a48960a426990d591437fd1559e1e810bbf5c904 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:57 +1000 Subject: drivers/rtc/rtc-da9052.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Cc: David Dajun Chen Signed-off-by: Andrew Morton --- drivers/rtc/rtc-da9052.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c index a61ab7fa8a66..2f7fdd77a6f0 100644 --- a/drivers/rtc/rtc-da9052.c +++ b/drivers/rtc/rtc-da9052.c @@ -255,14 +255,8 @@ static int da9052_rtc_probe(struct platform_device *pdev) return 0; } -static int da9052_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - static struct platform_driver da9052_rtc_driver = { .probe = da9052_rtc_probe, - .remove = da9052_rtc_remove, .driver = { .name = "da9052-rtc", .owner = THIS_MODULE, -- cgit v1.2.3 From 8264dd260ef5e1aa41d64c865dab4355224f92b2 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:57 +1000 Subject: drivers/rtc/rtc-bq4802.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton --- drivers/rtc/rtc-bq4802.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-bq4802.c b/drivers/rtc/rtc-bq4802.c index b919168465f2..fc0ff87aa5df 100644 --- a/drivers/rtc/rtc-bq4802.c +++ b/drivers/rtc/rtc-bq4802.c @@ -186,11 +186,6 @@ out: } -static int bq4802_remove(struct platform_device *pdev) -{ - return 0; -} - /* work with hotplug and coldplug */ MODULE_ALIAS("platform:rtc-bq4802"); @@ -200,7 +195,6 @@ static struct platform_driver bq4802_driver = { .owner = THIS_MODULE, }, .probe = bq4802_probe, - .remove = bq4802_remove, }; module_platform_driver(bq4802_driver); -- cgit v1.2.3 From cd9e6244ac468ee74e10aa773be466e8aee88877 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:58 +1000 Subject: drivers/rtc/rtc-au1xxx.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Cc: Manuel Lauss Signed-off-by: Andrew Morton --- drivers/rtc/rtc-au1xxx.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-au1xxx.c b/drivers/rtc/rtc-au1xxx.c index 7c6e7bb4cf00..ed526a192ce0 100644 --- a/drivers/rtc/rtc-au1xxx.c +++ b/drivers/rtc/rtc-au1xxx.c @@ -116,17 +116,11 @@ out_err: return ret; } -static int au1xtoy_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - static struct platform_driver au1xrtc_driver = { .driver = { .name = "rtc-au1xxx", .owner = THIS_MODULE, }, - .remove = au1xtoy_rtc_remove, }; module_platform_driver_probe(au1xrtc_driver, au1xtoy_rtc_probe); -- cgit v1.2.3 From 9060b017037e2725ff8931e56687153366918954 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 23 May 2013 10:37:58 +1000 Subject: drivers/rtc/rtc-ab3100.c: remove empty function After the switch to devm_ functions and the removal of rtc_device_unregister(), the 'remove' function does not do anything. Delete it. Signed-off-by: Sachin Kamat Cc: Linus Walleij Signed-off-by: Andrew Morton --- drivers/rtc/rtc-ab3100.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-ab3100.c b/drivers/rtc/rtc-ab3100.c index 572208d67688..ff435343ba9f 100644 --- a/drivers/rtc/rtc-ab3100.c +++ b/drivers/rtc/rtc-ab3100.c @@ -240,17 +240,11 @@ static int __init ab3100_rtc_probe(struct platform_device *pdev) return 0; } -static int __exit ab3100_rtc_remove(struct platform_device *pdev) -{ - return 0; -} - static struct platform_driver ab3100_rtc_driver = { .driver = { .name = "ab3100-rtc", .owner = THIS_MODULE, }, - .remove = __exit_p(ab3100_rtc_remove), }; module_platform_driver_probe(ab3100_rtc_driver, ab3100_rtc_probe); -- cgit v1.2.3 From e93e12588c5a8d52352642fe43d8f82c0e8762a9 Mon Sep 17 00:00:00 2001 From: Alexander Holler Date: Thu, 23 May 2013 10:37:58 +1000 Subject: rtc: rtc-hid-sensor-time: allow full years (16bit) in HID reports The draft for HID-sensors (HUTRR39) currently doesn't define the range for the attribute year. Asking one of the authors revealed that full years (e.g. 2013 instead of just 13) were meant. So we now allow both, 8 bit and 16 bit values for the attribute year and assuming full years when the value is 16 bits wide. We will still support 8 bit values until the specification gets final (and maybe defines a way to set the time too). Signed-off-by: Alexander Holler Cc: Alessandro Zummo Cc: Lars-Peter Clausen Cc: Jonathan Cameron Cc: Jiri Kosina Cc: John Stultz Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-hid-sensor-time.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index 63024505dddc..b8c1b106a0b7 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -85,10 +85,13 @@ static int hid_time_capture_sample(struct hid_sensor_hub_device *hsdev, switch (usage_id) { case HID_USAGE_SENSOR_TIME_YEAR: - time_buf->tm_year = *(u8 *)raw_data; - if (time_buf->tm_year < 70) - /* assume we are in 1970...2069 */ - time_buf->tm_year += 100; + if (raw_len == 1) { + time_buf->tm_year = *(u8 *)raw_data; + if (time_buf->tm_year < 70) + /* assume we are in 1970...2069 */ + time_buf->tm_year += 100; + } else + time_buf->tm_year = *(u16 *)raw_data-1900; break; case HID_USAGE_SENSOR_TIME_MONTH: /* sensor sending the month as 1-12, we need 0-11 */ @@ -151,11 +154,27 @@ static int hid_time_parse_report(struct platform_device *pdev, return -EINVAL; } if (time_state->info[i].size != 1) { - dev_err(&pdev->dev, - "attribute '%s' not 8 bits wide!\n", + /* + * The draft for HID-sensors (HUTRR39) currently + * doesn't define the range for the year attribute. + * Therefor we support both 8 bit (0-99) and 16 bit + * (full) as size for the year. + */ + if (time_state->info[i].attrib_id != + HID_USAGE_SENSOR_TIME_YEAR) { + dev_err(&pdev->dev, + "attribute '%s' not 8 bits wide!\n", hid_time_attrib_name( time_state->info[i].attrib_id)); - return -EINVAL; + return -EINVAL; + } + if (time_state->info[i].size != 2) { + dev_err(&pdev->dev, + "attribute '%s' not 8 or 16 bits wide!\n", + hid_time_attrib_name( + time_state->info[i].attrib_id)); + return -EINVAL; + } } if (time_state->info[i].units != HID_USAGE_SENSOR_UNITS_NOT_SPECIFIED && -- cgit v1.2.3 From f463fa2ca35c9096d65b5d47ad116364bbfaf1b5 Mon Sep 17 00:00:00 2001 From: Alexander Holler Date: Thu, 23 May 2013 10:37:59 +1000 Subject: rtc: rtc-hid-sensor-time: allow 16 and 32 bit values for all attributes. There is no real reason to not support 16 or 32 bit values too. Signed-off-by: Alexander Holler Cc: Alessandro Zummo Cc: Lars-Peter Clausen Cc: Jonathan Cameron Cc: Jiri Kosina Cc: John Stultz Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-hid-sensor-time.c | 59 +++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index b8c1b106a0b7..7273b0139e5c 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -76,6 +76,20 @@ static int hid_time_proc_event(struct hid_sensor_hub_device *hsdev, return 0; } +static u32 hid_time_value(size_t raw_len, char *raw_data) +{ + switch (raw_len) { + case 1: + return *(u8 *)raw_data; + case 2: + return *(u16 *)raw_data; + case 4: + return *(u32 *)raw_data; + default: + return (u32)(~0U); /* 0xff... or -1 to denote an error */ + } +} + static int hid_time_capture_sample(struct hid_sensor_hub_device *hsdev, unsigned usage_id, size_t raw_len, char *raw_data, void *priv) @@ -85,29 +99,35 @@ static int hid_time_capture_sample(struct hid_sensor_hub_device *hsdev, switch (usage_id) { case HID_USAGE_SENSOR_TIME_YEAR: + /* + * The draft for HID-sensors (HUTRR39) currently doesn't define + * the range for the year attribute. Therefor we support + * 8 bit (0-99) and 16 or 32 bits (full) as size for the year. + */ if (raw_len == 1) { time_buf->tm_year = *(u8 *)raw_data; if (time_buf->tm_year < 70) /* assume we are in 1970...2069 */ time_buf->tm_year += 100; } else - time_buf->tm_year = *(u16 *)raw_data-1900; + time_buf->tm_year = + (int)hid_time_value(raw_len, raw_data)-1900; break; case HID_USAGE_SENSOR_TIME_MONTH: - /* sensor sending the month as 1-12, we need 0-11 */ - time_buf->tm_mon = *(u8 *)raw_data-1; + /* sensors are sending the month as 1-12, we need 0-11 */ + time_buf->tm_mon = (int)hid_time_value(raw_len, raw_data)-1; break; case HID_USAGE_SENSOR_TIME_DAY: - time_buf->tm_mday = *(u8 *)raw_data; + time_buf->tm_mday = (int)hid_time_value(raw_len, raw_data); break; case HID_USAGE_SENSOR_TIME_HOUR: - time_buf->tm_hour = *(u8 *)raw_data; + time_buf->tm_hour = (int)hid_time_value(raw_len, raw_data); break; case HID_USAGE_SENSOR_TIME_MINUTE: - time_buf->tm_min = *(u8 *)raw_data; + time_buf->tm_min = (int)hid_time_value(raw_len, raw_data); break; case HID_USAGE_SENSOR_TIME_SECOND: - time_buf->tm_sec = *(u8 *)raw_data; + time_buf->tm_sec = (int)hid_time_value(raw_len, raw_data); break; default: return -EINVAL; @@ -153,28 +173,13 @@ static int hid_time_parse_report(struct platform_device *pdev, "not all needed attributes inside the same report!\n"); return -EINVAL; } - if (time_state->info[i].size != 1) { - /* - * The draft for HID-sensors (HUTRR39) currently - * doesn't define the range for the year attribute. - * Therefor we support both 8 bit (0-99) and 16 bit - * (full) as size for the year. - */ - if (time_state->info[i].attrib_id != - HID_USAGE_SENSOR_TIME_YEAR) { - dev_err(&pdev->dev, - "attribute '%s' not 8 bits wide!\n", - hid_time_attrib_name( - time_state->info[i].attrib_id)); - return -EINVAL; - } - if (time_state->info[i].size != 2) { - dev_err(&pdev->dev, - "attribute '%s' not 8 or 16 bits wide!\n", + if (time_state->info[i].size == 3 || + time_state->info[i].size > 4) { + dev_err(&pdev->dev, + "attribute '%s' not 8, 16 or 32 bits wide!\n", hid_time_attrib_name( time_state->info[i].attrib_id)); - return -EINVAL; - } + return -EINVAL; } if (time_state->info[i].units != HID_USAGE_SENSOR_UNITS_NOT_SPECIFIED && -- cgit v1.2.3 From 1c0df1aa7244ebdeae58874d7fed5f8ae61fae53 Mon Sep 17 00:00:00 2001 From: Alexander Holler Date: Thu, 23 May 2013 10:37:59 +1000 Subject: rtc: rtc-hid-sensor-time: add option hctosys to set time at boot drivers/rtc/hctosys (CONFIG_RTC_HCTOSYS) doesn't work for rtc-hid-sensor-time because it will be called in late_init, and thus before rtc-hid-sensor-time gets loaded. To set the time through rtc-hid-sensor-time at startup, the module now checks by default if the system time is before 1970-01-02 and sets the system time (once) if this is the case. To disable this behaviour, set the module option hctosys to zero, e.g. by using rtc-hid-sensor-time.hctosys=0 at the kernel command line if the driver is statically linked into the kernel. Signed-off-by: Alexander Holler Cc: Alessandro Zummo Cc: Lars-Peter Clausen Cc: Jonathan Cameron Cc: Jiri Kosina Cc: John Stultz Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-hid-sensor-time.c | 94 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index 7273b0139e5c..db8503a47be7 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -27,6 +27,12 @@ /* Usage ID from spec for Time: 0x2000A0 */ #define DRIVER_NAME "HID-SENSOR-2000a0" /* must be lowercase */ +static bool hid_time_hctosys_enabled = 1; +module_param_named(hctosys, hid_time_hctosys_enabled, bool, 0644); +MODULE_PARM_DESC(hctosys, + "set the system time (once) if it is before 1970-01-02"); +static bool hid_time_time_set_once; + enum hid_time_channel { CHANNEL_SCAN_INDEX_YEAR, CHANNEL_SCAN_INDEX_MONTH, @@ -37,6 +43,11 @@ enum hid_time_channel { TIME_RTC_CHANNEL_MAX, }; +struct hid_time_workts { + struct work_struct work; + struct hid_time_state *time_state; +}; + struct hid_time_state { struct hid_sensor_hub_callbacks callbacks; struct hid_sensor_common common_attributes; @@ -46,6 +57,7 @@ struct hid_time_state { struct completion comp_last_time; struct rtc_time time_buf; struct rtc_device *rtc; + struct hid_time_workts *workts; }; static const u32 hid_time_addresses[TIME_RTC_CHANNEL_MAX] = { @@ -62,6 +74,49 @@ static const char * const hid_time_channel_names[TIME_RTC_CHANNEL_MAX] = { "year", "month", "day", "hour", "minute", "second", }; +static void hid_time_hctosys(struct hid_time_state *time_state) +{ + struct timespec ts; + int rc = rtc_valid_tm(&time_state->last_time); + + if (rc) { + dev_err(time_state->rtc->dev.parent, + "hctosys: invalid date/time\n"); + return; + } + + getnstimeofday(&ts); + if (ts.tv_sec >= 86400) /* 1970-01-02 00:00:00 UTC */ + /* + * Security: don't let a hot-pluggable device change + * a valid system time. + * In absence of a kernel-wide flag like 'systime_was_set', + * we check if the system time is earlier than 1970-01-02. + * Just using a flag inside the RTC subsystem (e.g. from + * hctosys) doesn't work, because the time could be set + * by userspace (NTP, date, user, ...) or something else + * in the kernel too. + * Using a whole day to decide if something else has set the + * time should be enough for even the longest boot to complete + * (including possible forced fscks) and load this module. + */ + return; + + ts.tv_nsec = NSEC_PER_SEC >> 1; + rtc_tm_to_time(&time_state->last_time, &ts.tv_sec); + rc = do_settimeofday(&ts); + dev_info(time_state->rtc->dev.parent, + "hctosys: setting system clock to " + "%d-%02d-%02d %02d:%02d:%02d UTC (%u)\n", + time_state->last_time.tm_year + 1900, + time_state->last_time.tm_mon + 1, + time_state->last_time.tm_mday, + time_state->last_time.tm_hour, + time_state->last_time.tm_min, + time_state->last_time.tm_sec, + (unsigned int) ts.tv_sec); +} + /* Callback handler to send event after all samples are received and captured */ static int hid_time_proc_event(struct hid_sensor_hub_device *hsdev, unsigned usage_id, void *priv) @@ -73,6 +128,10 @@ static int hid_time_proc_event(struct hid_sensor_hub_device *hsdev, time_state->last_time = time_state->time_buf; spin_unlock_irqrestore(&time_state->lock_last_time, flags); complete(&time_state->comp_last_time); + if (unlikely(!hid_time_time_set_once && hid_time_hctosys_enabled)) { + hid_time_time_set_once = 1; + hid_time_hctosys(time_state); + } return 0; } @@ -237,6 +296,19 @@ static const struct rtc_class_ops hid_time_rtc_ops = { .read_time = hid_rtc_read_time, }; +static void hid_time_request_report_work(struct work_struct *work) +{ + struct hid_time_state *time_state = + container_of(work, struct hid_time_workts, work) + ->time_state; + /* get a report with all values through requesting one value */ + sensor_hub_input_attr_get_raw_value( + time_state->common_attributes.hsdev, HID_USAGE_SENSOR_TIME, + hid_time_addresses[0], time_state->info[0].report_id); + time_state->workts = NULL; + kfree(work); +} + static int hid_time_probe(struct platform_device *pdev) { int ret = 0; @@ -288,13 +360,35 @@ static int hid_time_probe(struct platform_device *pdev) return PTR_ERR(time_state->rtc); } + if (!hid_time_time_set_once && hid_time_hctosys_enabled) { + /* + * Request a HID report to set the time. + * Calling sensor_hub_..._get_raw_value() here directly + * doesn't work, therefor we have to use a work. + */ + time_state->workts = kmalloc(sizeof(struct hid_time_workts), + GFP_KERNEL); + if (time_state->workts == NULL) + return -ENOMEM; + time_state->workts->time_state = time_state; + INIT_WORK(&time_state->workts->work, + hid_time_request_report_work); + schedule_work(&time_state->workts->work); + } + return ret; } static int hid_time_remove(struct platform_device *pdev) { struct hid_sensor_hub_device *hsdev = pdev->dev.platform_data; + struct hid_time_state *time_state = platform_get_drvdata(pdev); + if (time_state->workts) { + cancel_work_sync(&time_state->workts->work); + kfree(time_state->workts); + time_state->workts = NULL; + } sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_TIME); return 0; -- cgit v1.2.3 From 0a072feaf79bb215f2cc9a2f152024f002a1be96 Mon Sep 17 00:00:00 2001 From: Alexander Holler Date: Thu, 23 May 2013 10:37:59 +1000 Subject: rtc: rtc-hid-sensor-time: add support for milliseconds If a device sends milliseconds too, the driver will use them if it sets the system clock at startup (through module option hctosys). Signed-off-by: Alexander Holler Cc: Alessandro Zummo Cc: Lars-Peter Clausen Cc: Jonathan Cameron Cc: Jiri Kosina Cc: John Stultz Cc: Jingoo Han Signed-off-by: Andrew Morton --- drivers/rtc/rtc-hid-sensor-time.c | 37 +++++++++++++++++++++++++++++++++---- include/linux/hid-sensor-ids.h | 1 + 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index db8503a47be7..4aca12d1d564 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -40,6 +40,7 @@ enum hid_time_channel { CHANNEL_SCAN_INDEX_HOUR, CHANNEL_SCAN_INDEX_MINUTE, CHANNEL_SCAN_INDEX_SECOND, + CHANNEL_SCAN_INDEX_MILLISECOND, /* optional */ TIME_RTC_CHANNEL_MAX, }; @@ -53,9 +54,11 @@ struct hid_time_state { struct hid_sensor_common common_attributes; struct hid_sensor_hub_attribute_info info[TIME_RTC_CHANNEL_MAX]; struct rtc_time last_time; + unsigned last_ms; /* == UINT_MAX to indicate ms aren't supported */ spinlock_t lock_last_time; struct completion comp_last_time; struct rtc_time time_buf; + unsigned ms_buf; struct rtc_device *rtc; struct hid_time_workts *workts; }; @@ -67,16 +70,18 @@ static const u32 hid_time_addresses[TIME_RTC_CHANNEL_MAX] = { HID_USAGE_SENSOR_TIME_HOUR, HID_USAGE_SENSOR_TIME_MINUTE, HID_USAGE_SENSOR_TIME_SECOND, + HID_USAGE_SENSOR_TIME_MILLISECOND, /* optional */ }; /* Channel names for verbose error messages */ static const char * const hid_time_channel_names[TIME_RTC_CHANNEL_MAX] = { - "year", "month", "day", "hour", "minute", "second", + "year", "month", "day", "hour", "minute", "second", "millisecond", }; static void hid_time_hctosys(struct hid_time_state *time_state) { struct timespec ts; + char msbuf[5]; int rc = rtc_valid_tm(&time_state->last_time); if (rc) { @@ -102,18 +107,25 @@ static void hid_time_hctosys(struct hid_time_state *time_state) */ return; - ts.tv_nsec = NSEC_PER_SEC >> 1; + if (time_state->last_ms != UINT_MAX) { + ts.tv_nsec = time_state->last_ms * NSEC_PER_MSEC; + snprintf(msbuf, sizeof(msbuf), ":%03d", time_state->last_ms); + } else { + ts.tv_nsec = NSEC_PER_SEC >> 1; + *msbuf = 0; + } rtc_tm_to_time(&time_state->last_time, &ts.tv_sec); rc = do_settimeofday(&ts); dev_info(time_state->rtc->dev.parent, "hctosys: setting system clock to " - "%d-%02d-%02d %02d:%02d:%02d UTC (%u)\n", + "%d-%02d-%02d %02d:%02d:%02d%s UTC (%u)\n", time_state->last_time.tm_year + 1900, time_state->last_time.tm_mon + 1, time_state->last_time.tm_mday, time_state->last_time.tm_hour, time_state->last_time.tm_min, time_state->last_time.tm_sec, + msbuf, (unsigned int) ts.tv_sec); } @@ -126,6 +138,8 @@ static int hid_time_proc_event(struct hid_sensor_hub_device *hsdev, spin_lock_irqsave(&time_state->lock_last_time, flags); time_state->last_time = time_state->time_buf; + if (time_state->last_ms != UINT_MAX) + time_state->last_ms = time_state->ms_buf; spin_unlock_irqrestore(&time_state->lock_last_time, flags); complete(&time_state->comp_last_time); if (unlikely(!hid_time_time_set_once && hid_time_hctosys_enabled)) { @@ -188,6 +202,9 @@ static int hid_time_capture_sample(struct hid_sensor_hub_device *hsdev, case HID_USAGE_SENSOR_TIME_SECOND: time_buf->tm_sec = (int)hid_time_value(raw_len, raw_data); break; + case HID_USAGE_SENSOR_TIME_MILLISECOND: + time_state->ms_buf = hid_time_value(raw_len, raw_data); + break; default: return -EINVAL; } @@ -214,12 +231,18 @@ static int hid_time_parse_report(struct platform_device *pdev, { int report_id, i; - for (i = 0; i < TIME_RTC_CHANNEL_MAX; ++i) + /* Check if all required attributes are available */ + for (i = 0; i < CHANNEL_SCAN_INDEX_MILLISECOND; ++i) if (sensor_hub_input_get_attribute_info(hsdev, HID_INPUT_REPORT, usage_id, hid_time_addresses[i], &time_state->info[i]) < 0) return -EINVAL; + if (!sensor_hub_input_get_attribute_info(hsdev, HID_INPUT_REPORT, + usage_id, hid_time_addresses[i], &time_state->info[i])) { + dev_info(&pdev->dev, "milliseconds supported\n"); + time_state->last_ms = 0; + } /* Check the (needed) attributes for sanity */ report_id = time_state->info[0].report_id; if (report_id < 0) { @@ -227,6 +250,10 @@ static int hid_time_parse_report(struct platform_device *pdev, return -EINVAL; } for (i = 0; i < TIME_RTC_CHANNEL_MAX; ++i) { + if (time_state->info[i].attrib_id == + HID_USAGE_SENSOR_TIME_MILLISECOND && + time_state->last_ms == UINT_MAX) + continue; if (time_state->info[i].report_id != report_id) { dev_err(&pdev->dev, "not all needed attributes inside the same report!\n"); @@ -319,6 +346,8 @@ static int hid_time_probe(struct platform_device *pdev) if (time_state == NULL) return -ENOMEM; + time_state->last_ms = UINT_MAX; + platform_set_drvdata(pdev, time_state); spin_lock_init(&time_state->lock_last_time); diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h index 6f24446e7669..b519c47c635b 100644 --- a/include/linux/hid-sensor-ids.h +++ b/include/linux/hid-sensor-ids.h @@ -74,6 +74,7 @@ #define HID_USAGE_SENSOR_TIME_HOUR 0x200525 #define HID_USAGE_SENSOR_TIME_MINUTE 0x200526 #define HID_USAGE_SENSOR_TIME_SECOND 0x200527 +#define HID_USAGE_SENSOR_TIME_MILLISECOND 0x200528 /* Units */ #define HID_USAGE_SENSOR_UNITS_NOT_SPECIFIED 0x00 -- cgit v1.2.3 From f585914a2e82b91eb20095404caa41b2d96c1c46 Mon Sep 17 00:00:00 2001 From: Bernie Thompson Date: Thu, 23 May 2013 10:38:00 +1000 Subject: rtc: add ability to push out an existing wakealarm using sysfs This adds the ability for the rtc sysfs code to handle += characters at the beginning of a wakealarm setting string. This will allow the user to attempt to push out an existing wakealarm by a provided amount. In the case that the += characters are provided but the alarm is not active -EINVAL is returned. his is useful, at least for my purposes in suspend/resume testing. The basic test goes something like: 1. Set a wake alarm from userspace 5 seconds in the future 2. Start the suspend process (echo mem > /sys/power/state) 3. After ~2.5 seconds if userspace is still running (using another thread to check this), move the wake alarm 5 more seconds If the "move" involves an unset of the wakealarm then there's a period of time where the system is midway through suspending but has no wake alarm. It will get stuck. We'd rather not remove the "move" since the idea is to avoid a cancelled suspend when the alarm fires _during_ suspend. It is difficult for the test to tell the difference between a suspend that was cancelled because the alarm fired too early and a suspend that was Signed-off-by: Bernie Thompson Cc: Alessandro Zummo Cc: Doug Anderson Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- Documentation/rtc.txt | 7 ++++--- drivers/rtc/rtc-sysfs.c | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt index 32aa4002de4a..596b60c08b74 100644 --- a/Documentation/rtc.txt +++ b/Documentation/rtc.txt @@ -153,9 +153,10 @@ since_epoch: The number of seconds since the epoch according to the RTC time: RTC-provided time wakealarm: The time at which the clock will generate a system wakeup event. This is a one shot wakeup event, so must be reset - after wake if a daily wakeup is required. Format is either - seconds since the epoch or, if there's a leading +, seconds - in the future. + after wake if a daily wakeup is required. Format is seconds since + the epoch by default, or if there's a leading +, seconds in the + future, or if there is a leading +=, seconds ahead of the current + alarm. IOCTL INTERFACE --------------- diff --git a/drivers/rtc/rtc-sysfs.c b/drivers/rtc/rtc-sysfs.c index b70e2bb63645..4b26f8672b2d 100644 --- a/drivers/rtc/rtc-sysfs.c +++ b/drivers/rtc/rtc-sysfs.c @@ -164,6 +164,7 @@ rtc_sysfs_set_wakealarm(struct device *dev, struct device_attribute *attr, { ssize_t retval; unsigned long now, alarm; + unsigned long push = 0; struct rtc_wkalrm alm; struct rtc_device *rtc = to_rtc_device(dev); char *buf_ptr; @@ -180,13 +181,17 @@ rtc_sysfs_set_wakealarm(struct device *dev, struct device_attribute *attr, buf_ptr = (char *)buf; if (*buf_ptr == '+') { buf_ptr++; - adjust = 1; + if (*buf_ptr == '=') { + buf_ptr++; + push = 1; + } else + adjust = 1; } alarm = simple_strtoul(buf_ptr, NULL, 0); if (adjust) { alarm += now; } - if (alarm > now) { + if (alarm > now || push) { /* Avoid accidentally clobbering active alarms; we can't * entirely prevent that here, without even the minimal * locking from the /dev/rtcN api. @@ -194,9 +199,14 @@ rtc_sysfs_set_wakealarm(struct device *dev, struct device_attribute *attr, retval = rtc_read_alarm(rtc, &alm); if (retval < 0) return retval; - if (alm.enabled) - return -EBUSY; - + if (alm.enabled) { + if (push) { + rtc_tm_to_time(&alm.time, &push); + alarm += push; + } else + return -EBUSY; + } else if (push) + return -EINVAL; alm.enabled = 1; } else { alm.enabled = 0; -- cgit v1.2.3 From 0d67d3f529ee22bba204dd3932ab23f3fdb5903b Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 23 May 2013 10:38:00 +1000 Subject: drivers/rtc/rtc-tps6586x.c: device wakeup flags correction Use device_init_wakeup() instead of device_set_wakeup_capable() and move it before rtc dev registering. This fixes issue with alarmtimer that checks wakeup capability with device_may_wakeup() on device add. Signed-off-by: Dmitry Osipenko Cc: Laxman dewangan Cc: Venu Byravarasu Signed-off-by: Andrew Morton --- drivers/rtc/rtc-tps6586x.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-tps6586x.c b/drivers/rtc/rtc-tps6586x.c index 459c2ffc95a6..426901cef14f 100644 --- a/drivers/rtc/rtc-tps6586x.c +++ b/drivers/rtc/rtc-tps6586x.c @@ -273,6 +273,8 @@ static int tps6586x_rtc_probe(struct platform_device *pdev) return ret; } + device_init_wakeup(&pdev->dev, 1); + platform_set_drvdata(pdev, rtc); rtc->rtc = devm_rtc_device_register(&pdev->dev, dev_name(&pdev->dev), &tps6586x_rtc_ops, THIS_MODULE); @@ -292,7 +294,6 @@ static int tps6586x_rtc_probe(struct platform_device *pdev) goto fail_rtc_register; } disable_irq(rtc->irq); - device_set_wakeup_capable(&pdev->dev, 1); return 0; fail_rtc_register: -- cgit v1.2.3 From 98ea3876320c6da9697daef3bd706ad687c9c7e0 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 23 May 2013 10:38:01 +1000 Subject: fs/fat: use fat_msg() to replace printk() in __fat_fs_error() Signed-off-by: Gu Zheng Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/misc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 359d307b5507..628e22a5a543 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); + fat_msg(sb, KERN_ERR, "error, %pV", &vaf); va_end(args); } @@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { sb->s_flags |= MS_RDONLY; - printk(KERN_ERR "FAT-fs (%s): Filesystem has been " - "set read-only\n", sb->s_id); + fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } EXPORT_SYMBOL_GPL(__fat_fs_error); -- cgit v1.2.3 From 7205321836eac2f6a3c5577b9bc0c3c68ad886ce Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 23 May 2013 10:38:01 +1000 Subject: fat: additions to support fat_fallocate Implement preallocation via the fallocate syscall on VFAT partitions. With FALLOC_FL_KEEP_SIZE, there is no way to distinguish if the mismatch between i_size and no. of clusters allocated is a consequence of fallocate or just plain corruption. When a non fallocate aware (old) linux fat driver tries to write to such a file, it throws an error.Also, fsck detects this as inconsistency and truncates the prealloc'd blocks. To avoid this, as suggested by OGAWA, remove changes that make fallocate persistent across mounts and restrict lifetime of blocks from fallocate(2) to file release. Signed-off-by: Namjae Jeon Signed-off-by: Ravishankar N Signed-off-by: Amit Sahrawat Cc: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/file.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/fat/inode.c | 53 ++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 1 deletion(-) diff --git a/fs/fat/file.c b/fs/fat/file.c index b0b632e50ddb..73264395f705 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -17,8 +17,11 @@ #include #include #include +#include #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; @@ -140,6 +143,22 @@ static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, static int fat_file_release(struct inode *inode, struct file *filp) { + + struct super_block *sb = inode->i_sb; + loff_t mmu_private_ideal; + + /* + * Release unwritten fallocated blocks on file release. + * Do this only when the last open file descriptor is closed. + */ + mutex_lock(&inode->i_mutex); + mmu_private_ideal = round_up(inode->i_size, sb->s_blocksize); + + if (mmu_private_ideal < MSDOS_I(inode)->mmu_private && + filp->f_dentry->d_count == 1) + fat_truncate_blocks(inode, inode->i_size); + mutex_unlock(&inode->i_mutex); + if ((filp->f_mode & FMODE_WRITE) && MSDOS_SB(inode->i_sb)->options.flush) { fat_flush_inodes(inode->i_sb, inode, NULL); @@ -174,6 +193,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) @@ -212,6 +232,88 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. The + * allocated clusters are freed in fat_file_release(). + */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int cluster, fclus, dclus; + int nr_cluster; /* Number of clusters to be allocated */ + loff_t nr_bytes; /* Number of bytes to be allocated*/ + loff_t free_bytes; /* Unused bytes in the last cluster of file*/ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + if ((offset + len) <= MSDOS_I(inode)->mmu_private) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): Blocks already allocated"); + err = -EINVAL; + goto error; + } + + if (mode & FALLOC_FL_KEEP_SIZE) { + /* First compute the number of clusters to be allocated */ + if (inode->i_size > 0) { + err = fat_get_cluster(inode, FAT_ENT_EOF, + &fclus, &dclus); + if (err < 0) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_get_cluster() error"); + goto error; + } + free_bytes = ((fclus + 1) << sbi->cluster_bits) - + inode->i_size; + nr_bytes = offset + len - inode->i_size - free_bytes; + MSDOS_I(inode)->mmu_private = (fclus + 1) << + sbi->cluster_bits; + } else + nr_bytes = offset + len - inode->i_size; + + nr_cluster = (nr_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation.We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_alloc_clusters(inode, &cluster, 1); + if (err) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_alloc_clusters() error"); + goto error; + } + err = fat_chain_add(inode, cluster, 1); + if (err) { + fat_free_clusters(inode, cluster); + goto error; + } + MSDOS_I(inode)->mmu_private += sbi->cluster_size; + } + } else { + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + if (err) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_cont_expand() error"); + } + } + +error: + mutex_unlock(&inode->i_mutex); + return err; +} + /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { @@ -378,6 +480,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = dentry->d_inode; unsigned int ia_valid; int error; + loff_t mmu_private_ideal; + + mmu_private_ideal = round_up(inode->i_size, dentry->d_sb->s_blocksize); /* Check for setting the inode time. */ ia_valid = attr->ia_valid; @@ -403,7 +508,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { inode_dio_wait(inode); - if (attr->ia_size > inode->i_size) { + if (attr->ia_size > inode->i_size && + MSDOS_I(inode)->mmu_private <= mmu_private_ideal) { error = fat_cont_expand(inode, attr->ia_size); if (error || attr->ia_valid == ATTR_SIZE) goto out; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5d4513cb1b3c..99a76c274ff5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -152,11 +152,64 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) } } +static int fat_zero_falloc_area(struct file *file, + struct address_space *mapping, loff_t pos) +{ + struct page *page; + struct inode *inode = mapping->host; + loff_t curpos = i_size_read(inode); + size_t count = pos - curpos; + int err; + + do { + unsigned offset, bytes; + void *fsdata; + + offset = (curpos & (PAGE_CACHE_SIZE - 1)); + bytes = PAGE_CACHE_SIZE - offset; + bytes = min(bytes, count); + + err = pagecache_write_begin(NULL, mapping, curpos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + break; + + zero_user(page, offset, bytes); + + err = pagecache_write_end(NULL, mapping, curpos, bytes, bytes, + page, fsdata); + if (err < 0) + break; + curpos += bytes; + count -= bytes; + err = 0; + } while (count); + + return err; +} + static int fat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { int err; + loff_t mmu_private_ideal, mmu_private_actual; + loff_t size; + struct inode *inode = mapping->host; + struct super_block *sb = inode->i_sb; + + size = i_size_read(inode); + mmu_private_actual = MSDOS_I(inode)->mmu_private; + mmu_private_ideal = round_up(size, sb->s_blocksize); + if ((mmu_private_actual > mmu_private_ideal) && (pos > size)) { + err = fat_zero_falloc_area(file, mapping, pos); + if (err) { + fat_msg(sb, KERN_ERR, + "Error (%d) zeroing fallocated area", err); + return err; + } + } *pagep = NULL; err = cont_write_begin(file, mapping, pos, len, flags, -- cgit v1.2.3 From e6123c54307e9d681aa86f1d786d0becdfc79193 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:38:01 +1000 Subject: fat-additions-to-support-fat_fallocate-fix fix min() warning Cc: Amit Sahrawat Cc: Namjae Jeon Cc: OGAWA Hirofumi Cc: Ravishankar N Reported-by: Wu Fengguang Signed-off-by: Andrew Morton --- fs/fat/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 99a76c274ff5..f49de9379bcd 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -162,7 +162,8 @@ static int fat_zero_falloc_area(struct file *file, int err; do { - unsigned offset, bytes; + unsigned offset; + size_t bytes; void *fsdata; offset = (curpos & (PAGE_CACHE_SIZE - 1)); -- cgit v1.2.3 From 463e7f5bacea8bc64171d96b1e353ba1ba864851 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 May 2013 10:38:02 +1000 Subject: Documentation/CodingStyle: allow multiple return statements per function A surprising number of newbies interpret this section to mean that only one return statement is allowed per function. Part of the problem is that the "one return statement per function" rule is an actual style guideline that people are used to from other projects. Signed-off-by: Dan Carpenter Cc: Eduardo Valentin Cc: Rob Landley Signed-off-by: Andrew Morton --- Documentation/CodingStyle | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle index e00b8f0dde52..7fe0546c504a 100644 --- a/Documentation/CodingStyle +++ b/Documentation/CodingStyle @@ -389,7 +389,8 @@ Albeit deprecated by some people, the equivalent of the goto statement is used frequently by compilers in form of the unconditional jump instruction. The goto statement comes in handy when a function exits from multiple -locations and some common work such as cleanup has to be done. +locations and some common work such as cleanup has to be done. If there is no +cleanup needed then just return directly. The rationale is: -- cgit v1.2.3 From 63cd73b39b7ba3b5d97e3cf14ba5f8a5915ffcbe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:02 +1000 Subject: ptrace/x86: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit 87dc669ba257 ("hw_breakpoints: Fix racy access to ptrace breakpoints"). The patch was fine but we can no longer race with SIGKILL after 9899d11f ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. The patch only removes ptrace_get_breakpoints/ptrace_put_breakpoints and does a couple of "while at it" cleanups, it doesn't remove other changes from the reverted commit. Signed-off-by: Oleg Nesterov Acked-by: Ingo Molnar Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/x86/kernel/ptrace.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 29a8120e6fe8..7a98b21945aa 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -641,9 +641,6 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) unsigned len, type; struct perf_event *bp; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: @@ -692,9 +689,7 @@ restore: goto restore; } - ptrace_put_breakpoints(tsk); - - return ((orig_ret < 0) ? orig_ret : rc); + return orig_ret < 0 ? orig_ret : rc; } /* @@ -706,18 +701,10 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) unsigned long val = 0; if (n < HBP_NUM) { - struct perf_event *bp; + struct perf_event *bp = thread->ptrace_bps[n]; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - - bp = thread->ptrace_bps[n]; - if (!bp) - val = 0; - else + if (bp) val = bp->hw.info.address; - - ptrace_put_breakpoints(tsk); } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { @@ -734,9 +721,6 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, struct perf_event_attr attr; int err = 0; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - if (!t->ptrace_bps[nr]) { ptrace_breakpoint_init(&attr); /* @@ -762,7 +746,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, */ if (IS_ERR(bp)) { err = PTR_ERR(bp); - goto put; + goto out; } t->ptrace_bps[nr] = bp; @@ -773,9 +757,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } - -put: - ptrace_put_breakpoints(tsk); +out: return err; } -- cgit v1.2.3 From 3874e49266a5816ee4be1d81cf72de384d7d793a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:02 +1000 Subject: ptrace/powerpc: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit 07fa7a0a8a586 ("hw_breakpoints: Fix racy access to ptrace breakpoints") and removes ptrace_get/put_breakpoints() added by other commits. The patch was fine but we can no longer race with SIGKILL after 9899d11f ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. Signed-off-by: Oleg Nesterov Acked-by: Michael Neuling Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/powerpc/kernel/ptrace.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 98c2fc198712..6645e573c44c 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -975,16 +975,12 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL; hw_brk.len = 8; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(task) < 0) - return -ESRCH; - bp = thread->ptrace_bps[0]; if ((!data) || !(hw_brk.type & HW_BRK_TYPE_RDWR)) { if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } - ptrace_put_breakpoints(task); return 0; } if (bp) { @@ -997,11 +993,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, ret = modify_user_hw_breakpoint(bp, &attr); if (ret) { - ptrace_put_breakpoints(task); return ret; } thread->ptrace_bps[0] = bp; - ptrace_put_breakpoints(task); thread->hw_brk = hw_brk; return 0; } @@ -1016,12 +1010,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, ptrace_triggered, NULL, task); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; - ptrace_put_breakpoints(task); return PTR_ERR(bp); } - ptrace_put_breakpoints(task); - #endif /* CONFIG_HAVE_HW_BREAKPOINT */ task->thread.hw_brk = hw_brk; #else /* CONFIG_PPC_ADV_DEBUG_REGS */ @@ -1440,9 +1431,6 @@ static long ppc_set_hwdebug(struct task_struct *child, if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) brk.type |= HW_BRK_TYPE_WRITE; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - /* * Check if the request is for 'range' breakpoints. We can * support it if range < 8 bytes. @@ -1450,12 +1438,10 @@ static long ppc_set_hwdebug(struct task_struct *child, if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) { len = bp_info->addr2 - bp_info->addr; } else if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) { - ptrace_put_breakpoints(child); return -EINVAL; } bp = thread->ptrace_bps[0]; if (bp) { - ptrace_put_breakpoints(child); return -ENOSPC; } @@ -1469,11 +1455,9 @@ static long ppc_set_hwdebug(struct task_struct *child, ptrace_triggered, NULL, child); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; - ptrace_put_breakpoints(child); return PTR_ERR(bp); } - ptrace_put_breakpoints(child); return 1; #endif /* CONFIG_HAVE_HW_BREAKPOINT */ @@ -1517,16 +1501,12 @@ static long ppc_del_hwdebug(struct task_struct *child, long data) return -EINVAL; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - bp = thread->ptrace_bps[0]; if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } else ret = -ENOENT; - ptrace_put_breakpoints(child); return ret; #else /* CONFIG_HAVE_HW_BREAKPOINT */ if (child->thread.hw_brk.address == 0) -- cgit v1.2.3 From 279780be8b5cb218a98491a9572bbabb34211d1f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:03 +1000 Subject: ptrace/arm: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit bf0b8f4b55e ("hw_breakpoints: Fix racy access to ptrace breakpoints"). The patch was fine but we can no longer race with SIGKILL after 9899d11f ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. Signed-off-by: Oleg Nesterov Acked-by: Will Deacon Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/arm/kernel/ptrace.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 03deeffd9f6d..41668e56685f 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -886,20 +886,12 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef CONFIG_HAVE_HW_BREAKPOINT case PTRACE_GETHBPREGS: - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - ret = ptrace_gethbpregs(child, addr, (unsigned long __user *)data); - ptrace_put_breakpoints(child); break; case PTRACE_SETHBPREGS: - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - ret = ptrace_sethbpregs(child, addr, (unsigned long __user *)data); - ptrace_put_breakpoints(child); break; #endif -- cgit v1.2.3 From 5459c9456c72812988189fbab755e686d2275475 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:03 +1000 Subject: ptrace/sh: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit e0ac8457d020c ("hw_breakpoints: Fix racy access to ptrace breakpoints"). The patch was fine but we can no longer race with SIGKILL after 9899d11f ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. Signed-off-by: Oleg Nesterov Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/sh/kernel/ptrace_32.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 81f999a672f6..668c81631c08 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -117,11 +117,7 @@ void user_enable_single_step(struct task_struct *child) set_tsk_thread_flag(child, TIF_SINGLESTEP); - if (ptrace_get_breakpoints(child) < 0) - return; - set_single_step(child, pc); - ptrace_put_breakpoints(child); } void user_disable_single_step(struct task_struct *child) -- cgit v1.2.3 From 026431a774e62ee818df54413a9492844c1b0114 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:03 +1000 Subject: ptrace: revert "Prepare to fix racy accesses on task breakpoints" This reverts commit bf26c018490c2fce ("Prepare to fix racy accesses on task breakpoints"). The patch was fine but we can no longer race with SIGKILL after 9899d11f ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. Now that ptrace_get_breakpoints/ptrace_put_breakpoints have no callers, we can kill them and remove task->ptrace_bp_refcnt. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Acked-by: Michael Neuling Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- include/linux/ptrace.h | 10 ---------- include/linux/sched.h | 3 --- kernel/exit.c | 2 +- kernel/ptrace.c | 16 ---------------- 4 files changed, 1 insertion(+), 30 deletions(-) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 89573a33ab3c..07d0df6bf768 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -142,9 +142,6 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) { INIT_LIST_HEAD(&child->ptrace_entry); INIT_LIST_HEAD(&child->ptraced); -#ifdef CONFIG_HAVE_HW_BREAKPOINT - atomic_set(&child->ptrace_bp_refcnt, 1); -#endif child->jobctl = 0; child->ptrace = 0; child->parent = child->real_parent; @@ -351,11 +348,4 @@ extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); -#ifdef CONFIG_HAVE_HW_BREAKPOINT -extern int ptrace_get_breakpoints(struct task_struct *tsk); -extern void ptrace_put_breakpoints(struct task_struct *tsk); -#else -static inline void ptrace_put_breakpoints(struct task_struct *tsk) { } -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ - #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index b3612503ca4a..d7715b23e842 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1404,9 +1404,6 @@ struct task_struct { } memcg_batch; unsigned int memcg_kmem_skip_account; #endif -#ifdef CONFIG_HAVE_HW_BREAKPOINT - atomic_t ptrace_bp_refcnt; -#endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif diff --git a/kernel/exit.c b/kernel/exit.c index e59756275000..a2c19f24735d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -819,7 +819,7 @@ void do_exit(long code) /* * FIXME: do that only when needed, using sched_exit tracepoint */ - ptrace_put_breakpoints(tsk); + flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aed981a3f69c..8cc170b157d7 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1179,19 +1179,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -int ptrace_get_breakpoints(struct task_struct *tsk) -{ - if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) - return 0; - - return -1; -} - -void ptrace_put_breakpoints(struct task_struct *tsk) -{ - if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) - flush_ptrace_hw_breakpoint(tsk); -} -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -- cgit v1.2.3 From dd1e50f6926451ef586d1609a0534cadd59f8c0a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:04 +1000 Subject: ptrace/x86: simplify the "disable" logic in ptrace_write_dr7() ptrace_write_dr7() looks unnecessarily overcomplicated. We can factor out ptrace_modify_breakpoint() and do not do "continue" twice, just we need to pass the proper "disabled" argument to ptrace_modify_breakpoint(). Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/x86/kernel/ptrace.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7a98b21945aa..0649f166d7c6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -637,9 +637,7 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) struct thread_struct *thread = &(tsk->thread); unsigned long old_dr7; int i, orig_ret = 0, rc = 0; - int enabled, second_pass = 0; - unsigned len, type; - struct perf_event *bp; + int second_pass = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); @@ -649,30 +647,22 @@ restore: * appropriate changes to each. */ for (i = 0; i < HBP_NUM; i++) { - enabled = decode_dr7(data, i, &len, &type); - bp = thread->ptrace_bps[i]; - - if (!enabled) { - if (bp) { - /* - * Don't unregister the breakpoints right-away, - * unless all register_user_hw_breakpoint() - * requests have succeeded. This prevents - * any window of opportunity for debug - * register grabbing by other users. - */ - if (!second_pass) - continue; - - rc = ptrace_modify_breakpoint(bp, len, type, - tsk, 1); - if (rc) - break; - } - continue; + unsigned len, type; + bool disabled = !decode_dr7(data, i, &len, &type); + struct perf_event *bp = thread->ptrace_bps[i]; + + if (disabled) { + /* + * Don't unregister the breakpoints right-away, unless + * all register_user_hw_breakpoint() requests have + * succeeded. This prevents any window of opportunity + * for debug register grabbing by other users. + */ + if (!bp || !second_pass) + continue; } - rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled); if (rc) break; } -- cgit v1.2.3 From ed07c0b832a3890cdea6fdf973c8999fd4765e9a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:04 +1000 Subject: ptrace/x86: dont delay "disable" till second pass in ptrace_write_dr7() ptrace_write_dr7() skips ptrace_modify_breakpoint(disabled => true) unless second_pass, this buys nothing but complicates the code and means that we always do the main loop twice even if "disabled" was never true. The comment says: Don't unregister the breakpoints right-away, unless all register_user_hw_breakpoint() requests have succeeded. Firstly, we do not do register_user_hw_breakpoint(), it was removed by 24f1e32c ("hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events"). We are going to restore register_user_hw_breakpoint() (see the next patch) but this doesn't matter, after 44234adc "hw-breakpoints: Modify breakpoints without unregistering them" perf_event_disable() can not hurt, hw_breakpoint_del() does not free the slot. Remove the "second_pass" check from the main loop and simplify the code. Since we have to check "bp != NULL" anyway, the patch also removes the same check in ptrace_modify_breakpoint() and moves the comment into ptrace_write_dr7(). With this patch the second pass is only needed to restore the saved old_dr7. This should never fail, so the patch adds WARN_ON() to catch the potential problems as Frederic suggested. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/x86/kernel/ptrace.c | 53 ++++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0649f166d7c6..98b0a2ccc33c 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -609,14 +609,6 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, int gen_len, gen_type; struct perf_event_attr attr; - /* - * We should have at least an inactive breakpoint at this - * slot. It means the user is writing dr7 without having - * written the address register first - */ - if (!bp) - return -EINVAL; - err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (err) return err; @@ -634,52 +626,47 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long old_dr7; - int i, orig_ret = 0, rc = 0; - int second_pass = 0; + bool second_pass = false; + int i, rc, ret = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); + restore: - /* - * Loop through all the hardware breakpoints, making the - * appropriate changes to each. - */ + rc = 0; for (i = 0; i < HBP_NUM; i++) { unsigned len, type; bool disabled = !decode_dr7(data, i, &len, &type); struct perf_event *bp = thread->ptrace_bps[i]; - if (disabled) { + if (!bp) { + if (disabled) + continue; /* - * Don't unregister the breakpoints right-away, unless - * all register_user_hw_breakpoint() requests have - * succeeded. This prevents any window of opportunity - * for debug register grabbing by other users. + * We should have at least an inactive breakpoint at + * this slot. It means the user is writing dr7 without + * having written the address register first. */ - if (!bp || !second_pass) - continue; + rc = -EINVAL; + break; } rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled); if (rc) break; } - /* - * Make a second pass to free the remaining unused breakpoints - * or to restore the original breakpoints if an error occurred. - */ - if (!second_pass) { - second_pass = 1; - if (rc < 0) { - orig_ret = rc; - data = old_dr7; - } + + /* Restore if the first pass failed, second_pass shouldn't fail. */ + if (rc && !WARN_ON(second_pass)) { + ret = rc; + data = old_dr7; + second_pass = true; goto restore; } - return orig_ret < 0 ? orig_ret : rc; + return ret; } /* -- cgit v1.2.3 From 76a73b3d054594cfefe598ec04cc735bfe2b3f47 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:04 +1000 Subject: ptrace/x86: introduce ptrace_register_breakpoint() No functional changes, preparation. Extract the "register breakpoint" code from ptrace_get_debugreg() into the new/generic helper, ptrace_register_breakpoint(). It will have more users. The patch also adds another simple helper, ptrace_fill_bp_fields(), to factor out the arch_bp_generic_fields() logic in register/modify. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/x86/kernel/ptrace.c | 86 ++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 98b0a2ccc33c..052636801b41 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -601,22 +601,48 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static int -ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, - struct task_struct *tsk, int disabled) +static int ptrace_fill_bp_fields(struct perf_event_attr *attr, + int len, int type, bool disabled) +{ + int err, bp_len, bp_type; + + err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); + if (!err) { + attr->bp_len = bp_len; + attr->bp_type = bp_type; + attr->disabled = disabled; + } + + return err; +} + +static struct perf_event * +ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, + unsigned long addr, bool disabled) { - int err; - int gen_len, gen_type; struct perf_event_attr attr; + int err; + + ptrace_breakpoint_init(&attr); + attr.bp_addr = addr; - err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) - return err; + return ERR_PTR(err); + + return register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); +} - attr = bp->attr; - attr.bp_len = gen_len; - attr.bp_type = gen_type; - attr.disabled = disabled; +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + int disabled) +{ + struct perf_event_attr attr = bp->attr; + int err; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return err; return modify_user_hw_breakpoint(bp, &attr); } @@ -653,7 +679,7 @@ restore: break; } - rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled); + rc = ptrace_modify_breakpoint(bp, len, type, disabled); if (rc) break; } @@ -693,26 +719,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, unsigned long addr) { - struct perf_event *bp; struct thread_struct *t = &tsk->thread; - struct perf_event_attr attr; + struct perf_event *bp = t->ptrace_bps[nr]; int err = 0; - if (!t->ptrace_bps[nr]) { - ptrace_breakpoint_init(&attr); - /* - * Put stub len and type to register (reserve) an inactive but - * correct bp - */ - attr.bp_addr = addr; - attr.bp_len = HW_BREAKPOINT_LEN_1; - attr.bp_type = HW_BREAKPOINT_W; - attr.disabled = 1; - - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, - NULL, tsk); - + if (!bp) { /* + * Put stub len and type to create an inactive but correct bp. + * * CHECKME: the previous code returned -EIO if the addr wasn't * a valid task virtual addr. The new one will return -EINVAL in * this case. @@ -721,20 +735,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, * writing for the user. And anyway this is the previous * behaviour. */ - if (IS_ERR(bp)) { + bp = ptrace_register_breakpoint(tsk, + X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, + addr, true); + if (IS_ERR(bp)) err = PTR_ERR(bp); - goto out; - } - - t->ptrace_bps[nr] = bp; + else + t->ptrace_bps[nr] = bp; } else { - bp = t->ptrace_bps[nr]; + struct perf_event_attr attr = bp->attr; - attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } -out: + return err; } -- cgit v1.2.3 From fb7ad2ddae9fee502087115bcdb79c98724432e0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:05 +1000 Subject: ptrace/x86: ptrace_write_dr7() should create bp if !disabled 24f1e32c ("hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events") introduced the minor regression. Before this commit PTRACE_POKEUSER DR7, enableDR0 PTRACE_POKEUSER DR0, address was perfectly valid, now PTRACE_POKEUSER(DR7) fails if DR0 was not previously initialized by PTRACE_POKEUSER(DR0). Change ptrace_write_dr7() to do ptrace_register_breakpoint(addr => 0) if !bp && !disabled. This fixes watchpoint-zeroaddr from ptrace-tests, see https://bugzilla.redhat.com/show_bug.cgi?id=660204. Signed-off-by: Oleg Nesterov Reported-by: Jan Kratochvil Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/x86/kernel/ptrace.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 052636801b41..5c387b3dce3f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -670,13 +670,16 @@ restore: if (!bp) { if (disabled) continue; - /* - * We should have at least an inactive breakpoint at - * this slot. It means the user is writing dr7 without - * having written the address register first. - */ - rc = -EINVAL; - break; + + bp = ptrace_register_breakpoint(tsk, + len, type, 0, disabled); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + break; + } + + thread->ptrace_bps[i] = bp; + continue; } rc = ptrace_modify_breakpoint(bp, len, type, disabled); -- cgit v1.2.3 From f3f0e233b8d6c0817ff944cef803f6693a3df885 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:05 +1000 Subject: ptrace/x86: cleanup ptrace_set_debugreg() ptrace_set_debugreg() is trivial but looks horrible. Kill the unnecessary goto's and return's to cleanup the code. This matches ptrace_get_debugreg() which also needs the trivial whitespace cleanups. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/x86/kernel/ptrace.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 5c387b3dce3f..7461f50d5bb1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -703,7 +703,7 @@ restore: */ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long val = 0; if (n < HBP_NUM) { @@ -713,7 +713,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) val = bp->hw.info.address; } else if (n == 6) { val = thread->debugreg6; - } else if (n == 7) { + } else if (n == 7) { val = thread->ptrace_dr7; } return val; @@ -761,30 +761,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, static int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { - struct thread_struct *thread = &(tsk->thread); - int rc = 0; - + struct thread_struct *thread = &tsk->thread; /* There are no DR4 or DR5 registers */ - if (n == 4 || n == 5) - return -EIO; + int rc = -EIO; - if (n == 6) { - thread->debugreg6 = val; - goto ret_path; - } if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); - if (rc) - return rc; - } - /* All that's left is DR7 */ - if (n == 7) { + } else if (n == 6) { + thread->debugreg6 = val; + rc = 0; + } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); if (!rc) thread->ptrace_dr7 = val; } - -ret_path: return rc; } -- cgit v1.2.3 From 616f6a50417fb2e5a64d456798096638983e47e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:05 +1000 Subject: ptrace: PTRACE_DETACH should do flush_ptrace_hw_breakpoint(child) Change ptrace_detach() to call flush_ptrace_hw_breakpoint(child). This frees the slots for non-ptrace PERF_TYPE_BREAKPOINT users, and this ensures that the tracee won't be killed by SIGTRAP triggered by the active breakpoints. Test-case: unsigned long encode_dr7(int drnum, int enable, unsigned int type, unsigned int len) { unsigned long dr7; dr7 = ((len | type) & 0xf) << (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); if (enable) dr7 |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); return dr7; } int write_dr(int pid, int dr, unsigned long val) { return ptrace(PTRACE_POKEUSER, pid, offsetof (struct user, u_debugreg[dr]), val); } void func(void) { } int main(void) { int pid, stat; unsigned long dr7; pid = fork(); if (!pid) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); kill(getpid(), SIGHUP); func(); return 0x13; } assert(pid == waitpid(-1, &stat, 0)); assert(WSTOPSIG(stat) == SIGHUP); assert(write_dr(pid, 0, (long)func) == 0); dr7 = encode_dr7(0, 1, DR_RW_EXECUTE, DR_LEN_1); assert(write_dr(pid, 7, dr7) == 0); assert(ptrace(PTRACE_DETACH, pid, 0,0) == 0); assert(pid == waitpid(-1, &stat, 0)); assert(stat == 0x1300); return 0; } Before this patch the child is killed after PTRACE_DETACH. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8cc170b157d7..0ea7f225ba95 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -469,6 +469,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + flush_ptrace_hw_breakpoint(child); write_lock_irq(&tasklist_lock); /* -- cgit v1.2.3 From 3e966591baad1c2656ce37760fceaefd26ec519a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:06 +1000 Subject: ptrace/x86: flush_ptrace_hw_breakpoint() shoule clear the virtual debug registers flush_ptrace_hw_breakpoint() destroys the counters set by ptrace, but "leaks" ->debugreg6 and ->ptrace_dr7. The problem is minor, but still it doesn't look right and flush_thread() did this until 66cb5917 ("hw-breakpoints: use the new wrapper routines to access debug registers in process/thread code"). Now that PTRACE_DETACH does flush_ too this makes even more sense. Signed-off-by: Oleg Nesterov Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/x86/kernel/hw_breakpoint.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 02f07634d265..f66ff162dce8 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -393,6 +393,9 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) unregister_hw_breakpoint(t->ptrace_bps[i]); t->ptrace_bps[i] = NULL; } + + t->debugreg6 = 0; + t->ptrace_dr7 = 0; } void hw_breakpoint_restore(void) -- cgit v1.2.3 From bc334e25ccf0dd25c389a72a47375df903bccd78 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 May 2013 10:38:06 +1000 Subject: x86: kill TIF_DEBUG Because it is not used. Signed-off-by: Oleg Nesterov Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton --- arch/x86/include/asm/thread_info.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a1df6e84691f..27811190cbd7 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -89,7 +89,6 @@ struct thread_info { #define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ -#define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ @@ -113,7 +112,6 @@ struct thread_info { #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) -#define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) @@ -154,7 +152,7 @@ struct thread_info { (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define PREEMPT_ACTIVE 0x10000000 -- cgit v1.2.3 From 5963859dbc88060fbd7be8471c4718bf69162c2f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 23 May 2013 10:38:06 +1000 Subject: fork: reorder permissions when violating number of processes limits When a task is attempting to violate the RLIMIT_NPROC limit we have a check to see if the task is sufficiently priviledged. The check first looks at CAP_SYS_ADMIN, then CAP_SYS_RESOURCE, then if the task is uid=0. A result is that tasks which are allowed by the uid=0 check are first checked against the security subsystem. This results in the security subsystem auditting a denial for sys_admin and sys_resource and then the task passing the uid=0 check. This patch rearranges the code to first check uid=0, since if we pass that we shouldn't hit the security system at all. We then check sys_resource, since it is the smallest capability which will solve the problem. Lastly we check the fallback everything cap_sysadmin. We don't want to give this capability many places since it is so powerful. This will eliminate many of the false positive/needless denial messages we get when a root task tries to violate the nproc limit. (note that kthreads count against root, so on a sufficiently large machine we can actually get past the default limits before any userspace tasks are launched.) Signed-off-by: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 6fcc2e24561e..7fe4203cfe68 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1195,8 +1195,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { - if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->real_cred->user != INIT_USER) + if (p->real_cred->user != INIT_USER && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; } current->flags &= ~PF_NPROC_EXCEEDED; -- cgit v1.2.3 From 6401e5a278c7d91ad621671abf22312166265bb0 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 23 May 2013 10:38:07 +1000 Subject: idr: print a stack dump after ida_remove warning We print a dump stack after idr_remove warning. This is useful to find the faulty piece of code. Let's do the same for ida_remove, as it would be equally useful there. Signed-off-by: Jean Delvare Cc: Tejun Heo Cc: Takashi Iwai Signed-off-by: Andrew Morton --- lib/idr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/idr.c b/lib/idr.c index cca4b9302a71..cee828827f60 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -1066,6 +1066,7 @@ void ida_remove(struct ida *ida, int id) err: printk(KERN_WARNING "ida_remove called for id=%d which is not allocated.\n", id); + dump_stack(); } EXPORT_SYMBOL(ida_remove); -- cgit v1.2.3 From 7a6bfa7513eeb0026d671873bd828ff869cbb0ae Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:38:07 +1000 Subject: idr-print-a-stack-dump-after-ida_remove-warning-fix convert the open-coded printk+dump_stack into WARN() Cc: Jean Delvare Cc: Takashi Iwai Cc: Tejun Heo Signed-off-by: Andrew Morton --- lib/idr.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/lib/idr.c b/lib/idr.c index cee828827f60..bfe4db4e165f 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -524,9 +524,7 @@ EXPORT_SYMBOL(idr_alloc_cyclic); static void idr_remove_warning(int id) { - printk(KERN_WARNING - "idr_remove called for id=%d which is not allocated.\n", id); - dump_stack(); + WARN(1, "idr_remove called for id=%d which is not allocated.\n", id); } static void sub_remove(struct idr *idp, int shift, int id) @@ -1064,9 +1062,7 @@ void ida_remove(struct ida *ida, int id) return; err: - printk(KERN_WARNING - "ida_remove called for id=%d which is not allocated.\n", id); - dump_stack(); + WARN(1, "ida_remove called for id=%d which is not allocated.\n", id); } EXPORT_SYMBOL(ida_remove); -- cgit v1.2.3 From 10a02f995580faf2174381f77de1852fb4de7852 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 May 2013 10:38:08 +1000 Subject: mwave: fix info leak in mwave_ioctl() Smatch complains that on 64 bit systems, there is a hole in the MW_ABILITIES struct between ->component_count and ->component_list[]. It leaks stack information from the mwave_ioctl() function. I've added a memset() to initialize the struct to zero. Signed-off-by: Dan Carpenter Cc: Greg KH Cc: Jiri Kosina Signed-off-by: Andrew Morton --- drivers/char/mwave/tp3780i.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/mwave/tp3780i.c b/drivers/char/mwave/tp3780i.c index c68969708068..04e6d6a27994 100644 --- a/drivers/char/mwave/tp3780i.c +++ b/drivers/char/mwave/tp3780i.c @@ -479,6 +479,7 @@ int tp3780I_QueryAbilities(THINKPAD_BD_DATA * pBDData, MW_ABILITIES * pAbilities PRINTK_2(TRACE_TP3780I, "tp3780i::tp3780I_QueryAbilities entry pBDData %p\n", pBDData); + memset(pAbilities, 0, sizeof(*pAbilities)); /* fill out standard constant fields */ pAbilities->instr_per_sec = pBDData->rDspSettings.uIps; pAbilities->data_size = pBDData->rDspSettings.uDStoreSize; -- cgit v1.2.3 From 4f8fe98841b37a7b8b96c41a3134bf14a609e6a1 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Thu, 23 May 2013 10:38:08 +1000 Subject: partitions/msdos.c: end-of-line whitespace and semicolon cleanup Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton --- block/partitions/msdos.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 7681cd295ab8..9bf19e6fd949 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -90,7 +90,7 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; put_dev_sector(sect); - }; + } return ret; } @@ -142,7 +142,7 @@ static void parse_extended(struct parsed_partitions *state, return; if (!msdos_magic_present(data + 510)) - goto done; + goto done; p = (struct partition *) (data + 0x1be); @@ -155,7 +155,7 @@ static void parse_extended(struct parsed_partitions *state, * and OS/2 seems to use all four entries. */ - /* + /* * First process the data partition(s) */ for (i=0; i<4; i++, p++) { @@ -263,7 +263,7 @@ static void parse_solaris_x86(struct parsed_partitions *state, } #if defined(CONFIG_BSD_DISKLABEL) -/* +/* * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ @@ -294,7 +294,7 @@ static void parse_bsd(struct parsed_partitions *state, if (state->next == state->limit) break; - if (p->p_fstype == BSD_FS_UNUSED) + if (p->p_fstype == BSD_FS_UNUSED) continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); @@ -441,7 +441,7 @@ static struct { {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, {0, NULL}, }; - + int msdos_partition(struct parsed_partitions *state) { sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; -- cgit v1.2.3 From 3b8c056a332327a1d2500f2c63264fb6e67d9202 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Thu, 23 May 2013 10:38:08 +1000 Subject: partitions: add aix lvm partition support files Add partitions/aix.h and partitions/aix.c. AIX LVM permits to make "logical volumes" which are made of multiple slices of multiple disks. The new code allows only access to the "logical volumes" which are made of one slice on the probed disk, a slice being a contiguous disk area. The code also detects "logical volumes" made of multiple slices on the probed disk, but can not describe them to the partition layer, because the partition layer generic code does not support that. When such non-contiguous "logical volumes" are detected, a diagnostic message is printed. Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton --- block/partitions/aix.c | 290 +++++++++++++++++++++++++++++++++++++++++++++++++ block/partitions/aix.h | 1 + 2 files changed, 291 insertions(+) create mode 100644 block/partitions/aix.c create mode 100644 block/partitions/aix.h diff --git a/block/partitions/aix.c b/block/partitions/aix.c new file mode 100644 index 000000000000..b3288d24dacf --- /dev/null +++ b/block/partitions/aix.c @@ -0,0 +1,290 @@ +/* + * fs/partitions/aix.c + * + * Copyright (C) 2012-2013 Philippe De Muyter + */ + +#include "check.h" +#include "aix.h" + +struct lvm_rec { + char lvm_id[4]; /* "_LVM" */ + char reserved4[16]; + __be32 lvmarea_len; + __be32 vgda_len; + __be32 vgda_psn[2]; + char reserved36[10]; + __be16 pp_size; /* log2(pp_size) */ + char reserved46[12]; + __be16 version; + }; + +struct vgda { + __be32 secs; + __be32 usec; + char reserved8[16]; + __be16 numlvs; + __be16 maxlvs; + __be16 pp_size; + __be16 numpvs; + __be16 total_vgdas; + __be16 vgda_size; + }; + +struct lvd { + __be16 lv_ix; + __be16 res2; + __be16 res4; + __be16 maxsize; + __be16 lv_state; + __be16 mirror; + __be16 mirror_policy; + __be16 num_lps; + __be16 res10[8]; + }; + +struct lvname { + char name[64]; + }; + +struct ppe { + __be16 lv_ix; + unsigned short res2; + unsigned short res4; + __be16 lp_ix; + unsigned short res8[12]; + }; + +struct pvd { + char reserved0[16]; + __be16 pp_count; + char reserved18[2]; + __be32 psn_part1; + char reserved24[8]; + struct ppe ppe[1016]; + }; + +#define LVM_MAXLVS 256 + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return (bdev->bd_inode->i_size >> 9) - 1ULL; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state + * @lba + * @buffer + * @count + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. + */ +static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 * buffer, size_t count) +{ + size_t totalreadcount = 0; + + if (!buffer || lba + count / 512 > last_lba(state->bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, lba++, §); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount +=copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_pvd(): reads physical volume descriptor + * @state + * @lba + * + * Description: Returns pvd on success, NULL on error. + * Allocates space for pvd and fill it with disk blocks at @lba + * Notes: remember to free pvd when you're done! + */ +static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct pvd); + struct pvd *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +/** + * alloc_lvn(): reads logical volume names + * @state + * @lba + * + * Description: Returns lvn on success, NULL on error. + * Allocates space for lvn and fill it with disk blocks at @lba + * Notes: remember to free lvn when you're done! + */ +static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct lvname) * LVM_MAXLVS; + struct lvname *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +int aix_partition(struct parsed_partitions *state) +{ + int ret = 0; + Sector sect; + unsigned char *d; + u32 pp_bytes_size; + u32 pp_blocks_size = 0; + u32 vgda_sector = 0; + u32 vgda_len = 0; + int numlvs = 0; + struct pvd *pvd; + struct lv_info { + unsigned short pps_per_lv; + unsigned short pps_found; + unsigned char lv_is_contiguous; + } *lvip; + struct lvname *n = NULL; + + d = read_part_sector(state, 7, §); + if (d) { + struct lvm_rec *p = (struct lvm_rec *)d; + u16 lvm_version = be16_to_cpu(p->version); + char tmp[64]; + + if (lvm_version == 1) { + int pp_size_log2 = be16_to_cpu(p->pp_size); + + pp_bytes_size = 1 << pp_size_log2; + pp_blocks_size = pp_bytes_size / 512; + snprintf(tmp, sizeof(tmp), + " AIX LVM header version %u found\n", + lvm_version); + vgda_len = be32_to_cpu(p->vgda_len); + vgda_sector = be32_to_cpu(p->vgda_psn[0]); + } else { + snprintf(tmp, sizeof(tmp), + " unsupported AIX LVM version %d found\n", + lvm_version); + } + strlcat(state->pp_buf, tmp, PAGE_SIZE); + put_dev_sector(sect); + } + if (vgda_sector && (d = read_part_sector(state, vgda_sector, §))) { + struct vgda *p = (struct vgda *)d; + + numlvs = be16_to_cpu(p->numlvs); + put_dev_sector(sect); + } + lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL); + if (!lvip) + return 0; + if (numlvs && (d = read_part_sector(state, vgda_sector + 1, §))) { + struct lvd *p = (struct lvd *)d; + int i; + + n = alloc_lvn(state, vgda_sector + vgda_len - 33); + if (n) { + int foundlvs = 0; + + for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) { + lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps); + if (lvip[i].pps_per_lv) + foundlvs += 1; + } + } + put_dev_sector(sect); + } + pvd = alloc_pvd(state, vgda_sector + 17); + if (pvd) { + int numpps = be16_to_cpu(pvd->pp_count); + int psn_part1 = be32_to_cpu(pvd->psn_part1); + int i; + int cur_lv_ix = -1; + int next_lp_ix = 1; + int lp_ix; + + for (i = 0; i < numpps; i += 1) { + struct ppe *p = pvd->ppe + i; + unsigned int lv_ix; + + lp_ix = be16_to_cpu(p->lp_ix); + if (!lp_ix) { + next_lp_ix = 1; + continue; + } + lv_ix = be16_to_cpu(p->lv_ix) - 1; + if (lv_ix > state->limit) { + cur_lv_ix = -1; + continue; + } + lvip[lv_ix].pps_found += 1; + if (lp_ix != next_lp_ix) + continue; + if (lp_ix == 1) + cur_lv_ix = lv_ix; + else if (lv_ix != cur_lv_ix) + next_lp_ix = 1; + if (lp_ix == lvip[lv_ix].pps_per_lv) { + char tmp[70]; + + put_partition(state, lv_ix + 1, + (i + 1 - lp_ix) * pp_blocks_size + psn_part1, + lvip[lv_ix].pps_per_lv * pp_blocks_size); + snprintf(tmp, sizeof(tmp), " <%s>\n", n[lv_ix].name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + lvip[lv_ix].lv_is_contiguous = 1; + ret = 1; + next_lp_ix = 1; + } else + next_lp_ix += 1; + } + for (i = 0; i < state->limit; i += 1) + if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) + printk("partition %s (%u pp's found) is not contiguous\n", + n[i].name, lvip[i].pps_found); + kfree(pvd); + } + if (n) + kfree(n); + kfree(lvip); + return ret; +} diff --git a/block/partitions/aix.h b/block/partitions/aix.h new file mode 100644 index 000000000000..e0c66a987523 --- /dev/null +++ b/block/partitions/aix.h @@ -0,0 +1 @@ +extern int aix_partition(struct parsed_partitions *state); -- cgit v1.2.3 From e5cab9344f10a5aef142788835eba78a446ef067 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Thu, 23 May 2013 10:38:09 +1000 Subject: partitions-add-aix-lvm-partition-support-files-v2 Fix a problem in the discovering of small (1 pp) partitions in presence of discontiguous partitions. Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton --- block/partitions/aix.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/partitions/aix.c b/block/partitions/aix.c index b3288d24dacf..ef46cf3594f8 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -257,12 +257,13 @@ int aix_partition(struct parsed_partitions *state) continue; } lvip[lv_ix].pps_found += 1; - if (lp_ix != next_lp_ix) - continue; - if (lp_ix == 1) + if (lp_ix == 1) { cur_lv_ix = lv_ix; - else if (lv_ix != cur_lv_ix) next_lp_ix = 1; + } else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) { + next_lp_ix = 1; + continue; + } if (lp_ix == lvip[lv_ix].pps_per_lv) { char tmp[70]; -- cgit v1.2.3 From 5e4f6b47631bcfeffebab613417c87d2856bba85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:38:09 +1000 Subject: partitions-add-aix-lvm-partition-support-files-checkpatch-fixes WARNING: line over 80 characters #119: FILE: block/partitions/aix.c:95: +static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 * buffer, size_t count) ERROR: "foo * bar" should be "foo *bar" #119: FILE: block/partitions/aix.c:95: +static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 * buffer, size_t count) ERROR: code indent should use tabs where possible #124: FILE: block/partitions/aix.c:100: + return 0;$ WARNING: please, no spaces at the start of a line #124: FILE: block/partitions/aix.c:100: + return 0;$ WARNING: Avoid CamelCase: #128: FILE: block/partitions/aix.c:104: + Sector sect; ERROR: spaces required around that '+=' (ctx:WxV) #137: FILE: block/partitions/aix.c:113: + totalreadcount +=copied; ^ ERROR: do not use assignment in if condition #235: FILE: block/partitions/aix.c:211: + if (vgda_sector && (d = read_part_sector(state, vgda_sector, §))) { ERROR: do not use assignment in if condition #244: FILE: block/partitions/aix.c:220: + if (numlvs && (d = read_part_sector(state, vgda_sector + 1, §))) { WARNING: line over 80 characters #252: FILE: block/partitions/aix.c:228: + for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) { WARNING: line over 80 characters #294: FILE: block/partitions/aix.c:270: + (i + 1 - lp_ix) * pp_blocks_size + psn_part1, WARNING: line over 80 characters #295: FILE: block/partitions/aix.c:271: + lvip[lv_ix].pps_per_lv * pp_blocks_size); WARNING: line over 80 characters #296: FILE: block/partitions/aix.c:272: + snprintf(tmp, sizeof(tmp), " <%s>\n", n[lv_ix].name); WARNING: printk() should include KERN_ facility level #306: FILE: block/partitions/aix.c:282: + printk("partition %s (%u pp's found) is not contiguous\n", WARNING: kfree(NULL) is safe this check is probably not required #311: FILE: block/partitions/aix.c:287: + if (n) + kfree(n); total: 5 errors, 9 warnings, 291 lines checked NOTE: whitespace errors detected, you may wish to use scripts/cleanpatch or scripts/cleanfile ./patches/partitions-add-aix-lvm-partition-support-files.patch has style problems, please review. If any of these errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Philippe De Muyter Signed-off-by: Andrew Morton --- block/partitions/aix.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/block/partitions/aix.c b/block/partitions/aix.c index ef46cf3594f8..43be471d9b1d 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -92,12 +92,13 @@ static u64 last_lba(struct block_device *bdev) * Description: Reads @count bytes from @state->bdev into @buffer. * Returns number of bytes read on success, 0 on error. */ -static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 * buffer, size_t count) +static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, + size_t count) { size_t totalreadcount = 0; if (!buffer || lba + count / 512 > last_lba(state->bdev)) - return 0; + return 0; while (count) { int copied = 512; @@ -110,7 +111,7 @@ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 * buffer, si memcpy(buffer, data, copied); put_dev_sector(sect); buffer += copied; - totalreadcount +=copied; + totalreadcount += copied; count -= copied; } return totalreadcount; @@ -268,9 +269,10 @@ int aix_partition(struct parsed_partitions *state) char tmp[70]; put_partition(state, lv_ix + 1, - (i + 1 - lp_ix) * pp_blocks_size + psn_part1, - lvip[lv_ix].pps_per_lv * pp_blocks_size); - snprintf(tmp, sizeof(tmp), " <%s>\n", n[lv_ix].name); + (i + 1 - lp_ix) * pp_blocks_size + psn_part1, + lvip[lv_ix].pps_per_lv * pp_blocks_size); + snprintf(tmp, sizeof(tmp), " <%s>\n", + n[lv_ix].name); strlcat(state->pp_buf, tmp, PAGE_SIZE); lvip[lv_ix].lv_is_contiguous = 1; ret = 1; @@ -280,12 +282,12 @@ int aix_partition(struct parsed_partitions *state) } for (i = 0; i < state->limit; i += 1) if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) - printk("partition %s (%u pp's found) is not contiguous\n", + pr_warn("partition %s (%u pp's found) is " + "not contiguous\n", n[i].name, lvip[i].pps_found); kfree(pvd); } - if (n) - kfree(n); + kfree(n); kfree(lvip); return ret; } -- cgit v1.2.3 From 79c0f6f4060629fcce7385341447dcbc39f7c4ca Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Thu, 23 May 2013 10:38:09 +1000 Subject: partitions-add-aix-lvm-partition-support-files: compile aix.c if configured Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton --- block/partitions/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/block/partitions/Makefile b/block/partitions/Makefile index bf26d4a61e4c..d79da0be0b2d 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BLOCK) := check.o obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o +obj-$(CONFIG_AIX_PARTITION) += aix.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o -- cgit v1.2.3 From 889145e5bde7a6b6f3d5a2bfac556945fedeedb6 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Thu, 23 May 2013 10:38:10 +1000 Subject: partitions-add-aix-lvm-partition-support-files: add the AIX_PARTITION entry This is the final patch enabling a user to select AIX lvm partitions detection. Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton --- block/partitions/Kconfig | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 75a54e1adbb5..4cebb2f0d2f4 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -68,6 +68,17 @@ config ACORN_PARTITION_RISCIX of machines called RISCiX. If you say 'Y' here, Linux will be able to read disks partitioned under RISCiX. +config AIX_PARTITION + bool "AIX basic partition table support" if PARTITION_ADVANCED + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM or Motorola PowerPC machines + running AIX. AIX actually uses a Logical Volume Manager, where + "logical volumes" can be spread across one or multiple disks, + but this driver works only for the simple case of partitions which + are contiguous. + Otherwise, say N. + config OSF_PARTITION bool "Alpha OSF partition support" if PARTITION_ADVANCED default y if ALPHA -- cgit v1.2.3 From 6761f5ee9ed265edc250086508ef9b537b5a7462 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Thu, 23 May 2013 10:38:10 +1000 Subject: partitions/msdos: enumerate also AIX LVM partitions Graft AIX partitions enumeration into partitions/msdos.c There is already a AIX disks detection logic in msdos.c. When an AIX disk has been found, and if configured to, call the aix partitions recognizer. This avoids removal of AIX disks protection from msdos.c, avoids code duplication, and ensures that AIX partitions enumeration is called before plain msdos partitions enumeration. Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton --- block/partitions/msdos.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 9bf19e6fd949..9123f250b425 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -23,6 +23,7 @@ #include "check.h" #include "msdos.h" #include "efi.h" +#include "aix.h" /* * Many architectures don't like unaligned accesses, while @@ -462,8 +463,12 @@ int msdos_partition(struct parsed_partitions *state) */ if (aix_magic_present(state, data)) { put_dev_sector(sect); +#ifdef CONFIG_AIX_PARTITION + return aix_partition(state); +#else strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); return 0; +#endif } if (!msdos_magic_present(data + 510)) { -- cgit v1.2.3 From af7ce783fc7a5d6131c145e786c749aa814754a5 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Thu, 23 May 2013 10:38:10 +1000 Subject: rapidio/switches: remove tsi500 driver Remove the driver for Tsi500 Parallel RapidIO switch because this device has not been available for several years. Since the first introduction of Tsi500, the parallel RapidIO interface was replaced by the serial RapidIO (sRIO) and therefore there is no value in keeping this driver. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Li Yang Cc: Kumar Gala Signed-off-by: Andrew Morton --- drivers/rapidio/switches/Kconfig | 7 ---- drivers/rapidio/switches/Makefile | 1 - drivers/rapidio/switches/tsi500.c | 78 --------------------------------------- 3 files changed, 86 deletions(-) delete mode 100644 drivers/rapidio/switches/tsi500.c diff --git a/drivers/rapidio/switches/Kconfig b/drivers/rapidio/switches/Kconfig index f47fee5d4563..62d4a065230f 100644 --- a/drivers/rapidio/switches/Kconfig +++ b/drivers/rapidio/switches/Kconfig @@ -26,10 +26,3 @@ config RAPIDIO_CPS_GEN2 default n ---help--- Includes support for ITD CPS Gen.2 serial RapidIO switches. - -config RAPIDIO_TSI500 - bool "Tsi500 Parallel RapidIO switch support" - depends on RAPIDIO - default n - ---help--- - Includes support for IDT Tsi500 parallel RapidIO switch. diff --git a/drivers/rapidio/switches/Makefile b/drivers/rapidio/switches/Makefile index c4d3acc3c715..051cc6b38188 100644 --- a/drivers/rapidio/switches/Makefile +++ b/drivers/rapidio/switches/Makefile @@ -5,5 +5,4 @@ obj-$(CONFIG_RAPIDIO_TSI57X) += tsi57x.o obj-$(CONFIG_RAPIDIO_CPS_XX) += idtcps.o obj-$(CONFIG_RAPIDIO_TSI568) += tsi568.o -obj-$(CONFIG_RAPIDIO_TSI500) += tsi500.o obj-$(CONFIG_RAPIDIO_CPS_GEN2) += idt_gen2.o diff --git a/drivers/rapidio/switches/tsi500.c b/drivers/rapidio/switches/tsi500.c deleted file mode 100644 index 914eddd5aa42..000000000000 --- a/drivers/rapidio/switches/tsi500.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * RapidIO Tsi500 switch support - * - * Copyright 2009-2010 Integrated Device Technology, Inc. - * Alexandre Bounine - * - Modified switch operations initialization. - * - * Copyright 2005 MontaVista Software, Inc. - * Matt Porter - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - */ - -#include -#include -#include -#include "../rio.h" - -static int -tsi500_route_add_entry(struct rio_mport *mport, u16 destid, u8 hopcount, u16 table, u16 route_destid, u8 route_port) -{ - int i; - u32 offset = 0x10000 + 0xa00 + ((route_destid / 2)&~0x3); - u32 result; - - if (table == 0xff) { - rio_mport_read_config_32(mport, destid, hopcount, offset, &result); - result &= ~(0xf << (4*(route_destid & 0x7))); - for (i=0;i<4;i++) - rio_mport_write_config_32(mport, destid, hopcount, offset + (0x20000*i), result | (route_port << (4*(route_destid & 0x7)))); - } - else { - rio_mport_read_config_32(mport, destid, hopcount, offset + (0x20000*table), &result); - result &= ~(0xf << (4*(route_destid & 0x7))); - rio_mport_write_config_32(mport, destid, hopcount, offset + (0x20000*table), result | (route_port << (4*(route_destid & 0x7)))); - } - - return 0; -} - -static int -tsi500_route_get_entry(struct rio_mport *mport, u16 destid, u8 hopcount, u16 table, u16 route_destid, u8 *route_port) -{ - int ret = 0; - u32 offset = 0x10000 + 0xa00 + ((route_destid / 2)&~0x3); - u32 result; - - if (table == 0xff) - rio_mport_read_config_32(mport, destid, hopcount, offset, &result); - else - rio_mport_read_config_32(mport, destid, hopcount, offset + (0x20000*table), &result); - - result &= 0xf << (4*(route_destid & 0x7)); - *route_port = result >> (4*(route_destid & 0x7)); - if (*route_port > 3) - ret = -1; - - return ret; -} - -static int tsi500_switch_init(struct rio_dev *rdev, int do_enum) -{ - pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev)); - rdev->rswitch->add_entry = tsi500_route_add_entry; - rdev->rswitch->get_entry = tsi500_route_get_entry; - rdev->rswitch->clr_table = NULL; - rdev->rswitch->set_domain = NULL; - rdev->rswitch->get_domain = NULL; - rdev->rswitch->em_init = NULL; - rdev->rswitch->em_handle = NULL; - - return 0; -} - -DECLARE_RIO_SWITCH_INIT(RIO_VID_TUNDRA, RIO_DID_TSI500, tsi500_switch_init); -- cgit v1.2.3 From 74a3fde49b1a90d10ba9468c6b5074cf5b2707a8 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghiu Date: Thu, 23 May 2013 10:38:11 +1000 Subject: drivers/parport/share.c: use kzalloc Replaced calls to kmalloc and memset with kzalloc. Patch found using coccinelle. Signed-off-by: Alexandru Gheorghiu Signed-off-by: Andrew Morton --- drivers/parport/share.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/parport/share.c b/drivers/parport/share.c index a848e02e6be3..6a83ee1e9178 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -282,14 +282,13 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma, int device; char *name; - tmp = kmalloc(sizeof(struct parport), GFP_KERNEL); + tmp = kzalloc(sizeof(struct parport), GFP_KERNEL); if (!tmp) { printk(KERN_WARNING "parport: memory squeeze\n"); return NULL; } /* Init our structure */ - memset(tmp, 0, sizeof(struct parport)); tmp->base = base; tmp->irq = irq; tmp->dma = dma; -- cgit v1.2.3 From 8fe6c2eb85da360434ff963d67b7595d4c584e09 Mon Sep 17 00:00:00 2001 From: Jan Luebbe Date: Thu, 23 May 2013 10:38:11 +1000 Subject: drivers/pps/clients/pps-gpio.c: convert to devm_* helpers Signed-off-by: Jan Luebbe Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton --- drivers/pps/clients/pps-gpio.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c index d3db26e46489..54c2809021a3 100644 --- a/drivers/pps/clients/pps-gpio.c +++ b/drivers/pps/clients/pps-gpio.c @@ -74,7 +74,7 @@ static int pps_gpio_setup(struct platform_device *pdev) int ret; const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data; - ret = gpio_request(pdata->gpio_pin, pdata->gpio_label); + ret = devm_gpio_request(&pdev->dev, pdata->gpio_pin, pdata->gpio_label); if (ret) { pr_warning("failed to request GPIO %u\n", pdata->gpio_pin); return -EINVAL; @@ -83,7 +83,6 @@ static int pps_gpio_setup(struct platform_device *pdev) ret = gpio_direction_input(pdata->gpio_pin); if (ret) { pr_warning("failed to set pin direction\n"); - gpio_free(pdata->gpio_pin); return -EINVAL; } @@ -109,7 +108,6 @@ static int pps_gpio_probe(struct platform_device *pdev) struct pps_gpio_device_data *data; int irq; int ret; - int err; int pps_default_params; const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data; @@ -123,17 +121,14 @@ static int pps_gpio_probe(struct platform_device *pdev) irq = gpio_to_irq(pdata->gpio_pin); if (irq < 0) { pr_err("failed to map GPIO to IRQ: %d\n", irq); - err = -EINVAL; - goto return_error; + return -EINVAL; } /* allocate space for device info */ data = devm_kzalloc(&pdev->dev, sizeof(struct pps_gpio_device_data), GFP_KERNEL); - if (data == NULL) { - err = -ENOMEM; - goto return_error; - } + if (data == NULL) + return -ENOMEM; /* initialize PPS specific parts of the bookkeeping data structure. */ data->info.mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | @@ -152,41 +147,32 @@ static int pps_gpio_probe(struct platform_device *pdev) data->pps = pps_register_source(&data->info, pps_default_params); if (data->pps == NULL) { pr_err("failed to register IRQ %d as PPS source\n", irq); - err = -EINVAL; - goto return_error; + return -EINVAL; } data->irq = irq; data->pdata = pdata; /* register IRQ interrupt handler */ - ret = request_irq(irq, pps_gpio_irq_handler, + ret = devm_request_irq(&pdev->dev, irq, pps_gpio_irq_handler, get_irqf_trigger_flags(pdata), data->info.name, data); if (ret) { pps_unregister_source(data->pps); pr_err("failed to acquire IRQ %d\n", irq); - err = -EINVAL; - goto return_error; + return -EINVAL; } platform_set_drvdata(pdev, data); dev_info(data->pps->dev, "Registered IRQ %d as PPS source\n", irq); return 0; - -return_error: - gpio_free(pdata->gpio_pin); - return err; } static int pps_gpio_remove(struct platform_device *pdev) { struct pps_gpio_device_data *data = platform_get_drvdata(pdev); - const struct pps_gpio_platform_data *pdata = data->pdata; platform_set_drvdata(pdev, NULL); - free_irq(data->irq, data); - gpio_free(pdata->gpio_pin); pps_unregister_source(data->pps); pr_info("removed IRQ %d as PPS source\n", data->irq); return 0; -- cgit v1.2.3 From aa66f1f2f46a491ecb9d6cfd47e2dc24071aeec3 Mon Sep 17 00:00:00 2001 From: Jan Luebbe Date: Thu, 23 May 2013 10:38:11 +1000 Subject: drivers/pps/clients/pps-gpio.c: convert to module_platform_driver This removes some boilerplate code (no functional changes). Signed-off-by: Jan Luebbe Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton --- drivers/pps/clients/pps-gpio.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c index 54c2809021a3..5bd3bf093b54 100644 --- a/drivers/pps/clients/pps-gpio.c +++ b/drivers/pps/clients/pps-gpio.c @@ -187,23 +187,7 @@ static struct platform_driver pps_gpio_driver = { }, }; -static int __init pps_gpio_init(void) -{ - int ret = platform_driver_register(&pps_gpio_driver); - if (ret < 0) - pr_err("failed to register platform driver\n"); - return ret; -} - -static void __exit pps_gpio_exit(void) -{ - platform_driver_unregister(&pps_gpio_driver); - pr_debug("unregistered platform driver\n"); -} - -module_init(pps_gpio_init); -module_exit(pps_gpio_exit); - +module_platform_driver(pps_gpio_driver); MODULE_AUTHOR("Ricardo Martins "); MODULE_AUTHOR("James Nuss "); MODULE_DESCRIPTION("Use GPIO pin as PPS source"); -- cgit v1.2.3 From d6d95fa4c0269d99b953902c71688b07213a1d08 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dagenais Date: Thu, 23 May 2013 10:38:12 +1000 Subject: drivers/w1/slaves/w1_ds2408.c: add magic sequence to disable P0 test mode Power-up timing The DS2408 is sensitive to the power-on slew rate and can inadvertently power up with a test mode feature enabled. When this occurs, the P0 port does not respond to the Channel Access Write command. For most reliable operation, it is recommended to disable the test mode after every power-on reset using the Disable Test Mode sequence shown below. The 64-bit ROM code must be transmitted in the same bit sequence as with the Match ROM command, i.e., least significant bit first. This precaution is recommended in parasite power mode (VCC pin connected to GND) as well as with VCC power. Disable Test Mode: RST,PD,96h,<64-bit DS2408 ROM Code>,3Ch,RST,PD Signed-off-by: Jean-Francois Dagenais Cc: Evgeniy Polyakov Cc: Greg KH Signed-off-by: Andrew Morton --- drivers/w1/slaves/w1_ds2408.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/w1/slaves/w1_ds2408.c b/drivers/w1/slaves/w1_ds2408.c index 91cc2cdf02c0..01fe4483778f 100644 --- a/drivers/w1/slaves/w1_ds2408.c +++ b/drivers/w1/slaves/w1_ds2408.c @@ -302,7 +302,32 @@ error: return -EIO; } +/** + * This is a special sequence we must do to ensure the P0 output is not stuck + * in test mode. This is described in rev 2 of the ds2408's datasheet + * (http://datasheets.maximintegrated.com/en/ds/DS2408.pdf) under + * "APPLICATION INFORMATION/Power-up timing". + */ +static int w1_f29_disable_test_mode(struct w1_slave *sl) +{ + int res; + u8 magic[10] = {0x96, }; + u64 rn = le64_to_cpu(*((u64*)&sl->reg_num)); + memcpy(&magic[1], &rn, 8); + magic[9] = 0x3C; + + mutex_lock(&sl->master->bus_mutex); + res = w1_reset_bus(sl->master); + if (res) + goto out; + w1_write_block(sl->master, magic, ARRAY_SIZE(magic)); + + res = w1_reset_bus(sl->master); +out: + mutex_unlock(&sl->master->bus_mutex); + return res; +} static struct bin_attribute w1_f29_sysfs_bin_files[] = { { @@ -363,6 +388,10 @@ static int w1_f29_add_slave(struct w1_slave *sl) int err = 0; int i; + err = w1_f29_disable_test_mode(sl); + if (err) + return err; + for (i = 0; i < ARRAY_SIZE(w1_f29_sysfs_bin_files) && !err; ++i) err = sysfs_create_bin_file( &sl->dev.kobj, -- cgit v1.2.3 From 2f5b9b6d6856bd29a8e0c97834590d60cbb98c87 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:38:12 +1000 Subject: drivers-w1-slaves-w1_ds2408c-add-magic-sequence-to-disable-p0-test-mode-fix don't use kerenldoc token to introduce a non-kerneldoc comment, tweak whitespace Cc: Evgeniy Polyakov Cc: Greg KH Cc: Jean-Francois Dagenais Signed-off-by: Andrew Morton --- drivers/w1/slaves/w1_ds2408.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/w1/slaves/w1_ds2408.c b/drivers/w1/slaves/w1_ds2408.c index 01fe4483778f..cb8a8e5d9573 100644 --- a/drivers/w1/slaves/w1_ds2408.c +++ b/drivers/w1/slaves/w1_ds2408.c @@ -302,7 +302,7 @@ error: return -EIO; } -/** +/* * This is a special sequence we must do to ensure the P0 output is not stuck * in test mode. This is described in rev 2 of the ds2408's datasheet * (http://datasheets.maximintegrated.com/en/ds/DS2408.pdf) under @@ -313,6 +313,7 @@ static int w1_f29_disable_test_mode(struct w1_slave *sl) int res; u8 magic[10] = {0x96, }; u64 rn = le64_to_cpu(*((u64*)&sl->reg_num)); + memcpy(&magic[1], &rn, 8); magic[9] = 0x3C; -- cgit v1.2.3 From 2b5567c0268c81557ecdd97bb12fa8f0015ed95b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 May 2013 10:38:13 +1000 Subject: relay: fix timer madness When I'm using below ktap script to tracing all event tracepoints, without this patch, the system will hang in few seconds, the patch indeed fix the problem as the changelog pointed. function eventfun (e) { printf("%d %d\t%s\t%s", cpu(), pid(), execname(), e.annotate) } kdebug.probe("tp:", eventfun) kdebug.probe_end(function () { printf("probe end\n") }) This patch is old, I can found the original patch discussion in 2007. http://marc.info/?l=linux-kernel&m=118544794717162&w=2 (In that mail thread, the patch didn't fix that problem, but it fix the problem I encountered now) Ingo's original changelog: Remove timer calls (!!!) from deep within the tracing infrastructure. This was totally bogus code that can cause lockups and worse. Poll the buffer every 2 jiffies for now. Signed-off-by: Ingo Molnar Signed-off-by: "zhangwei(Jovi)" Cc: Steven Rostedt Cc: Jens Axboe Cc: Al Viro Cc: Eric Dumazet Signed-off-by: Andrew Morton --- kernel/relay.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index b91488ba2e5a..e03440ba0f20 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -339,6 +339,10 @@ static void wakeup_readers(unsigned long data) { struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); + /* + * Stupid polling for now: + */ + mod_timer(&buf->timer, jiffies + 1); } /** @@ -356,6 +360,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + mod_timer(&buf->timer, jiffies + 1); } else del_timer_sync(&buf->timer); @@ -739,15 +744,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) else buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; - smp_mb(); - if (waitqueue_active(&buf->read_wait)) - /* - * Calling wake_up_interruptible() from here - * will deadlock if we happen to be logging - * from the scheduler (trying to re-grab - * rq->lock), so defer it. - */ - mod_timer(&buf->timer, jiffies + 1); } old = buf->data; -- cgit v1.2.3 From 08adff7b633c38540a9552d0622bf9327031505d Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Thu, 23 May 2013 10:38:19 +1000 Subject: lib: add weak clz/ctz functions Some architectures need __c[lt]z[sd]i2() for __builtin_c[lt]z[ll] and It causes build failure. They can be implemented using the fls()/__ffs() and overridden by linking arch-specific versions may not be implemented yet. This is required by "lib: add lz4 compressor module". Reference: https://lkml.org/lkml/2013/4/18/603 Signed-off-by: Chanho Min Reported-by: Geert Uytterhoeven Cc: "Darrick J. Wong" Cc: Bob Pearson Cc: Richard Weinberger Cc: Herbert Xu Cc: Yann Collet Cc: Kyungsik Lee Signed-off-by: Andrew Morton --- lib/Makefile | 2 +- lib/clz_ctz.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 lib/clz_ctz.c diff --git a/lib/Makefile b/lib/Makefile index 386db4bbc265..19cda4482505 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -23,7 +23,7 @@ lib-y += kobject.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o \ + gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o\ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c new file mode 100644 index 000000000000..a8f8379eb49f --- /dev/null +++ b/lib/clz_ctz.c @@ -0,0 +1,58 @@ +/* + * lib/clz_ctz.c + * + * Copyright (C) 2013 Chanho Min + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * __c[lt]z[sd]i2 can be overridden by linking arch-specific versions. + */ + +#include +#include + +int __weak __ctzsi2(int val) +{ + return __ffs(val); +} +EXPORT_SYMBOL(__ctzsi2); + +int __weak __clzsi2(int val) +{ + return 32 - fls(val); +} +EXPORT_SYMBOL(__clzsi2); + +#if BITS_PER_LONG == 32 + +int __weak __clzdi2(long val) +{ + return 32 - fls((int)val); +} +EXPORT_SYMBOL(__clzdi2); + +int __weak __ctzdi2(long val) +{ + return __ffs((u32)val); +} +EXPORT_SYMBOL(__ctzdi2); + +#elif BITS_PER_LONG == 64 + +int __weak __clzdi2(long val) +{ + return 64 - fls64((u64)val); +} +EXPORT_SYMBOL(__clzdi2); + +int __weak __ctzdi2(long val) +{ + return __ffs64((u64)val); +} +EXPORT_SYMBOL(__ctzdi2); + +#else +#error BITS_PER_LONG not 32 or 64 +#endif -- cgit v1.2.3 From 06ec246bfc4c9b3f8ab6b26553aaec34d0846672 Mon Sep 17 00:00:00 2001 From: Kyungsik Lee Date: Thu, 23 May 2013 10:38:19 +1000 Subject: decompressor: add LZ4 decompressor module Add support for LZ4 decompression in the Linux Kernel. LZ4 Decompression APIs for kernel are based on LZ4 implementation by Yann Collet. Benchmark Results(PATCH v3) Compiler: Linaro ARM gcc 4.6.2 1. ARMv7, 1.5GHz based board Kernel: linux 3.4 Uncompressed Kernel Size: 14MB Compressed Size Decompression Speed LZO 6.7MB 20.1MB/s, 25.2MB/s(UA) LZ4 7.3MB 29.1MB/s, 45.6MB/s(UA) 2. ARMv7, 1.7GHz based board Kernel: linux 3.7 Uncompressed Kernel Size: 14MB Compressed Size Decompression Speed LZO 6.0MB 34.1MB/s, 52.2MB/s(UA) LZ4 6.5MB 86.7MB/s - UA: Unaligned memory Access support - Latest patch set for LZO applied This patch set is for adding support for LZ4-compressed Kernel. LZ4 is a very fast lossless compression algorithm and it also features an extremely fast decoder [1]. But we have five of decompressors already and one question which does arise, however, is that of where do we stop adding new ones? This issue had been discussed and came to the conclusion [2]. Russell King said that we should have: - one decompressor which is the fastest - one decompressor for the highest compression ratio - one popular decompressor (eg conventional gzip) If we have a replacement one for one of these, then it should do exactly that: replace it. The benchmark shows that an 8% increase in image size vs a 66% increase in decompression speed compared to LZO(which has been known as the fastest decompressor in the Kernel). Therefore the "fast but may not be small" compression title has clearly been taken by LZ4 [3]. [1] http://code.google.com/p/lz4/ [2] http://thread.gmane.org/gmane.linux.kbuild.devel/9157 [3] http://thread.gmane.org/gmane.linux.kbuild.devel/9347 LZ4 homepage: http://fastcompression.blogspot.com/p/lz4.html LZ4 source repository: http://code.google.com/p/lz4/ Signed-off-by: Kyungsik Lee Signed-off-by: Yann Collet Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Russell King Cc: Borislav Petkov Cc: Florian Fainelli Signed-off-by: Andrew Morton --- include/linux/lz4.h | 51 ++++++++ lib/lz4/lz4_decompress.c | 326 +++++++++++++++++++++++++++++++++++++++++++++++ lib/lz4/lz4defs.h | 94 ++++++++++++++ 3 files changed, 471 insertions(+) create mode 100644 include/linux/lz4.h create mode 100644 lib/lz4/lz4_decompress.c create mode 100644 lib/lz4/lz4defs.h diff --git a/include/linux/lz4.h b/include/linux/lz4.h new file mode 100644 index 000000000000..7f6c75a093f8 --- /dev/null +++ b/include/linux/lz4.h @@ -0,0 +1,51 @@ +#ifndef __LZ4_H__ +#define __LZ4_H__ +/* + * LZ4 Kernel Interface + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * lz4_compressbound() + * Provides the maximum size that LZ4 may output in a "worst case" scenario + * (input data not compressible) + */ +static inline size_t lz4_compressbound(size_t isize) +{ + return isize + (isize / 255) + 16; +} + +/* + * lz4_decompress() + * src : source address of the compressed data + * src_len : is the input size, whcih is returned after decompress done + * dest : output buffer address of the decompressed data + * actual_dest_len: is the size of uncompressed data, supposing it's known + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + * slightly faster than lz4_decompress_unknownoutputsize() + */ +int lz4_decompress(const char *src, size_t *src_len, char *dest, + size_t actual_dest_len); + +/* + * lz4_decompress_unknownoutputsize() + * src : source address of the compressed data + * src_len : is the input size, therefore the compressed size + * dest : output buffer address of the decompressed data + * dest_len: is the max size of the destination buffer, which is + * returned with actual size of decompressed data after + * decompress done + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + */ +int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, + char *dest, size_t *dest_len); +#endif diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c new file mode 100644 index 000000000000..dcc89753af65 --- /dev/null +++ b/lib/lz4/lz4_decompress.c @@ -0,0 +1,326 @@ +/* + * LZ4 Decompressor for Linux kernel + * + * Copyright (C) 2013 LG Electronics Co., Ltd. (http://www.lge.com/) + * + * Based on LZ4 implementation by Yann Collet. + * + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +#ifndef STATIC +#include +#include +#endif +#include + +#include + +#include "lz4defs.h" + +static int lz4_uncompress(const char *source, char *dest, int osize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *ref; + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + osize; + BYTE *cpy; + unsigned token; + size_t length; + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + + while (1) { + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + size_t len; + + len = *ip++; + for (; len == 255; length += 255) + len = *ip++; + length += len; + } + + /* copy literals */ + cpy = op + length; + if (unlikely(cpy > oend - COPYLENGTH)) { + /* + * Error: not enough place for another match + * (min 4) + 5 literals + */ + if (cpy != oend) + goto _output_error; + + memcpy(op, ip, length); + ip += length; + break; /* EOF */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + + /* Error: offset create reference outside destination buffer */ + if (unlikely(ref < (BYTE *const) dest)) + goto _output_error; + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + for (; *ip == 255; length += 255) + ip++; + length += *ip++; + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op-ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE - 4); + if (cpy > (oend - COPYLENGTH)) { + + /* Error: request to write beyond destination buffer */ + if (cpy > oend) + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *)ip) - source); + + /* write overflow error detected */ +_output_error: + return (int) (-(((char *)ip) - source)); +} + +static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, + int isize, size_t maxoutputsize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *const iend = ip + isize; + const BYTE *ref; + + + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + maxoutputsize; + BYTE *cpy; + + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + + /* Main Loop */ + while (ip < iend) { + + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + int s = 255; + while ((ip < iend) && (s == 255)) { + s = *ip++; + length += s; + } + } + /* copy literals */ + cpy = op + length; + if ((cpy > oend - COPYLENGTH) || + (ip + length > iend - COPYLENGTH)) { + + if (cpy > oend) + goto _output_error;/* writes beyond buffer */ + + if (ip + length != iend) + goto _output_error;/* + * Error: LZ4 format requires + * to consume all input + * at this stage + */ + memcpy(op, ip, length); + op += length; + break;/* Necessarily EOF, due to parsing restrictions */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + if (ref < (BYTE * const) dest) + goto _output_error; + /* + * Error : offset creates reference + * outside of destination buffer + */ + + /* get matchlength */ + length = (token & ML_MASK); + if (length == ML_MASK) { + while (ip < iend) { + int s = *ip++; + length += s; + if (s == 255) + continue; + break; + } + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op - ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE-4); + if (cpy > oend - COPYLENGTH) { + if (cpy > oend) + goto _output_error; /* write outside of buf */ + + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *) op) - dest); + + /* write overflow error detected */ +_output_error: + return (int) (-(((char *) ip) - source)); +} + +int lz4_decompress(const char *src, size_t *src_len, char *dest, + size_t actual_dest_len) +{ + int ret = -1; + int input_len = 0; + + input_len = lz4_uncompress(src, dest, actual_dest_len); + if (input_len < 0) + goto exit_0; + *src_len = input_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL_GPL(lz4_decompress); +#endif + +int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, + char *dest, size_t *dest_len) +{ + int ret = -1; + int out_len = 0; + + out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, + *dest_len); + if (out_len < 0) + goto exit_0; + *dest_len = out_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL_GPL(lz4_decompress_unknownoutputsize); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 Decompressor"); +#endif diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h new file mode 100644 index 000000000000..43ac31d63f36 --- /dev/null +++ b/lib/lz4/lz4defs.h @@ -0,0 +1,94 @@ +/* + * lz4defs.h -- architecture specific defines + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Detects 64 bits mode + */ +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \ + || defined(__ppc64__) || defined(__LP64__)) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Architecture-specific macros + */ +#define BYTE u8 +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ + || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \ + && defined(ARM_EFFICIENT_UNALIGNED_ACCESS) +typedef struct _U32_S { u32 v; } U32_S; +typedef struct _U64_S { u64 v; } U64_S; + +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) + +#define PUT4(s, d) (A32(d) = A32(s)) +#define PUT8(s, d) (A64(d) = A64(s)) +#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ + +#define PUT4(s, d) \ + put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) +#define PUT8(s, d) \ + put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) +#endif + +#define COPYLENGTH 8 +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) + +#if LZ4_ARCH64/* 64-bit */ +#define STEPSIZE 8 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT8(s, d); \ + d += 8; \ + s += 8; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) + +#define LZ4_SECURECOPY(s, d, e) \ + do { \ + if (d < e) { \ + LZ4_WILDCOPY(s, d, e); \ + } \ + } while (0) + +#else /* 32-bit */ +#define STEPSIZE 4 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT4(s, d); \ + d += 4; \ + s += 4; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) \ + do { \ + LZ4_COPYSTEP(s, d); \ + LZ4_COPYSTEP(s, d); \ + } while (0) + +#define LZ4_SECURECOPY LZ4_WILDCOPY +#endif + +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ + (d = s - get_unaligned_le16(p)) + +#define LZ4_WILDCOPY(s, d, e) \ + do { \ + LZ4_COPYPACKET(s, d); \ + } while (d < e) -- cgit v1.2.3 From 2d9605956f780fc2f9a67677012680529e1871e1 Mon Sep 17 00:00:00 2001 From: Kyungsik Lee Date: Thu, 23 May 2013 10:38:20 +1000 Subject: lib: add support for LZ4-compressed kernel Add support for extracting LZ4-compressed kernel images, as well as LZ4-compressed ramdisk images in the kernel boot process. Signed-off-by: Kyungsik Lee Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Russell King Cc: Borislav Petkov Cc: Florian Fainelli Cc: Yann Collet Signed-off-by: Andrew Morton --- include/linux/decompress/unlz4.h | 10 +++ init/Kconfig | 17 +++- lib/Kconfig | 7 ++ lib/Makefile | 2 + lib/decompress.c | 5 ++ lib/decompress_unlz4.c | 187 +++++++++++++++++++++++++++++++++++++++ lib/lz4/Makefile | 1 + lib/lz4/lz4_decompress.c | 2 +- scripts/Makefile.lib | 5 ++ usr/Kconfig | 9 ++ 10 files changed, 243 insertions(+), 2 deletions(-) create mode 100644 include/linux/decompress/unlz4.h create mode 100644 lib/decompress_unlz4.c create mode 100644 lib/lz4/Makefile diff --git a/include/linux/decompress/unlz4.h b/include/linux/decompress/unlz4.h new file mode 100644 index 000000000000..d5b68bf3ec92 --- /dev/null +++ b/include/linux/decompress/unlz4.h @@ -0,0 +1,10 @@ +#ifndef DECOMPRESS_UNLZ4_H +#define DECOMPRESS_UNLZ4_H + +int unlz4(unsigned char *inbuf, int len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *pos, + void(*error)(char *x)); +#endif diff --git a/init/Kconfig b/init/Kconfig index 9ddf124ee40d..b672bb49f3e2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -98,10 +98,13 @@ config HAVE_KERNEL_XZ config HAVE_KERNEL_LZO bool +config HAVE_KERNEL_LZ4 + bool + choice prompt "Kernel compression mode" default KERNEL_GZIP - depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 help The linux kernel is a kind of self-extracting executable. Several compression algorithms are available, which differ @@ -168,6 +171,18 @@ config KERNEL_LZO size is about 10% bigger than gzip; however its speed (both compression and decompression) is the fastest. +config KERNEL_LZ4 + bool "LZ4" + depends on HAVE_KERNEL_LZ4 + help + LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding. + A preliminary version of LZ4 de/compression tool is available at + . + + Its compression ratio is worse than LZO. The size of the kernel + is about 8% bigger than LZO. But the decompression speed is + faster than LZO. + endchoice config DEFAULT_HOSTNAME diff --git a/lib/Kconfig b/lib/Kconfig index 339d5a4292f8..b65c87fd8a4b 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -191,6 +191,9 @@ config LZO_COMPRESS config LZO_DECOMPRESS tristate +config LZ4_DECOMPRESS + tristate + source "lib/xz/Kconfig" # @@ -215,6 +218,10 @@ config DECOMPRESS_LZO select LZO_DECOMPRESS tristate +config DECOMPRESS_LZ4 + select LZ4_DECOMPRESS + tristate + # # Generic allocator support is selected if needed # diff --git a/lib/Makefile b/lib/Makefile index 19cda4482505..d632ee21b571 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -75,6 +75,7 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_BCH) += bch.o obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/ obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ @@ -83,6 +84,7 @@ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o +lib-$(CONFIG_DECOMPRESS_LZ4) += decompress_unlz4.o obj-$(CONFIG_TEXTSEARCH) += textsearch.o obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o diff --git a/lib/decompress.c b/lib/decompress.c index f8fdedaf7b3d..4d1cd0397aab 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,9 @@ #ifndef CONFIG_DECOMPRESS_LZO # define unlzo NULL #endif +#ifndef CONFIG_DECOMPRESS_LZ4 +# define unlz4 NULL +#endif struct compress_format { unsigned char magic[2]; @@ -45,6 +49,7 @@ static const struct compress_format compressed_formats[] __initconst = { { {0x5d, 0x00}, "lzma", unlzma }, { {0xfd, 0x37}, "xz", unxz }, { {0x89, 0x4c}, "lzo", unlzo }, + { {0x02, 0x21}, "lz4", unlz4 }, { {0, 0}, NULL, NULL } }; diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c new file mode 100644 index 000000000000..3e67cfad16ad --- /dev/null +++ b/lib/decompress_unlz4.c @@ -0,0 +1,187 @@ +/* + * Wrapper for decompressing LZ4-compressed kernel, initramfs, and initrd + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifdef STATIC +#define PREBOOT +#include "lz4/lz4_decompress.c" +#else +#include +#endif +#include +#include +#include +#include + +#include + +/* + * Note: Uncompressed chunk size is used in the compressor side + * (userspace side for compression). + * It is hardcoded because there is not proper way to extract it + * from the binary stream which is generated by the preliminary + * version of LZ4 tool so far. + */ +#define LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE (8 << 20) +#define ARCHIVE_MAGICNUMBER 0x184C2102 + +STATIC inline int INIT unlz4(u8 *input, int in_len, + int (*fill) (void *, unsigned int), + int (*flush) (void *, unsigned int), + u8 *output, int *posp, + void (*error) (char *x)) +{ + int ret = -1; + size_t chunksize = 0; + size_t uncomp_chunksize = LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE; + u8 *inp; + u8 *inp_start; + u8 *outp; + int size = in_len; +#ifdef PREBOOT + size_t out_len = get_unaligned_le32(input + in_len); +#endif + size_t dest_len; + + + if (output) { + outp = output; + } else if (!flush) { + error("NULL output pointer and no flush function provided"); + goto exit_0; + } else { + outp = large_malloc(uncomp_chunksize); + if (!outp) { + error("Could not allocate output buffer"); + goto exit_0; + } + } + + if (input && fill) { + error("Both input pointer and fill function provided,"); + goto exit_1; + } else if (input) { + inp = input; + } else if (!fill) { + error("NULL input pointer and missing fill function"); + goto exit_1; + } else { + inp = large_malloc(lz4_compressbound(uncomp_chunksize)); + if (!inp) { + error("Could not allocate input buffer"); + goto exit_1; + } + } + inp_start = inp; + + if (posp) + *posp = 0; + + if (fill) + fill(inp, 4); + + chunksize = get_unaligned_le32(inp); + if (chunksize == ARCHIVE_MAGICNUMBER) { + inp += 4; + size -= 4; + } else { + error("invalid header"); + goto exit_2; + } + + if (posp) + *posp += 4; + + for (;;) { + + if (fill) + fill(inp, 4); + + chunksize = get_unaligned_le32(inp); + if (chunksize == ARCHIVE_MAGICNUMBER) { + inp += 4; + size -= 4; + if (posp) + *posp += 4; + continue; + } + inp += 4; + size -= 4; + + if (posp) + *posp += 4; + + if (fill) { + if (chunksize > lz4_compressbound(uncomp_chunksize)) { + error("chunk length is longer than allocated"); + goto exit_2; + } + fill(inp, chunksize); + } +#ifdef PREBOOT + if (out_len >= uncomp_chunksize) { + dest_len = uncomp_chunksize; + out_len -= dest_len; + } else + dest_len = out_len; + ret = lz4_decompress(inp, &chunksize, outp, dest_len); +#else + dest_len = uncomp_chunksize; + ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp, + &dest_len); +#endif + if (ret < 0) { + error("Decoding failed"); + goto exit_2; + } + + if (flush && flush(outp, dest_len) != dest_len) + goto exit_2; + if (output) + outp += dest_len; + if (posp) + *posp += chunksize; + + size -= chunksize; + + if (size == 0) + break; + else if (size < 0) { + error("data corrupted"); + goto exit_2; + } + + inp += chunksize; + if (fill) + inp = inp_start; + } + + ret = 0; +exit_2: + if (!input) + large_free(inp_start); +exit_1: + if (!output) + large_free(outp); +exit_0: + return ret; +} + +#ifdef PREBOOT +STATIC int INIT decompress(unsigned char *buf, int in_len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *posp, + void(*error)(char *x) + ) +{ + return unlz4(buf, in_len - 4, fill, flush, output, posp, error); +} +#endif diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile new file mode 100644 index 000000000000..7f548c6d1c5c --- /dev/null +++ b/lib/lz4/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index dcc89753af65..d3414eae73a1 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -1,7 +1,7 @@ /* * LZ4 Decompressor for Linux kernel * - * Copyright (C) 2013 LG Electronics Co., Ltd. (http://www.lge.com/) + * Copyright (C) 2013, LG Electronics, Kyungsik Lee * * Based on LZ4 implementation by Yann Collet. * diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index f97869f1f09b..53dd80808334 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -311,6 +311,11 @@ cmd_lzo = (cat $(filter-out FORCE,$^) | \ lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) +quiet_cmd_lz4 = LZ4 $@ +cmd_lz4 = (cat $(filter-out FORCE,$^) | \ + lz4demo -c1 stdin stdout && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) + # U-Boot mkimage # --------------------------------------------------------------------------- diff --git a/usr/Kconfig b/usr/Kconfig index 085872bb2bb5..642f503d3e9f 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -90,6 +90,15 @@ config RD_LZO Support loading of a LZO encoded initial ramdisk or cpio buffer If unsure, say N. +config RD_LZ4 + bool "Support initial ramdisks compressed using LZ4" if EXPERT + default !EXPERT + depends on BLK_DEV_INITRD + select DECOMPRESS_LZ4 + help + Support loading of a LZ4 encoded initial ramdisk or cpio buffer + If unsure, say N. + choice prompt "Built-in initramfs compression mode" if INITRAMFS_SOURCE!="" help -- cgit v1.2.3 From 2ac699da97fb193f07a32031efec2983078b08ae Mon Sep 17 00:00:00 2001 From: Kyungsik Lee Date: Thu, 23 May 2013 10:38:20 +1000 Subject: kbuild: fix for updated LZ4 tool with the new streaming format LZ4 has been updated with LZ4 Streaming Format specification(v1.3). lz4demo is replaced by lz4c. lz4c supports both the new streaming and legacy format with -l option. This patch makes use of lz4c to support legacy format which is used for LZ4 De/compression in the linux kernel. Link: https://code.google.com/p/lz4/source/checkout Signed-off-by: Kyungsik Lee Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Russell King Cc: Borislav Petkov Cc: Florian Fainelli Cc: Yann Collet Cc: Chanho Min Signed-off-by: Andrew Morton --- scripts/Makefile.lib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 53dd80808334..6031e2380638 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -313,7 +313,7 @@ cmd_lzo = (cat $(filter-out FORCE,$^) | \ quiet_cmd_lz4 = LZ4 $@ cmd_lz4 = (cat $(filter-out FORCE,$^) | \ - lz4demo -c1 stdin stdout && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + lz4c -l -c1 stdin stdout && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) # U-Boot mkimage -- cgit v1.2.3 From c61a2630827052797b8d77a0b7a6d69e9f33d721 Mon Sep 17 00:00:00 2001 From: Kyungsik Lee Date: Thu, 23 May 2013 10:38:20 +1000 Subject: arm: add support for LZ4-compressed kernel Integrates the LZ4 decompression code to the arm pre-boot code. Signed-off-by: Kyungsik Lee Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Russell King Cc: Borislav Petkov Cc: Florian Fainelli Cc: Yann Collet Signed-off-by: Andrew Morton --- arch/arm/Kconfig | 1 + arch/arm/boot/compressed/.gitignore | 1 + arch/arm/boot/compressed/Makefile | 6 +++++- arch/arm/boot/compressed/decompress.c | 4 ++++ arch/arm/boot/compressed/piggy.lz4.S | 6 ++++++ 5 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 arch/arm/boot/compressed/piggy.lz4.S diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2d6d6b232fd0..4315d1827391 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -40,6 +40,7 @@ config ARM select HAVE_IDE if PCI || ISA || PCMCIA select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO select HAVE_KERNEL_XZ diff --git a/arch/arm/boot/compressed/.gitignore b/arch/arm/boot/compressed/.gitignore index f79a08efe000..47279aa96a6a 100644 --- a/arch/arm/boot/compressed/.gitignore +++ b/arch/arm/boot/compressed/.gitignore @@ -6,6 +6,7 @@ piggy.gzip piggy.lzo piggy.lzma piggy.xzkern +piggy.lz4 vmlinux vmlinux.lds diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 3580d57ea218..001a13a0cf6f 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -27,6 +27,9 @@ OBJS += misc.o decompress.o ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y) OBJS += debug.o endif +ifeq ($(CONFIG_KERNEL_LZ4),y) +CFLAGS_decompress.o := -Os +endif FONTC = $(srctree)/drivers/video/console/font_acorn_8x8.c # string library code (-Os is enforced to keep it much smaller) @@ -91,6 +94,7 @@ suffix_$(CONFIG_KERNEL_GZIP) = gzip suffix_$(CONFIG_KERNEL_LZO) = lzo suffix_$(CONFIG_KERNEL_LZMA) = lzma suffix_$(CONFIG_KERNEL_XZ) = xzkern +suffix_$(CONFIG_KERNEL_LZ4) = lz4 # Borrowed libfdt files for the ATAG compatibility mode @@ -115,7 +119,7 @@ targets := vmlinux vmlinux.lds \ font.o font.c head.o misc.o $(OBJS) # Make sure files are removed during clean -extra-y += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern \ +extra-y += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern piggy.lz4 \ lib1funcs.S ashldi3.S $(libfdt) $(libfdt_hdrs) ifeq ($(CONFIG_FUNCTION_TRACER),y) diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c index 24b0475cb8bf..bd245d34952d 100644 --- a/arch/arm/boot/compressed/decompress.c +++ b/arch/arm/boot/compressed/decompress.c @@ -51,6 +51,10 @@ extern char * strstr(const char * s1, const char *s2); #include "../../../../lib/decompress_unxz.c" #endif +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x)) { return decompress(input, len, NULL, NULL, output, NULL, error); diff --git a/arch/arm/boot/compressed/piggy.lz4.S b/arch/arm/boot/compressed/piggy.lz4.S new file mode 100644 index 000000000000..3d9a575618a3 --- /dev/null +++ b/arch/arm/boot/compressed/piggy.lz4.S @@ -0,0 +1,6 @@ + .section .piggydata,#alloc + .globl input_data +input_data: + .incbin "arch/arm/boot/compressed/piggy.lz4" + .globl input_data_end +input_data_end: -- cgit v1.2.3 From f7deda2c6e872f2d79ef9148430bb766251842f1 Mon Sep 17 00:00:00 2001 From: Kyungsik Lee Date: Thu, 23 May 2013 10:38:21 +1000 Subject: arm: Remove enforced Os flag for LZ4 decompressor -Os is enforced here, based on the test result of decompression time below, slightly faster than -O2. But further tests with UA show that using -O2 will be the right choice especially in the case of the unaligned access enabled and the gap, few counts in the normal decompression mode is small enough to remove -Os. Decompression Time(counts) Normal UA enabled -Os 6717 3447 -O2 6720 2728 Note: ARM v7, Kernel 3.4 counter freq. = 32768 HZ UA(Unaligned Access) gcc version 4.6.2 Signed-off-by: Kyungsik Lee Cc: Thomas Gleixner Cc: Russell King Cc: Borislav Petkov Cc: Florian Fainelli Cc: Yann Collet Signed-off-by: Andrew Morton --- arch/arm/boot/compressed/Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 001a13a0cf6f..198a4ad89578 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -27,9 +27,6 @@ OBJS += misc.o decompress.o ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y) OBJS += debug.o endif -ifeq ($(CONFIG_KERNEL_LZ4),y) -CFLAGS_decompress.o := -Os -endif FONTC = $(srctree)/drivers/video/console/font_acorn_8x8.c # string library code (-Os is enforced to keep it much smaller) -- cgit v1.2.3 From 410a763257d830cb2304b5bc4755f7ea9e0d340e Mon Sep 17 00:00:00 2001 From: Kyungsik Lee Date: Thu, 23 May 2013 10:38:21 +1000 Subject: x86: add support for LZ4-compressed kernel Integrate the LZ4 decompression code to the x86 pre-boot code. Signed-off-by: Kyungsik Lee Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Russell King Cc: Borislav Petkov Cc: Florian Fainelli Cc: Yann Collet Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + arch/x86/boot/compressed/Makefile | 6 +++++- arch/x86/boot/compressed/misc.c | 4 ++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f2eccdd64a92..0d3e6cd80013 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -65,6 +65,7 @@ config X86 select HAVE_KERNEL_LZMA select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO + select HAVE_KERNEL_LZ4 select HAVE_HW_BREAKPOINT select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5ef205c5f37b..dcd90df10ab4 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,8 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo +targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ + vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -63,12 +64,15 @@ $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE $(call if_changed,xzkern) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzo) +$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE + $(call if_changed,lz4) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo +suffix-$(CONFIG_KERNEL_LZ4) := lz4 quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 7cb56c6ca351..0319c88290a5 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -145,6 +145,10 @@ static int lines, cols; #include "../../../../lib/decompress_unlzo.c" #endif +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + static void scroll(void) { int i; -- cgit v1.2.3 From 0e18a93f9e19b511ba5ecb74c6b1155fc7f162ec Mon Sep 17 00:00:00 2001 From: Kyungsik Lee Date: Thu, 23 May 2013 10:38:21 +1000 Subject: x86, doc: Add LZ4 magic number for the new compression Documentation/x86/boot.txt is updated to list the LZ4 magic number. This LZ4 magic number is used for the new compression format. Signed-off-by: Kyungsik Lee Acked-by: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Russell King Cc: Borislav Petkov Cc: Florian Fainelli Cc: Yann Collet Signed-off-by: Andrew Morton --- Documentation/x86/boot.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 3840b6f28afb..fc66d42422ee 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -657,9 +657,10 @@ Protocol: 2.08+ uncompressed data should be determined using the standard magic numbers. The currently supported compression formats are gzip (magic numbers 1F 8B or 1F 9E), bzip2 (magic number 42 5A), LZMA - (magic number 5D 00), and XZ (magic number FD 37). The uncompressed - payload is currently always ELF (magic number 7F 45 4C 46). - + (magic number 5D 00), XZ (magic number FD 37), and LZ4 (magic number + 02 21). The uncompressed payload is currently always ELF (magic + number 7F 45 4C 46). + Field name: payload_length Type: read Offset/size: 0x24c/4 -- cgit v1.2.3 From 1413e00ce877e7234de390caafcf45f305c68b7a Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Thu, 23 May 2013 10:38:22 +1000 Subject: lib: add lz4 compressor module This patchset is for supporting LZ4 compression and the crypto API using it. As shown below, the size of data is a little bit bigger but compressing speed is faster under the enabled unaligned memory access. We can use lz4 de/compression through crypto API as well. Also, It will be useful for another potential user of lz4 compression. lz4 Compression Benchmark: Compiler: ARM gcc 4.6.4 ARMv7, 1 GHz based board Kernel: linux 3.4 Uncompressed data Size: 101 MB Compressed Size compression Speed LZO 72.1MB 32.1MB/s, 33.0MB/s(UA) LZ4 75.1MB 30.4MB/s, 35.9MB/s(UA) LZ4HC 59.8MB 2.4MB/s, 2.5MB/s(UA) - UA: Unaligned memory Access support - Latest patch set for LZO applied This patch: Add support for LZ4 compression in the Linux Kernel. LZ4 Compression APIs for kernel are based on LZ4 implementation by Yann Collet and were changed for kernel coding style. LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html LZ4 source repository : http://code.google.com/p/lz4/ svn revision : r90 Two APIs are added: lz4_compress() support basic lz4 compression whereas lz4hc_compress() support high compression or CPU performance get lower but compression ratio get higher. Also, we require the pre-allocated working memory with the defined size and destination buffer must be allocated with the size of lz4_compressbound. Signed-off-by: Chanho Min Cc: "Darrick J. Wong" Cc: Bob Pearson Cc: Richard Weinberger Cc: Herbert Xu Cc: Yann Collet Cc: Kyungsik Lee Signed-off-by: Andrew Morton --- include/linux/lz4.h | 36 ++++ lib/Kconfig | 6 + lib/Makefile | 2 + lib/lz4/Makefile | 2 + lib/lz4/lz4_compress.c | 443 ++++++++++++++++++++++++++++++++++++++ lib/lz4/lz4defs.h | 66 +++++- lib/lz4/lz4hc_compress.c | 539 +++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 1092 insertions(+), 2 deletions(-) create mode 100644 lib/lz4/lz4_compress.c create mode 100644 lib/lz4/lz4hc_compress.c diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 7f6c75a093f8..d21c13f10a64 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -9,6 +9,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#define LZ4_MEM_COMPRESS (4096 * sizeof(unsigned char *)) +#define LZ4HC_MEM_COMPRESS (65538 * sizeof(unsigned char *)) /* * lz4_compressbound() @@ -20,6 +22,40 @@ static inline size_t lz4_compressbound(size_t isize) return isize + (isize / 255) + 16; } +/* + * lz4_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. + */ +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + + /* + * lz4hc_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. + */ +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + /* * lz4_decompress() * src : source address of the compressed data diff --git a/lib/Kconfig b/lib/Kconfig index b65c87fd8a4b..c2ce4fd8b12b 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -191,6 +191,12 @@ config LZO_COMPRESS config LZO_DECOMPRESS tristate +config LZ4_COMPRESS + tristate + +config LZ4HC_COMPRESS + tristate + config LZ4_DECOMPRESS tristate diff --git a/lib/Makefile b/lib/Makefile index d632ee21b571..af911db40b88 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -75,6 +75,8 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_BCH) += bch.o obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_LZ4_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/ obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/ obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile index 7f548c6d1c5c..8085d04e9309 100644 --- a/lib/lz4/Makefile +++ b/lib/lz4/Makefile @@ -1 +1,3 @@ +obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c new file mode 100644 index 000000000000..fd94058bd7f9 --- /dev/null +++ b/lib/lz4/lz4_compress.c @@ -0,0 +1,443 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min + */ + +#include +#include +#include +#include +#include "lz4defs.h" + +/* + * LZ4_compressCtx : + * ----------------- + * Compress 'isize' bytes from 'source' into an output buffer 'dest' of + * maximum size 'maxOutputSize'. * If it cannot achieve it, compression + * will stop, and result of the function will be zero. + * return : the number of bytes written in buffer 'dest', or 0 if the + * compression fails + */ +static inline int lz4_compressctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + HTYPE *hashtable = (HTYPE *)ctx; + const u8 *ip = (u8 *)source; +#if LZ4_ARCH64 + const BYTE * const base = ip; +#else + const int base = 0; +#endif + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardh = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (unlikely(forwardip > mflimit)) + goto _last_literals; + + forwardh = LZ4_HASH_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = ip - base; + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + + if (length >= (int)RUN_MASK) { + int len; + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + /* Encode MatchLength */ + length = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend)) + return 0; + if (length >= (int)ML_MASK) { + *token += ML_MASK; + length -= ML_MASK; + for (; length > 509 ; length -= 510) { + *op++ = 255; + *op++ = 255; + } + if (length > 254) { + length -= 255; + *op++ = 255; + } + *op++ = (u8)length; + } else + *token += length; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + + /* Test next position */ + ref = base + hashtable[LZ4_HASH_VALUE(ip)]; + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop */ + anchor = ip++; + forwardh = LZ4_HASH_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (((char *)op - dest) + lastrun + 1 + + ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize) + return 0; + + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + + /* End */ + return (int)(((char *)op) - dest); +} + +static inline int lz4_compress64kctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + u16 *hashtable = (u16 *)ctx; + const u8 *ip = (u8 *) source; + const u8 *anchor = ip; + const u8 *const base = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int len, length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (forwardip > mflimit) + goto _last_literals; + + forwardh = LZ4_HASH64K_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = (u16)(ip - base); + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) + && (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + + while (ip < MATCHLIMIT - (STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + + /* Encode MatchLength */ + len = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return 0; + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (u8)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base); + + /* Test next position */ + ref = base + hashtable[LZ4_HASH64K_VALUE(ip)]; + hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base); + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop */ + anchor = ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend) + return 0; + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int)(((char *)op) - dest); +} + +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + if (src_len < LZ4_64KLIMIT) + out_len = lz4_compress64kctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + else + out_len = lz4_compressctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + + return 0; +exit: + return ret; +} +EXPORT_SYMBOL_GPL(lz4_compress); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 compressor"); diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h index 43ac31d63f36..abcecdc2d0f2 100644 --- a/lib/lz4/lz4defs.h +++ b/lib/lz4/lz4defs.h @@ -22,23 +22,40 @@ * Architecture-specific macros */ #define BYTE u8 +typedef struct _U16_S { u16 v; } U16_S; +typedef struct _U32_S { u32 v; } U32_S; +typedef struct _U64_S { u64 v; } U64_S; #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \ && defined(ARM_EFFICIENT_UNALIGNED_ACCESS) -typedef struct _U32_S { u32 v; } U32_S; -typedef struct _U64_S { u64 v; } U64_S; +#define A16(x) (((U16_S *)(x))->v) #define A32(x) (((U32_S *)(x))->v) #define A64(x) (((U64_S *)(x))->v) #define PUT4(s, d) (A32(d) = A32(s)) #define PUT8(s, d) (A64(d) = A64(s)) +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + A16(p) = v; \ + p += 2; \ + } while (0) #else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ +#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v)) +#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v)) +#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v)) + #define PUT4(s, d) \ put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) #define PUT8(s, d) \ put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) + +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + put_unaligned(v, (u16 *)(p)); \ + p += 2; \ + } while (0) #endif #define COPYLENGTH 8 @@ -46,6 +63,29 @@ typedef struct _U64_S { u64 v; } U64_S; #define ML_MASK ((1U << ML_BITS) - 1) #define RUN_BITS (8 - ML_BITS) #define RUN_MASK ((1U << RUN_BITS) - 1) +#define MEMORY_USAGE 14 +#define MINMATCH 4 +#define SKIPSTRENGTH 6 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) +#define MAXD_LOG 16 +#define MAXD (1 << MAXD_LOG) +#define MAXD_MASK (u32)(MAXD - 1) +#define MAX_DISTANCE (MAXD - 1) +#define HASH_LOG (MAXD_LOG - 1) +#define HASHTABLESIZE (1 << HASH_LOG) +#define MAX_NB_ATTEMPTS 256 +#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) +#define HASHLOG64K ((MEMORY_USAGE - 2) + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - (MEMORY_USAGE-2))) +#define LZ4_HASH64K_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASHLOG64K)) +#define HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASH_LOG)) #if LZ4_ARCH64/* 64-bit */ #define STEPSIZE 8 @@ -65,6 +105,13 @@ typedef struct _U64_S { u64 v; } U64_S; LZ4_WILDCOPY(s, d, e); \ } \ } while (0) +#define HTYPE u32 + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3) +#endif #else /* 32-bit */ #define STEPSIZE 4 @@ -83,6 +130,14 @@ typedef struct _U64_S { u64 v; } U64_S; } while (0) #define LZ4_SECURECOPY LZ4_WILDCOPY +#define HTYPE const u8* + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) +#endif + #endif #define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ @@ -92,3 +147,10 @@ typedef struct _U64_S { u64 v; } U64_S; do { \ LZ4_COPYPACKET(s, d); \ } while (d < e) + +#define LZ4_BLINDCOPY(s, d, l) \ + do { \ + u8 *e = (d) + l; \ + LZ4_WILDCOPY(s, d, e); \ + d = e; \ + } while (0) diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c new file mode 100644 index 000000000000..a9a9c2a00c5c --- /dev/null +++ b/lib/lz4/lz4hc_compress.c @@ -0,0 +1,539 @@ +/* + * LZ4 HC - High Compression Mode of LZ4 + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min + */ + +#include +#include +#include +#include +#include "lz4defs.h" + +struct lz4hc_data { + const u8 *base; + HTYPE hashtable[HASHTABLESIZE]; + u16 chaintable[MAXD]; + const u8 *nexttoupdate; +} __attribute__((__packed__)); + +static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base) +{ + memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable)); + memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable)); + +#if LZ4_ARCH64 + hc4->nexttoupdate = base + 1; +#else + hc4->nexttoupdate = base; +#endif + hc4->base = base; + return 1; +} + +/* Update chains up to ip (excluded) */ +static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip) +{ + u16 *chaintable = hc4->chaintable; + HTYPE *hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + + while (hc4->nexttoupdate < ip) { + const u8 *p = hc4->nexttoupdate; + size_t delta = p - (hashtable[HASH_VALUE(p)] + base); + if (delta > MAX_DISTANCE) + delta = MAX_DISTANCE; + chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta; + hashtable[HASH_VALUE(p)] = (p) - base; + hc4->nexttoupdate++; + } +} + +static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2, + const u8 *const matchlimit) +{ + const u8 *p1t = p1; + + while (p1t < matchlimit - (STEPSIZE - 1)) { +#if LZ4_ARCH64 + u64 diff = A64(p2) ^ A64(p1t); +#else + u32 diff = A32(p2) ^ A32(p1t); +#endif + if (!diff) { + p1t += STEPSIZE; + p2 += STEPSIZE; + continue; + } + p1t += LZ4_NBCOMMONBYTES(diff); + return p1t - p1; + } +#if LZ4_ARCH64 + if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) { + p1t += 4; + p2 += 4; + } +#endif + + if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) { + p1t += 2; + p2 += 2; + } + if ((p1t < matchlimit) && (*p2 == *p1t)) + p1t++; + return p1t - p1; +} + +static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *const matchlimit, const u8 **matchpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; + const u8 *ref; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + int nbattempts = MAX_NB_ATTEMPTS; + size_t repl = 0, ml = 0; + u16 delta; + + /* HC4 match finder */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + /* potential repetition */ + if (ref >= ip-4) { + /* confirmed */ + if (A32(ref) == A32(ip)) { + delta = (u16)(ip-ref); + repl = ml = lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + *matchpos = ref; + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + while ((ref >= ip - MAX_DISTANCE) && nbattempts) { + nbattempts--; + if (*(ref + ml) == *(ip + ml)) { + if (A32(ref) == A32(ip)) { + size_t mlt = + lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + if (mlt > ml) { + ml = mlt; + *matchpos = ref; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + /* Complete table */ + if (repl) { + const BYTE *ptr = ip; + const BYTE *end; + end = ip + repl - (MINMATCH-1); + /* Pre-Load */ + while (ptr < end - delta) { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + ptr++; + } + do { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + /* Head of chain */ + hashtable[HASH_VALUE(ptr)] = (ptr) - base; + ptr++; + } while (ptr < end); + hc4->nexttoupdate = end; + } + + return (int)ml; +} + +static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest, + const u8 **matchpos, const u8 **startpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + const u8 *ref; + int nbattempts = MAX_NB_ATTEMPTS; + int delta = (int)(ip - startlimit); + + /* First Match */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base) + && (nbattempts)) { + nbattempts--; + if (*(startlimit + longest) == *(ref - delta + longest)) { + if (A32(ref) == A32(ip)) { + const u8 *reft = ref + MINMATCH; + const u8 *ipt = ip + MINMATCH; + const u8 *startt = ip; + + while (ipt < matchlimit-(STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(reft) ^ A64(ipt); + #else + u32 diff = A32(reft) ^ A32(ipt); + #endif + + if (!diff) { + ipt += STEPSIZE; + reft += STEPSIZE; + continue; + } + ipt += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ipt < (matchlimit - 3)) + && (A32(reft) == A32(ipt))) { + ipt += 4; + reft += 4; + } + ipt += 2; + #endif + if ((ipt < (matchlimit - 1)) + && (A16(reft) == A16(ipt))) { + reft += 2; + } + if ((ipt < matchlimit) && (*reft == *ipt)) + ipt++; +_endcount: + reft = ref; + + while ((startt > startlimit) + && (reft > hc4->base) + && (startt[-1] == reft[-1])) { + startt--; + reft--; + } + + if ((ipt - startt) > longest) { + longest = (int)(ipt - startt); + *matchpos = reft; + *startpos = startt; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + return longest; +} + +static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor, + int ml, const u8 *ref) +{ + int length, len; + u8 *token; + + /* Encode Literal length */ + length = (int)(*ip - *anchor); + token = (*op)++; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *(*op)++ = 255; + *(*op)++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(*anchor, *op, length); + + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(*op, (u16)(*ip - ref)); + + /* Encode MatchLength */ + len = (int)(ml - MINMATCH); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *(*op)++ = 255; + *(*op)++ = 255; + } + if (len > 254) { + len -= 255; + *(*op)++ = 255; + } + *(*op)++ = (u8)len; + } else + *token += len; + + /* Prepare next loop */ + *ip += ml; + *anchor = *ip; + + return 0; +} + +int lz4_compresshcctx(struct lz4hc_data *ctx, + const char *source, + char *dest, + int isize) +{ + const u8 *ip = (const u8 *)source; + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + const u8 *const matchlimit = (iend - LASTLITERALS); + + u8 *op = (u8 *)dest; + + int ml, ml2, ml3, ml0; + const u8 *ref = NULL; + const u8 *start2 = NULL; + const u8 *ref2 = NULL; + const u8 *start3 = NULL; + const u8 *ref3 = NULL; + const u8 *start0; + const u8 *ref0; + int lastrun; + + ip++; + + /* Main Loop */ + while (ip < mflimit) { + ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref)); + if (!ml) { + ip++; + continue; + } + + /* saved, in case we would skip too much */ + start0 = ip; + ref0 = ref; + ml0 = ml; +_search2: + if (ip+ml < mflimit) + ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2, + ip + 1, matchlimit, ml, &ref2, &start2); + else + ml2 = ml; + /* No better match */ + if (ml2 == ml) { + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + continue; + } + + if (start0 < ip) { + /* empirical */ + if (start2 < ip + ml0) { + ip = start0; + ref = ref0; + ml = ml0; + } + } + /* + * Here, start0==ip + * First Match too small : removed + */ + if ((start2 - ip) < 3) { + ml = ml2; + ip = start2; + ref = ref2; + goto _search2; + } + +_search3: + /* + * Currently we have : + * ml2 > ml1, and + * ip1+3 <= ip2 (usually < ip1+ml1) + */ + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + int new_ml = ml; + if (new_ml > OPTIMAL_ML) + new_ml = OPTIMAL_ML; + if (ip + new_ml > start2 + ml2 - MINMATCH) + new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + /* + * Now, we have start2 = ip+new_ml, + * with new_ml=min(ml, OPTIMAL_ML=18) + */ + if (start2 + ml2 < mflimit) + ml3 = lz4hc_insertandgetwidermatch(ctx, + start2 + ml2 - 3, start2, matchlimit, + ml2, &ref3, &start3); + else + ml3 = ml2; + + /* No better match : 2 sequences to encode */ + if (ml3 == ml2) { + /* ip & ref are known; Now for ml */ + if (start2 < ip+ml) + ml = (int)(start2 - ip); + + /* Now, encode 2 sequences */ + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start2; + lz4_encodesequence(&ip, &op, &anchor, ml2, ref2); + continue; + } + + /* Not enough space for match 2 : remove it */ + if (start3 < ip + ml + 3) { + /* + * can write Seq1 immediately ==> Seq2 is removed, + * so Seq3 becomes Seq1 + */ + if (start3 >= (ip + ml)) { + if (start2 < ip + ml) { + int correction = + (int)(ip + ml - start2); + start2 += correction; + ref2 += correction; + ml2 -= correction; + if (ml2 < MINMATCH) { + start2 = start3; + ref2 = ref3; + ml2 = ml3; + } + } + + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start3; + ref = ref3; + ml = ml3; + + start0 = start2; + ref0 = ref2; + ml0 = ml2; + goto _search2; + } + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + goto _search3; + } + + /* + * OK, now we have 3 ascending matches; let's write at least + * the first one ip & ref are known; Now for ml + */ + if (start2 < ip + ml) { + if ((start2 - ip) < (int)ML_MASK) { + int correction; + if (ml > OPTIMAL_ML) + ml = OPTIMAL_ML; + if (ip + ml > start2 + ml2 - MINMATCH) + ml = (int)(start2 - ip) + ml2 + - MINMATCH; + correction = ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } else + ml = (int)(start2 - ip); + } + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + + ip = start2; + ref = ref2; + ml = ml2; + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + + goto _search3; + } + + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8) lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int) (((char *)op) - dest); +} + +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem; + lz4hc_init(hc4, (const u8 *)src); + out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src, + (char *)dst, (int)src_len); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + return 0; + +exit: + return ret; +} +EXPORT_SYMBOL_GPL(lz4hc_compress); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4HC compressor"); -- cgit v1.2.3 From fbccc0f33a4fa1924e03eb021883996ee677b792 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:38:22 +1000 Subject: lib-add-lz4-compressor-module-fix make lz4_compresshcctx() static Cc: Chanho Min Signed-off-by: Andrew Morton --- lib/lz4/lz4hc_compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c index a9a9c2a00c5c..eb1a74f5e368 100644 --- a/lib/lz4/lz4hc_compress.c +++ b/lib/lz4/lz4hc_compress.c @@ -314,7 +314,7 @@ static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor, return 0; } -int lz4_compresshcctx(struct lz4hc_data *ctx, +static int lz4_compresshcctx(struct lz4hc_data *ctx, const char *source, char *dest, int isize) -- cgit v1.2.3 From 6ee7d4c77a0d0d0328ee13e4373d82670f0bd455 Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Thu, 23 May 2013 10:38:22 +1000 Subject: crypto: add lz4 Cryptographic API Add support for lz4 and lz4hc compression algorithm using the lib/lz4/* codebase. Signed-off-by: Chanho Min Cc: "Darrick J. Wong" Cc: Bob Pearson Cc: Richard Weinberger Cc: Herbert Xu Cc: Yann Collet Cc: Kyungsik Lee Signed-off-by: Andrew Morton --- crypto/Kconfig | 16 +++++++++ crypto/Makefile | 2 ++ crypto/lz4.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ crypto/lz4hc.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 228 insertions(+) create mode 100644 crypto/lz4.c create mode 100644 crypto/lz4hc.c diff --git a/crypto/Kconfig b/crypto/Kconfig index d1ca6312d798..25b52098230a 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1378,6 +1378,22 @@ config CRYPTO_842 help This is the 842 algorithm. +config CRYPTO_LZ4 + tristate "LZ4 compression algorithm" + select CRYPTO_ALGAPI + select LZ4_COMPRESS + select LZ4_DECOMPRESS + help + This is the LZ4 algorithm. + +config CRYPTO_LZ4HC + tristate "LZ4HC compression algorithm" + select CRYPTO_ALGAPI + select LZ4HC_COMPRESS + select LZ4_DECOMPRESS + help + This is the LZ4 high compression mode algorithm. + comment "Random Number Generation" config CRYPTO_ANSI_CPRNG diff --git a/crypto/Makefile b/crypto/Makefile index 62af87df8729..2d5ed08a239f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -86,6 +86,8 @@ obj-$(CONFIG_CRYPTO_CRC32) += crc32.o obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o obj-$(CONFIG_CRYPTO_LZO) += lzo.o +obj-$(CONFIG_CRYPTO_LZ4) += lz4.o +obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o obj-$(CONFIG_CRYPTO_842) += 842.o obj-$(CONFIG_CRYPTO_RNG2) += rng.o obj-$(CONFIG_CRYPTO_RNG2) += krng.o diff --git a/crypto/lz4.c b/crypto/lz4.c new file mode 100644 index 000000000000..98bfdd71e78f --- /dev/null +++ b/crypto/lz4.c @@ -0,0 +1,105 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2013 Chanho Min + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include + +struct lz4_ctx { + void *lz4_comp_mem; +}; + +static int lz4_init(struct crypto_tfm *tfm) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lz4_comp_mem = vmalloc(LZ4_MEM_COMPRESS); + if (!ctx->lz4_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lz4_exit(struct crypto_tfm *tfm) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + vfree(ctx->lz4_comp_mem); +} + +static int lz4_compress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; + int err; + + err = lz4_compress(src, slen, dst, &tmp_len, ctx->lz4_comp_mem); + + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lz4_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; + + err = lz4_decompress(src, &slen, dst, tmp_len); + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return err; +} + +static struct crypto_alg alg_lz4 = { + .cra_name = "lz4", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lz4_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg_lz4.cra_list), + .cra_init = lz4_init, + .cra_exit = lz4_exit, + .cra_u = { .compress = { + .coa_compress = lz4_compress_crypto, + .coa_decompress = lz4_decompress_crypto } } +}; + +static int __init lz4_mod_init(void) +{ + return crypto_register_alg(&alg_lz4); +} + +static void __exit lz4_mod_fini(void) +{ + crypto_unregister_alg(&alg_lz4); +} + +module_init(lz4_mod_init); +module_exit(lz4_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 Compression Algorithm"); diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c new file mode 100644 index 000000000000..d3c9625917cd --- /dev/null +++ b/crypto/lz4hc.c @@ -0,0 +1,105 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2013 Chanho Min + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include +#include +#include +#include +#include + +struct lz4hc_ctx { + void *lz4hc_comp_mem; +}; + +static int lz4hc_init(struct crypto_tfm *tfm) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lz4hc_comp_mem = vmalloc(LZ4HC_MEM_COMPRESS); + if (!ctx->lz4hc_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lz4hc_exit(struct crypto_tfm *tfm) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + + vfree(ctx->lz4hc_comp_mem); +} + +static int lz4hc_compress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; + int err; + + err = lz4hc_compress(src, slen, dst, &tmp_len, ctx->lz4hc_comp_mem); + + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lz4hc_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; + + err = lz4_decompress(src, &slen, dst, tmp_len); + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return err; +} + +static struct crypto_alg alg_lz4hc = { + .cra_name = "lz4hc", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lz4hc_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg_lz4hc.cra_list), + .cra_init = lz4hc_init, + .cra_exit = lz4hc_exit, + .cra_u = { .compress = { + .coa_compress = lz4hc_compress_crypto, + .coa_decompress = lz4hc_decompress_crypto } } +}; + +static int __init lz4hc_mod_init(void) +{ + return crypto_register_alg(&alg_lz4hc); +} + +static void __exit lz4hc_mod_fini(void) +{ + crypto_unregister_alg(&alg_lz4hc); +} + +module_init(lz4hc_mod_init); +module_exit(lz4hc_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4HC Compression Algorithm"); -- cgit v1.2.3 From d376d4f754133c537fa796dcad7bfbf73fcaf090 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 May 2013 10:38:23 +1000 Subject: crypto-add-lz4-cryptographic-api-fix fix warnings Reported-by: Wu Fengguang Cc: "Darrick J. Wong" Cc: Bob Pearson Cc: Chanho Min Cc: Herbert Xu Cc: Kyungsik Lee Cc: Richard Weinberger Cc: Yann Collet Signed-off-by: Andrew Morton --- crypto/lz4.c | 3 ++- crypto/lz4hc.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crypto/lz4.c b/crypto/lz4.c index 98bfdd71e78f..4586dd15b0d8 100644 --- a/crypto/lz4.c +++ b/crypto/lz4.c @@ -66,8 +66,9 @@ static int lz4_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, { int err; size_t tmp_len = *dlen; + size_t __slen = slen; - err = lz4_decompress(src, &slen, dst, tmp_len); + err = lz4_decompress(src, &__slen, dst, tmp_len); if (err < 0) return -EINVAL; diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c index d3c9625917cd..151ba31d34e3 100644 --- a/crypto/lz4hc.c +++ b/crypto/lz4hc.c @@ -66,8 +66,9 @@ static int lz4hc_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, { int err; size_t tmp_len = *dlen; + size_t __slen = slen; - err = lz4_decompress(src, &slen, dst, tmp_len); + err = lz4_decompress(src, &__slen, dst, tmp_len); if (err < 0) return -EINVAL; -- cgit v1.2.3 From 21000fa39243916f8698b97ce0514a955dabf264 Mon Sep 17 00:00:00 2001 From: Daniel Tang Date: Thu, 23 May 2013 10:38:23 +1000 Subject: scripts/sortextable.c: fix building on non-Linux systems scripts/sortextable.c fails to compile on non-Linux systems due to the missing 'linux/types.h' header. Unless I'm missing something obvious, including the standard 'inttypes.h' header instead and using uintX_t types instead of __uX types does the exact same job and doesn't break compilation on non-Linux systems. Signed-off-by: Daniel Tang Cc: Matt Fleming Signed-off-by: Andrew Morton --- tools/include/tools/be_byteshift.h | 34 +++++++++++++++++----------------- tools/include/tools/le_byteshift.h | 34 +++++++++++++++++----------------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/tools/include/tools/be_byteshift.h b/tools/include/tools/be_byteshift.h index f4912e2668ba..d51fe267cce8 100644 --- a/tools/include/tools/be_byteshift.h +++ b/tools/include/tools/be_byteshift.h @@ -1,68 +1,68 @@ #ifndef _TOOLS_BE_BYTESHIFT_H #define _TOOLS_BE_BYTESHIFT_H -#include +#include -static inline __u16 __get_unaligned_be16(const __u8 *p) +static inline uint16_t __get_unaligned_be16(const uint8_t *p) { return p[0] << 8 | p[1]; } -static inline __u32 __get_unaligned_be32(const __u8 *p) +static inline uint32_t __get_unaligned_be32(const uint8_t *p) { return p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3]; } -static inline __u64 __get_unaligned_be64(const __u8 *p) +static inline uint64_t __get_unaligned_be64(const uint8_t *p) { - return (__u64)__get_unaligned_be32(p) << 32 | + return (uint64_t)__get_unaligned_be32(p) << 32 | __get_unaligned_be32(p + 4); } -static inline void __put_unaligned_be16(__u16 val, __u8 *p) +static inline void __put_unaligned_be16(uint16_t val, uint8_t *p) { *p++ = val >> 8; *p++ = val; } -static inline void __put_unaligned_be32(__u32 val, __u8 *p) +static inline void __put_unaligned_be32(uint32_t val, uint8_t *p) { __put_unaligned_be16(val >> 16, p); __put_unaligned_be16(val, p + 2); } -static inline void __put_unaligned_be64(__u64 val, __u8 *p) +static inline void __put_unaligned_be64(uint64_t val, uint8_t *p) { __put_unaligned_be32(val >> 32, p); __put_unaligned_be32(val, p + 4); } -static inline __u16 get_unaligned_be16(const void *p) +static inline uint16_t get_unaligned_be16(const void *p) { - return __get_unaligned_be16((const __u8 *)p); + return __get_unaligned_be16((const uint8_t *)p); } -static inline __u32 get_unaligned_be32(const void *p) +static inline uint32_t get_unaligned_be32(const void *p) { - return __get_unaligned_be32((const __u8 *)p); + return __get_unaligned_be32((const uint8_t *)p); } -static inline __u64 get_unaligned_be64(const void *p) +static inline uint64_t get_unaligned_be64(const void *p) { - return __get_unaligned_be64((const __u8 *)p); + return __get_unaligned_be64((const uint8_t *)p); } -static inline void put_unaligned_be16(__u16 val, void *p) +static inline void put_unaligned_be16(uint16_t val, void *p) { __put_unaligned_be16(val, p); } -static inline void put_unaligned_be32(__u32 val, void *p) +static inline void put_unaligned_be32(uint32_t val, void *p) { __put_unaligned_be32(val, p); } -static inline void put_unaligned_be64(__u64 val, void *p) +static inline void put_unaligned_be64(uint64_t val, void *p) { __put_unaligned_be64(val, p); } diff --git a/tools/include/tools/le_byteshift.h b/tools/include/tools/le_byteshift.h index c99d45a68bda..259a132046ff 100644 --- a/tools/include/tools/le_byteshift.h +++ b/tools/include/tools/le_byteshift.h @@ -1,68 +1,68 @@ #ifndef _TOOLS_LE_BYTESHIFT_H #define _TOOLS_LE_BYTESHIFT_H -#include +#include -static inline __u16 __get_unaligned_le16(const __u8 *p) +static inline uint16_t __get_unaligned_le16(const uint8_t *p) { return p[0] | p[1] << 8; } -static inline __u32 __get_unaligned_le32(const __u8 *p) +static inline uint32_t __get_unaligned_le32(const uint8_t *p) { return p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24; } -static inline __u64 __get_unaligned_le64(const __u8 *p) +static inline uint64_t __get_unaligned_le64(const uint8_t *p) { - return (__u64)__get_unaligned_le32(p + 4) << 32 | + return (uint64_t)__get_unaligned_le32(p + 4) << 32 | __get_unaligned_le32(p); } -static inline void __put_unaligned_le16(__u16 val, __u8 *p) +static inline void __put_unaligned_le16(uint16_t val, uint8_t *p) { *p++ = val; *p++ = val >> 8; } -static inline void __put_unaligned_le32(__u32 val, __u8 *p) +static inline void __put_unaligned_le32(uint32_t val, uint8_t *p) { __put_unaligned_le16(val >> 16, p + 2); __put_unaligned_le16(val, p); } -static inline void __put_unaligned_le64(__u64 val, __u8 *p) +static inline void __put_unaligned_le64(uint64_t val, uint8_t *p) { __put_unaligned_le32(val >> 32, p + 4); __put_unaligned_le32(val, p); } -static inline __u16 get_unaligned_le16(const void *p) +static inline uint16_t get_unaligned_le16(const void *p) { - return __get_unaligned_le16((const __u8 *)p); + return __get_unaligned_le16((const uint8_t *)p); } -static inline __u32 get_unaligned_le32(const void *p) +static inline uint32_t get_unaligned_le32(const void *p) { - return __get_unaligned_le32((const __u8 *)p); + return __get_unaligned_le32((const uint8_t *)p); } -static inline __u64 get_unaligned_le64(const void *p) +static inline uint64_t get_unaligned_le64(const void *p) { - return __get_unaligned_le64((const __u8 *)p); + return __get_unaligned_le64((const uint8_t *)p); } -static inline void put_unaligned_le16(__u16 val, void *p) +static inline void put_unaligned_le16(uint16_t val, void *p) { __put_unaligned_le16(val, p); } -static inline void put_unaligned_le32(__u32 val, void *p) +static inline void put_unaligned_le32(uint32_t val, void *p) { __put_unaligned_le32(val, p); } -static inline void put_unaligned_le64(__u64 val, void *p) +static inline void put_unaligned_le64(uint64_t val, void *p) { __put_unaligned_le64(val, p); } -- cgit v1.2.3