From 2bd3a997befc226ab4b504f05c5cbba305f3e0e6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 23:53:19 -0800 Subject: init: Open /dev/console from rootfs To avoid potential problems with an empty /dev open /dev/console from rootfs instead of waiting to mount our root filesystem and mounting it there. This effectively guarantees that there will be a device node, and it won't be on a filesystem that we will ever unmount, so there are no issues with leaving /dev/console open and pinning the filesystem. This is actually more effective than automatically mounting devtmpfs on /dev because it removes removes the occasionally problematic assumption that /dev/console exists from the boot code. With this patch I was able to throw busybox on my /boot partition (which has no /dev directory) and boot into userspace without problems. The only possible negative consequence I can think of is that someone out there deliberately used did not use a character device that is major 5 minor 2 for /dev/console. Does anyone know of a situation in which that could make sense? Signed-off-by: Eric W. Biederman Signed-off-by: Al Viro --- init/main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index 4cb47a159f02..106e02d7ffa5 100644 --- a/init/main.c +++ b/init/main.c @@ -806,11 +806,6 @@ static noinline int init_post(void) system_state = SYSTEM_RUNNING; numa_default_policy(); - if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) - printk(KERN_WARNING "Warning: unable to open an initial console.\n"); - - (void) sys_dup(0); - (void) sys_dup(0); current->signal->flags |= SIGNAL_UNKILLABLE; @@ -873,6 +868,12 @@ static int __init kernel_init(void * unused) do_basic_setup(); + /* Open the /dev/console on the rootfs, this should never fail */ + if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) + printk(KERN_WARNING "Warning: unable to open an initial console.\n"); + + (void) sys_dup(0); + (void) sys_dup(0); /* * check if there is an early userspace init. If yes, let it do all * the work -- cgit v1.2.3 From 452aa6999e6703ffbddd7f6ea124d3968915f3e3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Mar 2010 13:42:13 -0800 Subject: mm/pm: force GFP_NOIO during suspend/hibernation and resume There are quite a few GFP_KERNEL memory allocations made during suspend/hibernation and resume that may cause the system to hang, because the I/O operations they depend on cannot be completed due to the underlying devices being suspended. Avoid this problem by clearing the __GFP_IO and __GFP_FS bits in gfp_allowed_mask before suspend/hibernation and restoring the original values of these bits in gfp_allowed_mask durig the subsequent resume. [akpm@linux-foundation.org: fix CONFIG_PM=n linkage] Signed-off-by: Rafael J. Wysocki Reported-by: Maxim Levitsky Cc: Sebastian Ott Cc: Benjamin Herrenschmidt Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 7 +++---- init/main.c | 2 +- kernel/power/hibernate.c | 9 +++++++++ kernel/power/suspend.c | 3 +++ mm/page_alloc.c | 25 +++++++++++++++++++++++++ 5 files changed, 41 insertions(+), 5 deletions(-) (limited to 'init/main.c') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index e5567e6762f3..2e1b32c0484d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -83,6 +83,7 @@ struct vm_area_struct; #define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ __GFP_HARDWALL | __GFP_HIGHMEM | \ __GFP_MOVABLE) +#define GFP_IOFS (__GFP_IO | __GFP_FS) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) @@ -337,9 +338,7 @@ void drain_local_pages(void *dummy); extern gfp_t gfp_allowed_mask; -static inline void set_gfp_allowed_mask(gfp_t mask) -{ - gfp_allowed_mask = mask; -} +extern void set_gfp_allowed_mask(gfp_t mask); +extern gfp_t clear_gfp_allowed_mask(gfp_t mask); #endif /* __LINUX_GFP_H */ diff --git a/init/main.c b/init/main.c index 40aaa020cd68..41d0f10dbbc7 100644 --- a/init/main.c +++ b/init/main.c @@ -618,7 +618,7 @@ asmlinkage void __init start_kernel(void) local_irq_enable(); /* Interrupts are enabled now so all GFP allocations are safe. */ - set_gfp_allowed_mask(__GFP_BITS_MASK); + gfp_allowed_mask = __GFP_BITS_MASK; kmem_cache_init_late(); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index bbfe472d7524..da5288ec2392 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -323,6 +323,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { int error; + gfp_t saved_mask; error = platform_begin(platform_mode); if (error) @@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode) dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: platform_end(platform_mode); @@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode) int hibernation_restore(int platform_mode) { int error; + gfp_t saved_mask; pm_prepare_console(); suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); dpm_resume_end(PMSG_RECOVER); } + set_gfp_allowed_mask(saved_mask); resume_console(); pm_restore_console(); return error; @@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { int error; + gfp_t saved_mask; if (!hibernation_ops) return -ENOSYS; @@ -481,6 +488,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -518,6 +526,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6f10dfc2d3e9..44cce10b582d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; + gfp_t saved_mask; if (!suspend_ops) return -ENOSYS; @@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: if (suspend_ops->end) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0734bedabd9c..298f307c63a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -76,6 +76,31 @@ unsigned long totalreserve_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +#ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with pm_mutex held (gfp_allowed_mask also should + * only be modified with pm_mutex held, unless the suspend/hibernate code is + * guaranteed not to run in parallel with that modification). + */ +void set_gfp_allowed_mask(gfp_t mask) +{ + WARN_ON(!mutex_is_locked(&pm_mutex)); + gfp_allowed_mask = mask; +} + +gfp_t clear_gfp_allowed_mask(gfp_t mask) +{ + gfp_t ret = gfp_allowed_mask; + + WARN_ON(!mutex_is_locked(&pm_mutex)); + gfp_allowed_mask &= ~mask; + return ret; +} +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; #endif -- cgit v1.2.3 From 9a85b8d6049cbb0e7961df2069322fbc4192026a Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Fri, 5 Mar 2010 13:42:39 -0800 Subject: init/main.c: improve usability in case of init binary failure - new Documentation/init.txt file describing various forms of failure trying to load the init binary after kernel bootup - extend the init/main.c init failure message to direct to Documentation/init.txt Signed-off-by: Andreas Mohr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/init.txt | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ init/main.c | 3 ++- 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 Documentation/init.txt (limited to 'init/main.c') diff --git a/Documentation/init.txt b/Documentation/init.txt new file mode 100644 index 000000000000..535ad5e82b98 --- /dev/null +++ b/Documentation/init.txt @@ -0,0 +1,49 @@ +Explaining the dreaded "No init found." boot hang message +========================================================= + +OK, so you've got this pretty unintuitive message (currently located +in init/main.c) and are wondering what the H*** went wrong. +Some high-level reasons for failure (listed roughly in order of execution) +to load the init binary are: +A) Unable to mount root FS +B) init binary doesn't exist on rootfs +C) broken console device +D) binary exists but dependencies not available +E) binary cannot be loaded + +Detailed explanations: +0) Set "debug" kernel parameter (in bootloader config file or CONFIG_CMDLINE) + to get more detailed kernel messages. +A) make sure you have the correct root FS type + (and root= kernel parameter points to the correct partition), + required drivers such as storage hardware (such as SCSI or USB!) + and filesystem (ext3, jffs2 etc.) are builtin (alternatively as modules, + to be pre-loaded by an initrd) +C) Possibly a conflict in console= setup --> initial console unavailable. + E.g. some serial consoles are unreliable due to serial IRQ issues (e.g. + missing interrupt-based configuration). + Try using a different console= device or e.g. netconsole= . +D) e.g. required library dependencies of the init binary such as + /lib/ld-linux.so.2 missing or broken. Use readelf -d |grep NEEDED + to find out which libraries are required. +E) make sure the binary's architecture matches your hardware. + E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM hardware. + In case you tried loading a non-binary file here (shell script?), + you should make sure that the script specifies an interpreter in its shebang + header line (#!/...) that is fully working (including its library + dependencies). And before tackling scripts, better first test a simple + non-script binary such as /bin/sh and confirm its successful execution. + To find out more, add code to init/main.c to display kernel_execve()s + return values. + +Please extend this explanation whenever you find new failure causes +(after all loading the init binary is a CRITICAL and hard transition step +which needs to be made as painless as possible), then submit patch to LKML. +Further TODOs: +- Implement the various run_init_process() invocations via a struct array + which can then store the kernel_execve() result value and on failure + log it all by iterating over _all_ results (very important usability fix). +- try to make the implementation itself more helpful in general, + e.g. by providing additional error messages at affected places. + +Andreas Mohr diff --git a/init/main.c b/init/main.c index 41d0f10dbbc7..b09a828f38b5 100644 --- a/init/main.c +++ b/init/main.c @@ -847,7 +847,8 @@ static noinline int init_post(void) run_init_process("/bin/init"); run_init_process("/bin/sh"); - panic("No init found. Try passing init= option to kernel."); + panic("No init found. Try passing init= option to kernel. " + "See Linux Documentation/init.txt for guidance."); } static int __init kernel_init(void * unused) -- cgit v1.2.3 From 7463e633c5f94792dcff1afefb0d2961318a9d09 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 5 Mar 2010 13:42:46 -0800 Subject: init/main.c: make setup_max_cpus static for !SMP The only in tree external users of the symbol setup_max_cpus are in arch/x86/. The files ./kernel/alternative.c, ./kernel/visws_quirks.c, and ./mm/kmemcheck/kmemcheck.c are all guarded by CONFIG_SMP being defined. For this case the symbol is an unsigned int and declared as an extern in include/linux/smp.h. When CONFIG_SMP is not defined the symbol setup_max_cpus is a constant value that is only used in init/main.c. Make the symbol static for this case. Signed-off-by: H Hartley Sweeten Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index b09a828f38b5..a1ab78ceb4b6 100644 --- a/init/main.c +++ b/init/main.c @@ -174,7 +174,7 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); #else -const unsigned int setup_max_cpus = NR_CPUS; +static const unsigned int setup_max_cpus = NR_CPUS; #endif /* -- cgit v1.2.3 From 5ab116c9349ef52d6fbd2e2917a53f13194b048e Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 23 Mar 2010 13:35:34 -0700 Subject: cpuset: fix the problem that cpuset_mem_spread_node() returns an offline node cpuset_mem_spread_node() returns an offline node, and causes an oops. This patch fixes it by initializing task->mems_allowed to node_states[N_HIGH_MEMORY], and updating task->mems_allowed when doing memory hotplug. Signed-off-by: Miao Xie Acked-by: David Rientjes Reported-by: Nick Piggin Tested-by: Nick Piggin Cc: Paul Menage Cc: Li Zefan Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 2 +- kernel/cpuset.c | 20 ++++++++++++-------- kernel/kthread.c | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index a1ab78ceb4b6..cbead27caefc 100644 --- a/init/main.c +++ b/init/main.c @@ -858,7 +858,7 @@ static int __init kernel_init(void * unused) /* * init can allocate pages on any node */ - set_mems_allowed(node_possible_map); + set_mems_allowed(node_states[N_HIGH_MEMORY]); /* * init can run on any cpu. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba401fab459f..5d38bd74483c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * call to guarantee_online_mems(), as we know no one is changing * our task's cpuset. * - * Hold callback_mutex around the two modifications of our tasks - * mems_allowed to synchronize with cpuset_mems_allowed(). - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -1391,11 +1388,10 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); - to = node_possible_map; } else { guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(cs, &to); } + guarantee_online_mems(cs, &to); /* do per-task migration stuff possibly for each in the threadgroup */ cpuset_attach_task(tsk, &to, cs); @@ -2090,15 +2086,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { + nodemask_t oldmems; + cgroup_lock(); switch (action) { case MEM_ONLINE: - case MEM_OFFLINE: + oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - if (action == MEM_OFFLINE) - scan_for_empty_cpusets(&top_cpuset); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); + break; + case MEM_OFFLINE: + /* + * needn't update top_cpuset.mems_allowed explicitly because + * scan_for_empty_cpusets() will update it. + */ + scan_for_empty_cpusets(&top_cpuset); break; default: break; diff --git a/kernel/kthread.c b/kernel/kthread.c index 82ed0ea15194..83911c780175 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -219,7 +219,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, cpu_all_mask); - set_mems_allowed(node_possible_map); + set_mems_allowed(node_states[N_HIGH_MEMORY]); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; -- cgit v1.2.3