From c41b93fb8551148a93d3bba870365e8489317f02 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 8 Feb 2011 23:41:35 +0100
Subject: ACPI / PM: Drop acpi_restore_state_mem()

The function acpi_restore_state_mem() has never been and most likely
never will be used, so remove it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 arch/x86/kernel/acpi/sleep.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 68d1537b8c81..c27a483094b1 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -110,14 +110,6 @@ int acpi_save_state_mem(void)
 	return 0;
 }
 
-/*
- * acpi_restore_state - undo effects of acpi_save_state_mem
- */
-void acpi_restore_state_mem(void)
-{
-}
-
-
 /**
  * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
  *
-- 
cgit v1.2.3


From f1a2003e22f6b50ea21f7f4b38b38c5ebc9c8017 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 8 Feb 2011 23:42:22 +0100
Subject: ACPI / PM: Merge do_suspend_lowlevel() into acpi_save_state_mem()

The function do_suspend_lowlevel() is specific to x86 and defined in
assembly code, so it should be called from the x86 low-level suspend
code rather than from acpi_suspend_enter().

Merge do_suspend_lowlevel() into the x86's acpi_save_state_mem() and
change the name of the latter to acpi_suspend_lowlevel(), so that the
function's purpose is better reflected by its name.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 arch/ia64/include/asm/acpi.h | 4 ++--
 arch/ia64/kernel/acpi.c      | 9 ++-------
 arch/x86/include/asm/acpi.h  | 4 ++--
 arch/x86/kernel/acpi/sleep.c | 5 +++--
 arch/x86/kernel/acpi/sleep.h | 2 ++
 drivers/acpi/sleep.c         | 5 +----
 6 files changed, 12 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
index db3986a71af8..a06dfb13d518 100644
--- a/arch/ia64/include/asm/acpi.h
+++ b/arch/ia64/include/asm/acpi.h
@@ -128,8 +128,8 @@ static inline const char *acpi_get_sysname (void)
 int acpi_request_vector (u32 int_type);
 int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
 
-/* Routine for saving kernel state during suspend. */
-extern int acpi_save_state_mem(void);
+/* Low-level suspend routine. */
+extern int acpi_suspend_lowlevel(void);
 
 extern unsigned long acpi_wakeup_address;
 
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 7cf2d10c0677..a54d054ed4b0 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -1034,13 +1034,8 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
 EXPORT_SYMBOL(acpi_unregister_ioapic);
 
 /*
- * acpi_save_state_mem() - save kernel state
+ * acpi_suspend_lowlevel() - save kernel state and suspend.
  *
  * TBD when when IA64 starts to support suspend...
  */
-int acpi_save_state_mem(void) { return 0; } 
-
-/*
- * do_suspend_lowlevel()
- */
-void do_suspend_lowlevel(void) {}
+int acpi_suspend_lowlevel(void) { return 0; }
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 47981f0d121b..aa92684aa674 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -112,8 +112,8 @@ static inline void acpi_disable_pci(void)
 	acpi_noirq_set();
 }
 
-/* Routine for saving kernel state during suspend. */
-extern int acpi_save_state_mem(void);
+/* Low-level suspend routine. */
+extern int acpi_suspend_lowlevel(void);
 
 extern unsigned long acpi_wakeup_address;
 
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index c27a483094b1..5f1b747f6ef1 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -29,14 +29,14 @@ static char temp_stack[4096];
 #endif
 
 /**
- * acpi_save_state_mem - save kernel state
+ * acpi_suspend_lowlevel - save kernel state
  *
  * Create an identity mapped page table and copy the wakeup routine to
  * low memory.
  *
  * Note that this is too late to change acpi_wakeup_address.
  */
-int acpi_save_state_mem(void)
+int acpi_suspend_lowlevel(void)
 {
 	struct wakeup_header *header;
 
@@ -107,6 +107,7 @@ int acpi_save_state_mem(void)
        saved_magic = 0x123456789abcdef0L;
 #endif /* CONFIG_64BIT */
 
+	do_suspend_lowlevel();
 	return 0;
 }
 
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index adbcbaa6f1df..31ce13f20297 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -14,3 +14,5 @@ extern char swsusp_pg_dir[PAGE_SIZE];
 
 extern unsigned long acpi_copy_wakeup_routine(unsigned long);
 extern void wakeup_long64(void);
+
+extern void do_suspend_lowlevel(void);
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 2307604064b3..0a81bf11f473 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -199,8 +199,6 @@ static void acpi_pm_end(void)
 #endif /* CONFIG_ACPI_SLEEP */
 
 #ifdef CONFIG_SUSPEND
-extern void do_suspend_lowlevel(void);
-
 static u32 acpi_suspend_states[] = {
 	[PM_SUSPEND_ON] = ACPI_STATE_S0,
 	[PM_SUSPEND_STANDBY] = ACPI_STATE_S1,
@@ -255,10 +253,9 @@ static int acpi_suspend_enter(suspend_state_t pm_state)
 		break;
 
 	case ACPI_STATE_S3:
-		error = acpi_save_state_mem();
+		error = acpi_suspend_lowlevel();
 		if (error)
 			return error;
-		do_suspend_lowlevel();
 		pr_info(PREFIX "Low-level resume complete\n");
 		break;
 	}
-- 
cgit v1.2.3


From fc66c5210ec2539e800e87d7b3a985323c7be96e Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Sat, 19 Mar 2011 18:20:05 +0100
Subject: perf, x86: Fix Intel fixed counters base initialization

The following patch solves the problems introduced by Robert's
commit 41bf498 and reported by Arun Sharma. This commit gets rid
of the base + index notation for reading and writing PMU msrs.

The problem is that for fixed counters, the new calculation for
the base did not take into account the fixed counter indexes,
thus all fixed counters were read/written from fixed counter 0.
Although all fixed counters share the same config MSR, they each
have their own counter register.

Without:

 $ task -e unhalted_core_cycles -e instructions_retired -e baclears noploop 1 noploop for 1 seconds

  242202299 unhalted_core_cycles (0.00% scaling, ena=1000790892, run=1000790892)
 2389685946 instructions_retired (0.00% scaling, ena=1000790892, run=1000790892)
      49473 baclears             (0.00% scaling, ena=1000790892, run=1000790892)

With:

 $ task -e unhalted_core_cycles -e instructions_retired -e baclears noploop 1 noploop for 1 seconds

 2392703238 unhalted_core_cycles (0.00% scaling, ena=1000840809, run=1000840809)
 2389793744 instructions_retired (0.00% scaling, ena=1000840809, run=1000840809)
      47863 baclears             (0.00% scaling, ena=1000840809, run=1000840809)

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: peterz@infradead.org
Cc: ming.m.lin@intel.com
Cc: robert.richter@amd.com
Cc: asharma@fb.com
Cc: perfmon2-devel@lists.sf.net
LKML-Reference: <20110319172005.GB4978@quad>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e8dbe179587f..ec46eea0c4ed 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -912,7 +912,7 @@ static inline void x86_assign_hw_event(struct perf_event *event,
 		hwc->event_base	= 0;
 	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
 		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0;
+		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
 	} else {
 		hwc->config_base = x86_pmu_config_addr(hwc->idx);
 		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
-- 
cgit v1.2.3


From e5f15b45ddf3afa2bbbb10c7ea34fb32b6de0a0e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 18 Feb 2011 11:30:30 +0000
Subject: x86: Cleanup highmap after brk is concluded

Now cleanup_highmap actually is in two steps: one is early in head64.c
and only clears above _end; a second one is in init_memory_mapping() and
tries to clean from _brk_end to _end.
It should check if those boundaries are PMD_SIZE aligned but currently
does not.
Also init_memory_mapping() is called several times for numa or memory
hotplug, so we really should not handle initial kernel mappings there.

This patch moves cleanup_highmap() down after _brk_end is settled so
we can do everything in one step.
Also we honor max_pfn_mapped in the implementation of cleanup_highmap.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
LKML-Reference: <alpine.DEB.2.00.1103171739050.3382@kaball-desktop>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/head64.c |  3 ---
 arch/x86/kernel/setup.c  | 25 +++----------------------
 arch/x86/mm/init_64.c    | 11 ++++++-----
 3 files changed, 9 insertions(+), 30 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2d2673c28aff..5655c2272adb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -77,9 +77,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	/* Make NULL pointers segfault */
 	zap_identity_mappings();
 
-	/* Cleanup the over mapped high alias */
-	cleanup_highmap();
-
 	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b176f2b1f45d..4a52a5f9afcb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -294,30 +294,11 @@ static void __init init_gbpages(void)
 	else
 		direct_gbpages = 0;
 }
-
-static void __init cleanup_highmap_brk_end(void)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-
-	mmu_cr4_features = read_cr4();
-
-	/*
-	 * _brk_end cannot change anymore, but it and _end may be
-	 * located on different 2M pages. cleanup_highmap(), however,
-	 * can only consider _end when it runs, so destroy any
-	 * mappings beyond _brk_end here.
-	 */
-	pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
-	pmd = pmd_offset(pud, _brk_end - 1);
-	while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
-		pmd_clear(pmd);
-}
 #else
 static inline void init_gbpages(void)
 {
 }
-static inline void cleanup_highmap_brk_end(void)
+static void __init cleanup_highmap(void)
 {
 }
 #endif
@@ -330,8 +311,6 @@ static void __init reserve_brk(void)
 	/* Mark brk area as locked down and no longer taking any
 	   new allocations */
 	_brk_start = 0;
-
-	cleanup_highmap_brk_end();
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -950,6 +929,8 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	reserve_brk();
 
+	cleanup_highmap();
+
 	memblock.current_limit = get_max_mapped();
 	memblock_x86_fill();
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a08a62cb136e..7026505a33ba 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -51,6 +51,7 @@
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
 #include <asm/init.h>
+#include <asm/setup.h>
 
 static int __init parse_direct_gbpages_off(char *arg)
 {
@@ -293,18 +294,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
  * to the compile time generated pmds. This results in invalid pmds up
  * to the point where we hit the physaddr 0 mapping.
  *
- * We limit the mappings to the region from _text to _end.  _end is
- * rounded up to the 2MB boundary. This catches the invalid pmds as
+ * We limit the mappings to the region from _text to _brk_end.  _brk_end
+ * is rounded up to the 2MB boundary. This catches the invalid pmds as
  * well, as they are located before _text:
  */
 void __init cleanup_highmap(void)
 {
 	unsigned long vaddr = __START_KERNEL_map;
-	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
+	unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
 	pmd_t *pmd = level2_kernel_pgt;
-	pmd_t *last_pmd = pmd + PTRS_PER_PMD;
 
-	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
+	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
 		if (pmd_none(*pmd))
 			continue;
 		if (vaddr < (unsigned long) _text || vaddr > end)
-- 
cgit v1.2.3


From b7ed78f56575074f29ec99d8984f347f6c99c914 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 10 Mar 2011 11:31:30 -0800
Subject: introduce sys_syncfs to sync a single file system

It is frequently useful to sync a single file system, instead of all
mounted file systems via sync(2):

 - On machines with many mounts, it is not at all uncommon for some of
   them to hang (e.g. unresponsive NFS server).  sync(2) will get stuck on
   those and may never get to the one you do care about (e.g., /).
 - Some applications write lots of data to the file system and then
   want to make sure it is flushed to disk.  Calling fsync(2) on each
   file introduces unnecessary ordering constraints that result in a large
   amount of sub-optimal writeback/flush/commit behavior by the file
   system.

There are currently two ways (that I know of) to sync a single super_block:

 - BLKFLSBUF ioctl on the block device: That also invalidates the bdev
   mapping, which isn't usually desirable, and doesn't work for non-block
   file systems.
 - 'mount -o remount,rw' will call sync_filesystem as an artifact of the
   current implemention.  Relying on this little-known side effect for
   something like data safety sounds foolish.

Both of these approaches require root privileges, which some applications
do not have (nor should they need?) given that sync(2) is an unprivileged
operation.

This patch introduces a new system call syncfs(2) that takes an fd and
syncs only the file system it references.  Maybe someday we can

 $ sync /some/path

and not get

 sync: ignoring all arguments

The syscall is motivated by comments by Al and Christoph at the last LSF.
syncfs(2) seems like an appropriate name given statfs(2).

A similar ioctl was also proposed a while back, see
	http://marc.info/?l=linux-fsdevel&m=127970513829285&w=2

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/ia32/ia32entry.S          |  1 +
 arch/x86/include/asm/unistd_32.h   |  3 ++-
 arch/x86/include/asm/unistd_64.h   |  2 ++
 arch/x86/kernel/syscall_table_32.S |  1 +
 fs/sync.c                          | 24 ++++++++++++++++++++++++
 include/asm-generic/unistd.h       |  4 +++-
 include/linux/syscalls.h           |  1 +
 7 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 430312ba6e3f..849a9d23c71d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -847,4 +847,5 @@ ia32_sys_call_table:
 	.quad sys_name_to_handle_at
 	.quad compat_sys_open_by_handle_at
 	.quad compat_sys_clock_adjtime
+	.quad sys_syncfs
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index ffaf183c619a..a755ef5e5977 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -349,10 +349,11 @@
 #define __NR_name_to_handle_at	341
 #define __NR_open_by_handle_at  342
 #define __NR_clock_adjtime	343
+#define __NR_syncfs             344
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 344
+#define NR_syscalls 345
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 5466bea670e7..160fa76bd578 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -675,6 +675,8 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
 __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
 #define __NR_clock_adjtime			305
 __SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
+#define __NR_syncfs                             306
+__SYSCALL(__NR_syncfs, sys_syncfs)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 5f181742e8f9..abce34d5c79d 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -343,3 +343,4 @@ ENTRY(sys_call_table)
 	.long sys_name_to_handle_at
 	.long sys_open_by_handle_at
 	.long sys_clock_adjtime
+	.long sys_syncfs
diff --git a/fs/sync.c b/fs/sync.c
index ba76b9623e7e..92ca208777d5 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/namei.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
@@ -128,6 +129,29 @@ void emergency_sync(void)
 	}
 }
 
+/*
+ * sync a single super
+ */
+SYSCALL_DEFINE1(syncfs, int, fd)
+{
+	struct file *file;
+	struct super_block *sb;
+	int ret;
+	int fput_needed;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+	sb = file->f_dentry->d_sb;
+
+	down_read(&sb->s_umount);
+	ret = sync_filesystem(sb);
+	up_read(&sb->s_umount);
+
+	fput_light(file, fput_needed);
+	return ret;
+}
+
 /**
  * vfs_fsync_range - helper to sync a range of data & metadata to disk
  * @file:		file to sync
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index d94f447c667a..176b825add52 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -652,9 +652,11 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
 __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
 #define __NR_clock_adjtime 266
 __SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
+#define __NR_syncfs 264
+__SYSCALL(__NR_syncfs, sys_syncfs)
 
 #undef __NR_syscalls
-#define __NR_syscalls 267
+#define __NR_syscalls 268
 
 /*
  * All syscalls below here should go away really,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1f5c18e6f4f1..83ecc1749ef6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -825,6 +825,7 @@ asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags
 asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
 				  u64 mask, int fd,
 				  const char  __user *pathname);
+asmlinkage long sys_syncfs(int fd);
 
 int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]);
 
-- 
cgit v1.2.3


From 885b976fada5bc6595a9fd3e67e3cb1a3d11f50b Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 21 Feb 2011 13:54:41 +0800
Subject: ACPI, APEI, Add ERST record ID cache

APEI ERST firmware interface and implementation has no multiple users
in mind.  For example, if there is four records in storage with ID: 1,
2, 3 and 4, if two ERST readers enumerate the records via
GET_NEXT_RECORD_ID as follow,

reader 1		reader 2
1
			2
3
			4
-1
			-1

where -1 signals there is no more record ID.

Reader 1 has no chance to check record 2 and 4, while reader 2 has no
chance to check record 1 and 3.  And any other GET_NEXT_RECORD_ID will
return -1, that is, other readers will has no chance to check any
record even they are not cleared by anyone.

This makes raw GET_NEXT_RECORD_ID not suitable for used by multiple
users.

To solve the issue, an in-memory ERST record ID cache is designed and
implemented.  When enumerating record ID, the ID returned by
GET_NEXT_RECORD_ID is added into cache in addition to be returned to
caller.  So other readers can check the cache to get all record ID
available.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-apei.c |  42 +++---
 drivers/acpi/apei/erst-dbg.c          |  24 +++-
 drivers/acpi/apei/erst.c              | 235 +++++++++++++++++++++++++++-------
 include/acpi/apei.h                   |   5 +-
 4 files changed, 241 insertions(+), 65 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 8209472b27a5..83930deec3c6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m)
 ssize_t apei_read_mce(struct mce *m, u64 *record_id)
 {
 	struct cper_mce_record rcd;
-	ssize_t len;
-
-	len = erst_read_next(&rcd.hdr, sizeof(rcd));
-	if (len <= 0)
-		return len;
-	/* Can not skip other records in storage via ERST unless clear them */
-	else if (len != sizeof(rcd) ||
-		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
-		if (printk_ratelimit())
-			pr_warning(
-			"MCE-APEI: Can not skip the unknown record in ERST");
-		return -EIO;
-	}
-
+	int rc, pos;
+
+	rc = erst_get_record_id_begin(&pos);
+	if (rc)
+		return rc;
+retry:
+	rc = erst_get_record_id_next(&pos, record_id);
+	if (rc)
+		goto out;
+	/* no more record */
+	if (*record_id == APEI_ERST_INVALID_RECORD_ID)
+		goto out;
+	rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
+	/* someone else has cleared the record, try next one */
+	if (rc == -ENOENT)
+		goto retry;
+	else if (rc < 0)
+		goto out;
+	/* try to skip other type records in storage */
+	else if (rc != sizeof(rcd) ||
+		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
+		goto retry;
 	memcpy(m, &rcd.mce, sizeof(*m));
-	*record_id = rcd.hdr.record_id;
+	rc = sizeof(*m);
+out:
+	erst_get_record_id_end();
 
-	return sizeof(*m);
+	return rc;
 }
 
 /* Check whether there is record in ERST */
diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c
index de73caf3cebc..a4cfb64c86a1 100644
--- a/drivers/acpi/apei/erst-dbg.c
+++ b/drivers/acpi/apei/erst-dbg.c
@@ -43,12 +43,27 @@ static DEFINE_MUTEX(erst_dbg_mutex);
 
 static int erst_dbg_open(struct inode *inode, struct file *file)
 {
+	int rc, *pos;
+
 	if (erst_disable)
 		return -ENODEV;
 
+	pos = (int *)&file->private_data;
+
+	rc = erst_get_record_id_begin(pos);
+	if (rc)
+		return rc;
+
 	return nonseekable_open(inode, file);
 }
 
+static int erst_dbg_release(struct inode *inode, struct file *file)
+{
+	erst_get_record_id_end();
+
+	return 0;
+}
+
 static long erst_dbg_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 {
 	int rc;
@@ -79,18 +94,20 @@ static long erst_dbg_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 static ssize_t erst_dbg_read(struct file *filp, char __user *ubuf,
 			     size_t usize, loff_t *off)
 {
-	int rc;
+	int rc, *pos;
 	ssize_t len = 0;
 	u64 id;
 
-	if (*off != 0)
+	if (*off)
 		return -EINVAL;
 
 	if (mutex_lock_interruptible(&erst_dbg_mutex) != 0)
 		return -EINTR;
 
+	pos = (int *)&filp->private_data;
+
 retry_next:
-	rc = erst_get_next_record_id(&id);
+	rc = erst_get_record_id_next(pos, &id);
 	if (rc)
 		goto out;
 	/* no more record */
@@ -181,6 +198,7 @@ out:
 static const struct file_operations erst_dbg_ops = {
 	.owner		= THIS_MODULE,
 	.open		= erst_dbg_open,
+	.release	= erst_dbg_release,
 	.read		= erst_dbg_read,
 	.write		= erst_dbg_write,
 	.unlocked_ioctl	= erst_dbg_ioctl,
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index cf6db6b7662a..8ff8c32fef58 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -429,6 +429,22 @@ ssize_t erst_get_record_count(void)
 }
 EXPORT_SYMBOL_GPL(erst_get_record_count);
 
+#define ERST_RECORD_ID_CACHE_SIZE_MIN	16
+#define ERST_RECORD_ID_CACHE_SIZE_MAX	1024
+
+struct erst_record_id_cache {
+	struct mutex lock;
+	u64 *entries;
+	int len;
+	int size;
+	int refcount;
+};
+
+static struct erst_record_id_cache erst_record_id_cache = {
+	.lock = __MUTEX_INITIALIZER(erst_record_id_cache.lock),
+	.refcount = 0,
+};
+
 static int __erst_get_next_record_id(u64 *record_id)
 {
 	struct apei_exec_context ctx;
@@ -443,26 +459,179 @@ static int __erst_get_next_record_id(u64 *record_id)
 	return 0;
 }
 
+int erst_get_record_id_begin(int *pos)
+{
+	int rc;
+
+	if (erst_disable)
+		return -ENODEV;
+
+	rc = mutex_lock_interruptible(&erst_record_id_cache.lock);
+	if (rc)
+		return rc;
+	erst_record_id_cache.refcount++;
+	mutex_unlock(&erst_record_id_cache.lock);
+
+	*pos = 0;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(erst_get_record_id_begin);
+
+/* erst_record_id_cache.lock must be held by caller */
+static int __erst_record_id_cache_add_one(void)
+{
+	u64 id, prev_id, first_id;
+	int i, rc;
+	u64 *entries;
+	unsigned long flags;
+
+	id = prev_id = first_id = APEI_ERST_INVALID_RECORD_ID;
+retry:
+	raw_spin_lock_irqsave(&erst_lock, flags);
+	rc = __erst_get_next_record_id(&id);
+	raw_spin_unlock_irqrestore(&erst_lock, flags);
+	if (rc == -ENOENT)
+		return 0;
+	if (rc)
+		return rc;
+	if (id == APEI_ERST_INVALID_RECORD_ID)
+		return 0;
+	/* can not skip current ID, or loop back to first ID */
+	if (id == prev_id || id == first_id)
+		return 0;
+	if (first_id == APEI_ERST_INVALID_RECORD_ID)
+		first_id = id;
+	prev_id = id;
+
+	entries = erst_record_id_cache.entries;
+	for (i = 0; i < erst_record_id_cache.len; i++) {
+		if (entries[i] == id)
+			break;
+	}
+	/* record id already in cache, try next */
+	if (i < erst_record_id_cache.len)
+		goto retry;
+	if (erst_record_id_cache.len >= erst_record_id_cache.size) {
+		int new_size, alloc_size;
+		u64 *new_entries;
+
+		new_size = erst_record_id_cache.size * 2;
+		new_size = clamp_val(new_size, ERST_RECORD_ID_CACHE_SIZE_MIN,
+				     ERST_RECORD_ID_CACHE_SIZE_MAX);
+		if (new_size <= erst_record_id_cache.size) {
+			if (printk_ratelimit())
+				pr_warning(FW_WARN ERST_PFX
+					   "too many record ID!\n");
+			return 0;
+		}
+		alloc_size = new_size * sizeof(entries[0]);
+		if (alloc_size < PAGE_SIZE)
+			new_entries = kmalloc(alloc_size, GFP_KERNEL);
+		else
+			new_entries = vmalloc(alloc_size);
+		if (!new_entries)
+			return -ENOMEM;
+		memcpy(new_entries, entries,
+		       erst_record_id_cache.len * sizeof(entries[0]));
+		if (erst_record_id_cache.size < PAGE_SIZE)
+			kfree(entries);
+		else
+			vfree(entries);
+		erst_record_id_cache.entries = entries = new_entries;
+		erst_record_id_cache.size = new_size;
+	}
+	entries[i] = id;
+	erst_record_id_cache.len++;
+
+	return 1;
+}
+
 /*
  * Get the record ID of an existing error record on the persistent
  * storage. If there is no error record on the persistent storage, the
  * returned record_id is APEI_ERST_INVALID_RECORD_ID.
  */
-int erst_get_next_record_id(u64 *record_id)
+int erst_get_record_id_next(int *pos, u64 *record_id)
 {
-	int rc;
-	unsigned long flags;
+	int rc = 0;
+	u64 *entries;
 
 	if (erst_disable)
 		return -ENODEV;
 
-	raw_spin_lock_irqsave(&erst_lock, flags);
-	rc = __erst_get_next_record_id(record_id);
-	raw_spin_unlock_irqrestore(&erst_lock, flags);
+	/* must be enclosed by erst_get_record_id_begin/end */
+	BUG_ON(!erst_record_id_cache.refcount);
+	BUG_ON(*pos < 0 || *pos > erst_record_id_cache.len);
+
+	mutex_lock(&erst_record_id_cache.lock);
+	entries = erst_record_id_cache.entries;
+	for (; *pos < erst_record_id_cache.len; (*pos)++)
+		if (entries[*pos] != APEI_ERST_INVALID_RECORD_ID)
+			break;
+	/* found next record id in cache */
+	if (*pos < erst_record_id_cache.len) {
+		*record_id = entries[*pos];
+		(*pos)++;
+		goto out_unlock;
+	}
+
+	/* Try to add one more record ID to cache */
+	rc = __erst_record_id_cache_add_one();
+	if (rc < 0)
+		goto out_unlock;
+	/* successfully add one new ID */
+	if (rc == 1) {
+		*record_id = erst_record_id_cache.entries[*pos];
+		(*pos)++;
+		rc = 0;
+	} else {
+		*pos = -1;
+		*record_id = APEI_ERST_INVALID_RECORD_ID;
+	}
+out_unlock:
+	mutex_unlock(&erst_record_id_cache.lock);
 
 	return rc;
 }
-EXPORT_SYMBOL_GPL(erst_get_next_record_id);
+EXPORT_SYMBOL_GPL(erst_get_record_id_next);
+
+/* erst_record_id_cache.lock must be held by caller */
+static void __erst_record_id_cache_compact(void)
+{
+	int i, wpos = 0;
+	u64 *entries;
+
+	if (erst_record_id_cache.refcount)
+		return;
+
+	entries = erst_record_id_cache.entries;
+	for (i = 0; i < erst_record_id_cache.len; i++) {
+		if (entries[i] == APEI_ERST_INVALID_RECORD_ID)
+			continue;
+		if (wpos != i)
+			memcpy(&entries[wpos], &entries[i], sizeof(entries[i]));
+		wpos++;
+	}
+	erst_record_id_cache.len = wpos;
+}
+
+void erst_get_record_id_end(void)
+{
+	/*
+	 * erst_disable != 0 should be detected by invoker via the
+	 * return value of erst_get_record_id_begin/next, so this
+	 * function should not be called for erst_disable != 0.
+	 */
+	BUG_ON(erst_disable);
+
+	mutex_lock(&erst_record_id_cache.lock);
+	erst_record_id_cache.refcount--;
+	BUG_ON(erst_record_id_cache.refcount < 0);
+	__erst_record_id_cache_compact();
+	mutex_unlock(&erst_record_id_cache.lock);
+}
+EXPORT_SYMBOL_GPL(erst_get_record_id_end);
 
 static int __erst_write_to_storage(u64 offset)
 {
@@ -703,56 +872,34 @@ ssize_t erst_read(u64 record_id, struct cper_record_header *record,
 }
 EXPORT_SYMBOL_GPL(erst_read);
 
-/*
- * If return value > buflen, the buffer size is not big enough,
- * else if return value = 0, there is no more record to read,
- * else if return value < 0, something goes wrong,
- * else everything is OK, and return value is record length
- */
-ssize_t erst_read_next(struct cper_record_header *record, size_t buflen)
-{
-	int rc;
-	ssize_t len;
-	unsigned long flags;
-	u64 record_id;
-
-	if (erst_disable)
-		return -ENODEV;
-
-	raw_spin_lock_irqsave(&erst_lock, flags);
-	rc = __erst_get_next_record_id(&record_id);
-	if (rc) {
-		raw_spin_unlock_irqrestore(&erst_lock, flags);
-		return rc;
-	}
-	/* no more record */
-	if (record_id == APEI_ERST_INVALID_RECORD_ID) {
-		raw_spin_unlock_irqrestore(&erst_lock, flags);
-		return 0;
-	}
-
-	len = __erst_read(record_id, record, buflen);
-	raw_spin_unlock_irqrestore(&erst_lock, flags);
-
-	return len;
-}
-EXPORT_SYMBOL_GPL(erst_read_next);
-
 int erst_clear(u64 record_id)
 {
-	int rc;
+	int rc, i;
 	unsigned long flags;
+	u64 *entries;
 
 	if (erst_disable)
 		return -ENODEV;
 
+	rc = mutex_lock_interruptible(&erst_record_id_cache.lock);
+	if (rc)
+		return rc;
 	raw_spin_lock_irqsave(&erst_lock, flags);
 	if (erst_erange.attr & ERST_RANGE_NVRAM)
 		rc = __erst_clear_from_nvram(record_id);
 	else
 		rc = __erst_clear_from_storage(record_id);
 	raw_spin_unlock_irqrestore(&erst_lock, flags);
-
+	if (rc)
+		goto out;
+	entries = erst_record_id_cache.entries;
+	for (i = 0; i < erst_record_id_cache.len; i++) {
+		if (entries[i] == record_id)
+			entries[i] = APEI_ERST_INVALID_RECORD_ID;
+	}
+	__erst_record_id_cache_compact();
+out:
+	mutex_unlock(&erst_record_id_cache.lock);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(erst_clear);
diff --git a/include/acpi/apei.h b/include/acpi/apei.h
index c4dbb132d902..e67b523a50e1 100644
--- a/include/acpi/apei.h
+++ b/include/acpi/apei.h
@@ -30,10 +30,11 @@ int apei_hest_parse(apei_hest_func_t func, void *data);
 
 int erst_write(const struct cper_record_header *record);
 ssize_t erst_get_record_count(void);
-int erst_get_next_record_id(u64 *record_id);
+int erst_get_record_id_begin(int *pos);
+int erst_get_record_id_next(int *pos, u64 *record_id);
+void erst_get_record_id_end(void);
 ssize_t erst_read(u64 record_id, struct cper_record_header *record,
 		  size_t buflen);
-ssize_t erst_read_next(struct cper_record_header *record, size_t buflen);
 int erst_clear(u64 record_id);
 
 #endif
-- 
cgit v1.2.3


From cbb84c4cc1ad0ab8faaffd899ccc9b14a88c91be Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Tue, 22 Mar 2011 15:24:54 +0600
Subject: x86, mpparse: Move check_slot into CONFIG_X86_IO_APIC context

When CONFIG_X86_MPPARSE=y and CONFIG_X86_IO_APIC=n, then we get
the following warning:

  arch/x86/kernel/mpparse.c:723: warning: 'check_slot' defined but not used

So, put check_slot into CONFIG_X86_IO_APIC context. Its only
called from CONFIG_X86_IO_APIC=y context.

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
LKML-Reference: <AANLkTinsUfGc=NG_GeH_B+zFVu+DXJzZbJKdQLscqfuH@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 6f789a887c06..5a532ce646bf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -714,10 +714,6 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
 		*nr_m_spare += 1;
 	}
 }
-#else /* CONFIG_X86_IO_APIC */
-static
-inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
-#endif /* CONFIG_X86_IO_APIC */
 
 static int
 check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
@@ -731,6 +727,10 @@ check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
 
 	return ret;
 }
+#else /* CONFIG_X86_IO_APIC */
+static
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
 
 static int  __init replace_intsrc_all(struct mpc_table *mpc,
 					unsigned long mpc_new_phys,
-- 
cgit v1.2.3


From d404ab0a1133e95557bb7deab2a49b348dfeba85 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Tue, 22 Mar 2011 16:34:04 -0700
Subject: move x86 specific oops=panic to generic code

The oops=panic cmdline option is not x86 specific, move it to generic code.
Update documentation.

Signed-off-by: Olaf Hering <olaf@aepfle.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kernel-parameters.txt       |  5 +++++
 Documentation/x86/x86_64/boot-options.txt |  5 -----
 arch/x86/kernel/dumpstack.c               | 10 ----------
 kernel/panic.c                            | 10 ++++++++++
 4 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index d18a9e12152a..a3b688287a8d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1825,6 +1825,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 				perfmon on Intel CPUs instead of the
 				CPU specific event set.
 
+	oops=panic	Always panic on oopses. Default is to just kill the process,
+			but there is a small probability of deadlocking the machine.
+			This will also cause panics on machine check exceptions.
+			Useful together with panic=30 to trigger a reboot.
+
 	OSS		[HW,OSS]
 			See Documentation/sound/oss/oss-parameters.txt
 
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 48c13b8ab90c..092e596a1301 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -293,11 +293,6 @@ IOMMU (input/output memory management unit)
 
 Debugging
 
-  oops=panic	Always panic on oopses. Default is to just kill the process,
-		but there is a small probability of deadlocking the machine.
-		This will also cause panics on machine check exceptions.
-		Useful together with panic=30 to trigger a reboot.
-
   kstack=N	Print N words from the kernel stack in oops dumps.
 
   pagefaulttrace  Dump all page faults. Only useful for extreme debugging
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 999e2793590b..81ac6c78c01c 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -322,16 +322,6 @@ void die(const char *str, struct pt_regs *regs, long err)
 	oops_end(flags, regs, sig);
 }
 
-static int __init oops_setup(char *s)
-{
-	if (!s)
-		return -EINVAL;
-	if (!strcmp(s, "panic"))
-		panic_on_oops = 1;
-	return 0;
-}
-early_param("oops", oops_setup);
-
 static int __init kstack_setup(char *s)
 {
 	if (!s)
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a1704..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
 
 core_param(panic, panic_timeout, int, 0644);
 core_param(pause_on_oops, pause_on_oops, int, 0644);
+
+static int __init oops_setup(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	if (!strcmp(s, "panic"))
+		panic_on_oops = 1;
+	return 0;
+}
+early_param("oops", oops_setup);
-- 
cgit v1.2.3


From 4061d68e1a59ffacdf216a55f254cb01d4d1f840 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 22 Mar 2011 16:34:57 -0700
Subject: x86: only compile 8237A if CONFIG_ISA_DMA_API is enabled

8237A utilizes the interface provided by CONFIG_ISA_DMA_API, specifically
claim_dma_lock() and release_dma_lock().  Thus, there's a strict
dependency on the config option and the module should only be loaded if
the kernel supports ISA-style DMA.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 743642f1a36c..7338ef2218bc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -41,7 +41,7 @@ obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
-obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
+obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
@@ -55,6 +55,7 @@ obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
 obj-$(CONFIG_INTEL_TXT)		+= tboot.o
+obj-$(CONFIG_ISA_DMA_API)	+= i8237.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-y				+= cpu/
 obj-y				+= acpi/
-- 
cgit v1.2.3


From 375906f8765e131a4a159b1ffebf78c15db7b3bf Mon Sep 17 00:00:00 2001
From: Stephen Wilson <wilsons@start.ca>
Date: Sun, 13 Mar 2011 15:49:14 -0400
Subject: x86: mark associated mm when running a task in 32 bit compatibility
 mode

This patch simply follows the same practice as for setting the TIF_IA32 flag.
In particular, an mm is marked as holding 32-bit tasks when a 32-bit binary is
exec'ed.  Both ELF and a.out formats are updated.

Signed-off-by: Stephen Wilson <wilsons@start.ca>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/ia32/ia32_aout.c    | 1 +
 arch/x86/kernel/process_64.c | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 2d93bdbc9ac0..fd843877e841 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -298,6 +298,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	/* OK, This is the point of no return */
 	set_personality(PER_LINUX);
 	set_thread_flag(TIF_IA32);
+	current->mm->context.ia32_compat = 1;
 
 	setup_new_exec(bprm);
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bd387e8f73b4..6c9dd922ac0d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -501,6 +501,10 @@ void set_personality_64bit(void)
 	/* Make sure to be in 64bit mode */
 	clear_thread_flag(TIF_IA32);
 
+	/* Ensure the corresponding mm is not marked. */
+	if (current->mm)
+		current->mm->context.ia32_compat = 0;
+
 	/* TBD: overwrites user setup. Should have two bits.
 	   But 64bit processes have always behaved this way,
 	   so it's not too bad. The main problem is just that
@@ -516,6 +520,10 @@ void set_personality_ia32(void)
 	set_thread_flag(TIF_IA32);
 	current->personality |= force_personality32;
 
+	/* Mark the associated mm as containing 32-bit tasks. */
+	if (current->mm)
+		current->mm->context.ia32_compat = 1;
+
 	/* Prepare the first "return" to user space */
 	current_thread_info()->status |= TS_COMPAT;
 }
-- 
cgit v1.2.3


From f3c6ea1b06c71b43f751b36bd99345369fe911af Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 23 Mar 2011 22:15:54 +0100
Subject: x86: Use syscore_ops instead of sysdev classes and sysdevs

Some subsystems in the x86 tree need to carry out suspend/resume and
shutdown operations with one CPU on-line and interrupts disabled and
they define sysdev classes and sysdevs or sysdev drivers for this
purpose.  This leads to unnecessarily complicated code and excessive
memory usage, so switch them to using struct syscore_ops objects for
this purpose instead.

Generally, there are three categories of subsystems that use
sysdevs for implementing PM operations: (1) subsystems whose
suspend/resume callbacks ignore their arguments entirely (the
majority), (2) subsystems whose suspend/resume callbacks use their
struct sys_device argument, but don't really need to do that,
because they can be implemented differently in an arguably simpler
way (io_apic.c), and (3) subsystems whose suspend/resume callbacks
use their struct sys_device argument, but the value of that argument
is always the same and could be ignored (microcode_core.c).  In all
of these cases the subsystems in question may be readily converted to
using struct syscore_ops objects for power management and shutdown.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 26 +++--------
 arch/x86/kernel/apic/apic.c      | 33 ++++----------
 arch/x86/kernel/apic/io_apic.c   | 97 +++++++++++++++++++---------------------
 arch/x86/kernel/cpu/mcheck/mce.c | 21 +++++----
 arch/x86/kernel/cpu/mtrr/main.c  | 10 ++---
 arch/x86/kernel/i8237.c          | 30 +++----------
 arch/x86/kernel/i8259.c          | 33 +++++---------
 arch/x86/kernel/microcode_core.c | 34 ++++++--------
 arch/x86/kernel/pci-gart_64.c    | 32 +++----------
 arch/x86/oprofile/nmi_int.c      | 44 +++++-------------
 10 files changed, 128 insertions(+), 232 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6e11c8134158..246d727b65b7 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -21,7 +21,7 @@
 #include <linux/acpi.h>
 #include <linux/list.h>
 #include <linux/slab.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 #include <linux/interrupt.h>
 #include <linux/msi.h>
 #include <asm/pci-direct.h>
@@ -1260,7 +1260,7 @@ static void disable_iommus(void)
  * disable suspend until real resume implemented
  */
 
-static int amd_iommu_resume(struct sys_device *dev)
+static void amd_iommu_resume(void)
 {
 	struct amd_iommu *iommu;
 
@@ -1276,11 +1276,9 @@ static int amd_iommu_resume(struct sys_device *dev)
 	 */
 	amd_iommu_flush_all_devices();
 	amd_iommu_flush_all_domains();
-
-	return 0;
 }
 
-static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
+static int amd_iommu_suspend(void)
 {
 	/* disable IOMMUs to go out of the way for BIOS */
 	disable_iommus();
@@ -1288,17 +1286,11 @@ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
 	return 0;
 }
 
-static struct sysdev_class amd_iommu_sysdev_class = {
-	.name = "amd_iommu",
+static struct syscore_ops amd_iommu_syscore_ops = {
 	.suspend = amd_iommu_suspend,
 	.resume = amd_iommu_resume,
 };
 
-static struct sys_device device_amd_iommu = {
-	.id = 0,
-	.cls = &amd_iommu_sysdev_class,
-};
-
 /*
  * This is the core init function for AMD IOMMU hardware in the system.
  * This function is called from the generic x86 DMA layer initialization
@@ -1415,14 +1407,6 @@ static int __init amd_iommu_init(void)
 		goto free;
 	}
 
-	ret = sysdev_class_register(&amd_iommu_sysdev_class);
-	if (ret)
-		goto free;
-
-	ret = sysdev_register(&device_amd_iommu);
-	if (ret)
-		goto free;
-
 	ret = amd_iommu_init_devices();
 	if (ret)
 		goto free;
@@ -1441,6 +1425,8 @@ static int __init amd_iommu_init(void)
 
 	amd_iommu_init_notifier();
 
+	register_syscore_ops(&amd_iommu_syscore_ops);
+
 	if (iommu_pass_through)
 		goto out;
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 966673f44141..fabf01eff771 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -24,7 +24,7 @@
 #include <linux/ftrace.h>
 #include <linux/ioport.h>
 #include <linux/module.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 #include <linux/delay.h>
 #include <linux/timex.h>
 #include <linux/dmar.h>
@@ -2046,7 +2046,7 @@ static struct {
 	unsigned int apic_thmr;
 } apic_pm_state;
 
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
+static int lapic_suspend(void)
 {
 	unsigned long flags;
 	int maxlvt;
@@ -2084,23 +2084,21 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 	return 0;
 }
 
-static int lapic_resume(struct sys_device *dev)
+static void lapic_resume(void)
 {
 	unsigned int l, h;
 	unsigned long flags;
-	int maxlvt;
-	int ret = 0;
+	int maxlvt, ret;
 	struct IO_APIC_route_entry **ioapic_entries = NULL;
 
 	if (!apic_pm_state.active)
-		return 0;
+		return;
 
 	local_irq_save(flags);
 	if (intr_remapping_enabled) {
 		ioapic_entries = alloc_ioapic_entries();
 		if (!ioapic_entries) {
 			WARN(1, "Alloc ioapic_entries in lapic resume failed.");
-			ret = -ENOMEM;
 			goto restore;
 		}
 
@@ -2162,8 +2160,6 @@ static int lapic_resume(struct sys_device *dev)
 	}
 restore:
 	local_irq_restore(flags);
-
-	return ret;
 }
 
 /*
@@ -2171,17 +2167,11 @@ restore:
  * are needed on every CPU up until machine_halt/restart/poweroff.
  */
 
-static struct sysdev_class lapic_sysclass = {
-	.name		= "lapic",
+static struct syscore_ops lapic_syscore_ops = {
 	.resume		= lapic_resume,
 	.suspend	= lapic_suspend,
 };
 
-static struct sys_device device_lapic = {
-	.id	= 0,
-	.cls	= &lapic_sysclass,
-};
-
 static void __cpuinit apic_pm_activate(void)
 {
 	apic_pm_state.active = 1;
@@ -2189,16 +2179,11 @@ static void __cpuinit apic_pm_activate(void)
 
 static int __init init_lapic_sysfs(void)
 {
-	int error;
-
-	if (!cpu_has_apic)
-		return 0;
 	/* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+	if (cpu_has_apic)
+		register_syscore_ops(&lapic_syscore_ops);
 
-	error = sysdev_class_register(&lapic_sysclass);
-	if (!error)
-		error = sysdev_register(&device_lapic);
-	return error;
+	return 0;
 }
 
 /* local apic needs to resume before other devices access its registers. */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 180ca240e03c..68df09bba92e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -30,7 +30,7 @@
 #include <linux/compiler.h>
 #include <linux/acpi.h>
 #include <linux/module.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 #include <linux/msi.h>
 #include <linux/htirq.h>
 #include <linux/freezer.h>
@@ -2918,89 +2918,84 @@ static int __init io_apic_bug_finalize(void)
 
 late_initcall(io_apic_bug_finalize);
 
-struct sysfs_ioapic_data {
-	struct sys_device dev;
-	struct IO_APIC_route_entry entry[0];
-};
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS];
 
-static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
+static void suspend_ioapic(int ioapic_id)
 {
-	struct IO_APIC_route_entry *entry;
-	struct sysfs_ioapic_data *data;
+	struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
 	int i;
 
-	data = container_of(dev, struct sysfs_ioapic_data, dev);
-	entry = data->entry;
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
-		*entry = ioapic_read_entry(dev->id, i);
+	if (!saved_data)
+		return;
+
+	for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
+		saved_data[i] = ioapic_read_entry(ioapic_id, i);
+}
+
+static int ioapic_suspend(void)
+{
+	int ioapic_id;
+
+	for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++)
+		suspend_ioapic(ioapic_id);
 
 	return 0;
 }
 
-static int ioapic_resume(struct sys_device *dev)
+static void resume_ioapic(int ioapic_id)
 {
-	struct IO_APIC_route_entry *entry;
-	struct sysfs_ioapic_data *data;
+	struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
 	unsigned long flags;
 	union IO_APIC_reg_00 reg_00;
 	int i;
 
-	data = container_of(dev, struct sysfs_ioapic_data, dev);
-	entry = data->entry;
+	if (!saved_data)
+		return;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	reg_00.raw = io_apic_read(dev->id, 0);
-	if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
-		reg_00.bits.ID = mp_ioapics[dev->id].apicid;
-		io_apic_write(dev->id, 0, reg_00.raw);
+	reg_00.raw = io_apic_read(ioapic_id, 0);
+	if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) {
+		reg_00.bits.ID = mp_ioapics[ioapic_id].apicid;
+		io_apic_write(ioapic_id, 0, reg_00.raw);
 	}
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
-		ioapic_write_entry(dev->id, i, entry[i]);
+	for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
+		ioapic_write_entry(ioapic_id, i, saved_data[i]);
+}
 
-	return 0;
+static void ioapic_resume(void)
+{
+	int ioapic_id;
+
+	for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
+		resume_ioapic(ioapic_id);
 }
 
-static struct sysdev_class ioapic_sysdev_class = {
-	.name = "ioapic",
+static struct syscore_ops ioapic_syscore_ops = {
 	.suspend = ioapic_suspend,
 	.resume = ioapic_resume,
 };
 
-static int __init ioapic_init_sysfs(void)
+static int __init ioapic_init_ops(void)
 {
-	struct sys_device * dev;
-	int i, size, error;
+	int i;
 
-	error = sysdev_class_register(&ioapic_sysdev_class);
-	if (error)
-		return error;
+	for (i = 0; i < nr_ioapics; i++) {
+		unsigned int size;
 
-	for (i = 0; i < nr_ioapics; i++ ) {
-		size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+		size = nr_ioapic_registers[i]
 			* sizeof(struct IO_APIC_route_entry);
-		mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
-		if (!mp_ioapic_data[i]) {
-			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
-			continue;
-		}
-		dev = &mp_ioapic_data[i]->dev;
-		dev->id = i;
-		dev->cls = &ioapic_sysdev_class;
-		error = sysdev_register(dev);
-		if (error) {
-			kfree(mp_ioapic_data[i]);
-			mp_ioapic_data[i] = NULL;
-			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
-			continue;
-		}
+		ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL);
+		if (!ioapic_saved_data[i])
+			pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
 	}
 
+	register_syscore_ops(&ioapic_syscore_ops);
+
 	return 0;
 }
 
-device_initcall(ioapic_init_sysfs);
+device_initcall(ioapic_init_ops);
 
 /*
  * Dynamic irq allocate and deallocation
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ab1122998dba..5a05ef63eb4a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
 #include <linux/percpu.h>
 #include <linux/string.h>
 #include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
@@ -1749,14 +1750,14 @@ static int mce_disable_error_reporting(void)
 	return 0;
 }
 
-static int mce_suspend(struct sys_device *dev, pm_message_t state)
+static int mce_suspend(void)
 {
 	return mce_disable_error_reporting();
 }
 
-static int mce_shutdown(struct sys_device *dev)
+static void mce_shutdown(void)
 {
-	return mce_disable_error_reporting();
+	mce_disable_error_reporting();
 }
 
 /*
@@ -1764,14 +1765,18 @@ static int mce_shutdown(struct sys_device *dev)
  * Only one CPU is active at this time, the others get re-added later using
  * CPU hotplug:
  */
-static int mce_resume(struct sys_device *dev)
+static void mce_resume(void)
 {
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
-
-	return 0;
 }
 
+static struct syscore_ops mce_syscore_ops = {
+	.suspend	= mce_suspend,
+	.shutdown	= mce_shutdown,
+	.resume		= mce_resume,
+};
+
 static void mce_cpu_restart(void *data)
 {
 	del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1808,9 +1813,6 @@ static void mce_enable_ce(void *all)
 }
 
 static struct sysdev_class mce_sysclass = {
-	.suspend	= mce_suspend,
-	.shutdown	= mce_shutdown,
-	.resume		= mce_resume,
 	.name		= "machinecheck",
 };
 
@@ -2139,6 +2141,7 @@ static __init int mcheck_init_device(void)
 			return err;
 	}
 
+	register_syscore_ops(&mce_syscore_ops);
 	register_hotcpu_notifier(&mce_cpu_notifier);
 	misc_register(&mce_log_device);
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index bebabec5b448..307dfbbf4a8e 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -45,6 +45,7 @@
 #include <linux/cpu.h>
 #include <linux/pci.h>
 #include <linux/smp.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/processor.h>
 #include <asm/e820.h>
@@ -630,7 +631,7 @@ struct mtrr_value {
 
 static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
 
-static int mtrr_save(struct sys_device *sysdev, pm_message_t state)
+static int mtrr_save(void)
 {
 	int i;
 
@@ -642,7 +643,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state)
 	return 0;
 }
 
-static int mtrr_restore(struct sys_device *sysdev)
+static void mtrr_restore(void)
 {
 	int i;
 
@@ -653,12 +654,11 @@ static int mtrr_restore(struct sys_device *sysdev)
 				    mtrr_value[i].ltype);
 		}
 	}
-	return 0;
 }
 
 
-static struct sysdev_driver mtrr_sysdev_driver = {
+static struct syscore_ops mtrr_syscore_ops = {
 	.suspend	= mtrr_save,
 	.resume		= mtrr_restore,
 };
@@ -839,7 +839,7 @@ static int __init mtrr_init_finialize(void)
 	 * TBD: is there any system with such CPU which supports
 	 * suspend/resume? If no, we should remove the code.
 	 */
-	sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver);
+	register_syscore_ops(&mtrr_syscore_ops);
 
 	return 0;
 }
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index b42ca694dc68..8eeaa81de066 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -10,7 +10,7 @@
  */
 
 #include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/dma.h>
 
@@ -21,7 +21,7 @@
  * in asm/dma.h.
  */
 
-static int i8237A_resume(struct sys_device *dev)
+static void i8237A_resume(void)
 {
 	unsigned long flags;
 	int i;
@@ -41,31 +41,15 @@ static int i8237A_resume(struct sys_device *dev)
 	enable_dma(4);
 
 	release_dma_lock(flags);
-
-	return 0;
 }
 
-static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
-{
-	return 0;
-}
-
-static struct sysdev_class i8237_sysdev_class = {
-	.name		= "i8237",
-	.suspend	= i8237A_suspend,
+static struct syscore_ops i8237_syscore_ops = {
 	.resume		= i8237A_resume,
 };
 
-static struct sys_device device_i8237A = {
-	.id		= 0,
-	.cls		= &i8237_sysdev_class,
-};
-
-static int __init i8237A_init_sysfs(void)
+static int __init i8237A_init_ops(void)
 {
-	int error = sysdev_class_register(&i8237_sysdev_class);
-	if (!error)
-		error = sysdev_register(&device_i8237A);
-	return error;
+	register_syscore_ops(&i8237_syscore_ops);
+	return 0;
 }
-device_initcall(i8237A_init_sysfs);
+device_initcall(i8237A_init_ops);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index d9ca749c123b..65b8f5c2eebf 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -8,7 +8,7 @@
 #include <linux/random.h>
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 #include <linux/bitops.h>
 #include <linux/acpi.h>
 #include <linux/io.h>
@@ -245,20 +245,19 @@ static void save_ELCR(char *trigger)
 	trigger[1] = inb(0x4d1) & 0xDE;
 }
 
-static int i8259A_resume(struct sys_device *dev)
+static void i8259A_resume(void)
 {
 	init_8259A(i8259A_auto_eoi);
 	restore_ELCR(irq_trigger);
-	return 0;
 }
 
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
+static int i8259A_suspend(void)
 {
 	save_ELCR(irq_trigger);
 	return 0;
 }
 
-static int i8259A_shutdown(struct sys_device *dev)
+static void i8259A_shutdown(void)
 {
 	/* Put the i8259A into a quiescent state that
 	 * the kernel initialization code can get it
@@ -266,21 +265,14 @@ static int i8259A_shutdown(struct sys_device *dev)
 	 */
 	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
 	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-1 */
-	return 0;
 }
 
-static struct sysdev_class i8259_sysdev_class = {
-	.name = "i8259",
+static struct syscore_ops i8259_syscore_ops = {
 	.suspend = i8259A_suspend,
 	.resume = i8259A_resume,
 	.shutdown = i8259A_shutdown,
 };
 
-static struct sys_device device_i8259A = {
-	.id	= 0,
-	.cls	= &i8259_sysdev_class,
-};
-
 static void mask_8259A(void)
 {
 	unsigned long flags;
@@ -399,17 +391,12 @@ struct legacy_pic default_legacy_pic = {
 
 struct legacy_pic *legacy_pic = &default_legacy_pic;
 
-static int __init i8259A_init_sysfs(void)
+static int __init i8259A_init_ops(void)
 {
-	int error;
-
-	if (legacy_pic != &default_legacy_pic)
-		return 0;
+	if (legacy_pic == &default_legacy_pic)
+		register_syscore_ops(&i8259_syscore_ops);
 
-	error = sysdev_class_register(&i8259_sysdev_class);
-	if (!error)
-		error = sysdev_register(&device_i8259A);
-	return error;
+	return 0;
 }
 
-device_initcall(i8259A_init_sysfs);
+device_initcall(i8259A_init_ops);
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 87af68e0e1e1..5ed0ab549eb8 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -82,6 +82,7 @@
 #include <linux/cpu.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
@@ -438,33 +439,25 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
 	return 0;
 }
 
-static int mc_sysdev_resume(struct sys_device *dev)
+static struct sysdev_driver mc_sysdev_driver = {
+	.add			= mc_sysdev_add,
+	.remove			= mc_sysdev_remove,
+};
+
+/**
+ * mc_bp_resume - Update boot CPU microcode during resume.
+ */
+static void mc_bp_resume(void)
 {
-	int cpu = dev->id;
+	int cpu = smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
-	if (!cpu_online(cpu))
-		return 0;
-
-	/*
-	 * All non-bootup cpus are still disabled,
-	 * so only CPU 0 will apply ucode here.
-	 *
-	 * Moreover, there can be no concurrent
-	 * updates from any other places at this point.
-	 */
-	WARN_ON(cpu != 0);
-
 	if (uci->valid && uci->mc)
 		microcode_ops->apply_microcode(cpu);
-
-	return 0;
 }
 
-static struct sysdev_driver mc_sysdev_driver = {
-	.add			= mc_sysdev_add,
-	.remove			= mc_sysdev_remove,
-	.resume			= mc_sysdev_resume,
+static struct syscore_ops mc_syscore_ops = {
+	.resume			= mc_bp_resume,
 };
 
 static __cpuinit int
@@ -542,6 +535,7 @@ static int __init microcode_init(void)
 	if (error)
 		return error;
 
+	register_syscore_ops(&mc_syscore_ops);
 	register_hotcpu_notifier(&mc_cpu_notifier);
 
 	pr_info("Microcode Update Driver: v" MICROCODE_VERSION
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c01ffa5b9b87..82ada01625b9 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,7 +27,7 @@
 #include <linux/kdebug.h>
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 #include <linux/io.h>
 #include <linux/gfp.h>
 #include <asm/atomic.h>
@@ -589,7 +589,7 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
 	aperture_alloc = aper_alloc;
 }
 
-static void gart_fixup_northbridges(struct sys_device *dev)
+static void gart_fixup_northbridges(void)
 {
 	int i;
 
@@ -613,33 +613,20 @@ static void gart_fixup_northbridges(struct sys_device *dev)
 	}
 }
 
-static int gart_resume(struct sys_device *dev)
+static void gart_resume(void)
 {
 	pr_info("PCI-DMA: Resuming GART IOMMU\n");
 
-	gart_fixup_northbridges(dev);
+	gart_fixup_northbridges();
 
 	enable_gart_translations();
-
-	return 0;
 }
 
-static int gart_suspend(struct sys_device *dev, pm_message_t state)
-{
-	return 0;
-}
-
-static struct sysdev_class gart_sysdev_class = {
-	.name		= "gart",
-	.suspend	= gart_suspend,
+static struct syscore_ops gart_syscore_ops = {
 	.resume		= gart_resume,
 
 };
 
-static struct sys_device device_gart = {
-	.cls		= &gart_sysdev_class,
-};
-
 /*
  * Private Northbridge GATT initialization in case we cannot use the
  * AGP driver for some reason.
@@ -650,7 +637,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
 	unsigned aper_base, new_aper_base;
 	struct pci_dev *dev;
 	void *gatt;
-	int i, error;
+	int i;
 
 	pr_info("PCI-DMA: Disabling AGP.\n");
 
@@ -685,12 +672,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
 
 	agp_gatt_table = gatt;
 
-	error = sysdev_class_register(&gart_sysdev_class);
-	if (!error)
-		error = sysdev_register(&device_gart);
-	if (error)
-		panic("Could not register gart_sysdev -- "
-		      "would corrupt data on next suspend");
+	register_syscore_ops(&gart_syscore_ops);
 
 	flush_gart();
 
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index e2b7b0c06cdf..8dace181c88e 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -15,7 +15,7 @@
 #include <linux/notifier.h>
 #include <linux/smp.h>
 #include <linux/oprofile.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 #include <linux/slab.h>
 #include <linux/moduleparam.h>
 #include <linux/kdebug.h>
@@ -536,7 +536,7 @@ static void nmi_shutdown(void)
 
 #ifdef CONFIG_PM
 
-static int nmi_suspend(struct sys_device *dev, pm_message_t state)
+static int nmi_suspend(void)
 {
 	/* Only one CPU left, just stop that one */
 	if (nmi_enabled == 1)
@@ -544,49 +544,31 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state)
 	return 0;
 }
 
-static int nmi_resume(struct sys_device *dev)
+static void nmi_resume(void)
 {
 	if (nmi_enabled == 1)
 		nmi_cpu_start(NULL);
-	return 0;
 }
 
-static struct sysdev_class oprofile_sysclass = {
-	.name		= "oprofile",
+static struct syscore_ops oprofile_syscore_ops = {
 	.resume		= nmi_resume,
 	.suspend	= nmi_suspend,
 };
 
-static struct sys_device device_oprofile = {
-	.id	= 0,
-	.cls	= &oprofile_sysclass,
-};
-
-static int __init init_sysfs(void)
+static void __init init_suspend_resume(void)
 {
-	int error;
-
-	error = sysdev_class_register(&oprofile_sysclass);
-	if (error)
-		return error;
-
-	error = sysdev_register(&device_oprofile);
-	if (error)
-		sysdev_class_unregister(&oprofile_sysclass);
-
-	return error;
+	register_syscore_ops(&oprofile_syscore_ops);
 }
 
-static void exit_sysfs(void)
+static void exit_suspend_resume(void)
 {
-	sysdev_unregister(&device_oprofile);
-	sysdev_class_unregister(&oprofile_sysclass);
+	unregister_syscore_ops(&oprofile_syscore_ops);
 }
 
 #else
 
-static inline int  init_sysfs(void) { return 0; }
-static inline void exit_sysfs(void) { }
+static inline void init_suspend_resume(void) { }
+static inline void exit_suspend_resume(void) { }
 
 #endif /* CONFIG_PM */
 
@@ -789,9 +771,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 
 	mux_init(ops);
 
-	ret = init_sysfs();
-	if (ret)
-		return ret;
+	init_suspend_resume();
 
 	printk(KERN_INFO "oprofile: using NMI interrupt.\n");
 	return 0;
@@ -799,5 +779,5 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 
 void op_nmi_exit(void)
 {
-	exit_sysfs();
+	exit_suspend_resume();
 }
-- 
cgit v1.2.3


From 93a72052be81823fa1584b9be037d51924f9efa4 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Wed, 23 Mar 2011 16:43:29 -0700
Subject: crash_dump: export is_kdump_kernel to modules, consolidate
 elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn

The Xen PV drivers in a crashed HVM guest can not connect to the dom0
backend drivers because both frontend and backend drivers are still in
connected state.  To run the connection reset function only in case of a
crashdump, the is_kdump_kernel() function needs to be available for the PV
driver modules.

Consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn into
kernel/crash_dump.c Also export elfcorehdr_addr to make is_kdump_kernel()
usable for modules.

Leave 'elfcorehdr' as early_param().  This changes powerpc from __setup()
to early_param().  It adds an address range check from x86 also on ia64
and powerpc.

[akpm@linux-foundation.org: additional #includes]
[akpm@linux-foundation.org: remove elfcorehdr_addr export]
[akpm@linux-foundation.org: fix for Tejun's mm/nobootmem.c changes]
Signed-off-by: Olaf Hering <olaf@aepfle.de>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/crash_dump.c     |  3 ---
 arch/arm/kernel/setup.c          | 24 ------------------------
 arch/ia64/kernel/crash_dump.c    |  3 ---
 arch/ia64/kernel/efi.c           |  1 +
 arch/ia64/kernel/setup.c         | 18 ------------------
 arch/powerpc/kernel/crash_dump.c | 17 -----------------
 arch/sh/kernel/crash_dump.c      | 22 ----------------------
 arch/x86/kernel/crash_dump_32.c  |  3 ---
 arch/x86/kernel/crash_dump_64.c  |  3 ---
 arch/x86/kernel/e820.c           |  1 +
 arch/x86/kernel/setup.c          | 22 ----------------------
 include/linux/bootmem.h          |  4 ----
 kernel/Makefile                  |  1 +
 kernel/crash_dump.c              | 34 ++++++++++++++++++++++++++++++++++
 mm/bootmem.c                     |  8 --------
 mm/nobootmem.c                   |  8 --------
 16 files changed, 37 insertions(+), 135 deletions(-)
 create mode 100644 kernel/crash_dump.c

(limited to 'arch/x86/kernel')

diff --git a/arch/arm/kernel/crash_dump.c b/arch/arm/kernel/crash_dump.c
index cd3b853a8a6d..90c50d4b43f7 100644
--- a/arch/arm/kernel/crash_dump.c
+++ b/arch/arm/kernel/crash_dump.c
@@ -18,9 +18,6 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 
-/* stores the physical address of elf header of crash image */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
 /**
  * copy_oldmem_page() - copy one page from old kernel memory
  * @pfn: page frame number to be copied
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index d1da92174277..c36c1a4250f3 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -788,30 +788,6 @@ static void __init reserve_crashkernel(void)
 static inline void reserve_crashkernel(void) {}
 #endif /* CONFIG_KEXEC */
 
-/*
- * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
- * is_kdump_kernel() to determine if we are booting after a panic. Hence
- * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
- */
-
-#ifdef CONFIG_CRASH_DUMP
-/*
- * elfcorehdr= specifies the location of elf core header stored by the crashed
- * kernel. This option will be passed by kexec loader to the capture kernel.
- */
-static int __init setup_elfcorehdr(char *arg)
-{
-	char *end;
-
-	if (!arg)
-		return -EINVAL;
-
-	elfcorehdr_addr = memparse(arg, &end);
-	return end > arg ? 0 : -EINVAL;
-}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif /* CONFIG_CRASH_DUMP */
-
 static void __init squash_mem_tags(struct tag *tag)
 {
 	for (; tag->hdr.size; tag = tag_next(tag))
diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c
index 23e91290e41f..c8c9298666fb 100644
--- a/arch/ia64/kernel/crash_dump.c
+++ b/arch/ia64/kernel/crash_dump.c
@@ -13,9 +13,6 @@
 #include <asm/page.h>
 #include <asm/uaccess.h>
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index a0f001928502..6fc03aff046c 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -23,6 +23,7 @@
  */
 #include <linux/module.h>
 #include <linux/bootmem.h>
+#include <linux/crash_dump.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/types.h>
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 911cf9749700..5e2c72498c51 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -479,25 +479,7 @@ static __init int setup_nomca(char *s)
 }
 early_param("nomca", setup_nomca);
 
-/*
- * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
- * is_kdump_kernel() to determine if we are booting after a panic. Hence
- * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
- */
 #ifdef CONFIG_CRASH_DUMP
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel.
- */
-static int __init parse_elfcorehdr(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-        elfcorehdr_addr = memparse(arg, &arg);
-	return 0;
-}
-early_param("elfcorehdr", parse_elfcorehdr);
-
 int __init reserve_elfcorehdr(u64 *start, u64 *end)
 {
 	u64 length;
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index 0a2af50243cb..424afb6b8fba 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -28,9 +28,6 @@
 #define DBG(fmt...)
 #endif
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
 #ifndef CONFIG_RELOCATABLE
 void __init reserve_kdump_trampoline(void)
 {
@@ -72,20 +69,6 @@ void __init setup_kdump_trampoline(void)
 }
 #endif /* CONFIG_RELOCATABLE */
 
-/*
- * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
- * is_kdump_kernel() to determine if we are booting after a panic. Hence
- * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
- */
-static int __init parse_elfcorehdr(char *p)
-{
-	if (p)
-		elfcorehdr_addr = memparse(p, &p);
-
-	return 1;
-}
-__setup("elfcorehdr=", parse_elfcorehdr);
-
 static int __init parse_savemaxmem(char *p)
 {
 	if (p)
diff --git a/arch/sh/kernel/crash_dump.c b/arch/sh/kernel/crash_dump.c
index 37c97d444576..569e7b171c01 100644
--- a/arch/sh/kernel/crash_dump.c
+++ b/arch/sh/kernel/crash_dump.c
@@ -9,28 +9,6 @@
 #include <linux/io.h>
 #include <asm/uaccess.h>
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
-/*
- * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
- * is_kdump_kernel() to determine if we are booting after a panic. Hence
- * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
- *
- * elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel.
- */
-static int __init parse_elfcorehdr(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	elfcorehdr_addr = memparse(arg, &arg);
-
-	return 0;
-}
-early_param("elfcorehdr", parse_elfcorehdr);
-
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index d5cd13945d5a..642f75a68cd5 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -14,9 +14,6 @@
 
 static void *kdump_buf_page;
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
 static inline bool is_crashed_pfn_valid(unsigned long pfn)
 {
 #ifndef CONFIG_X86_PAE
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 994828899e09..afa64adb75ee 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,9 +10,6 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index cdf5bfd9d4d5..3e2ef8425316 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/crash_dump.h>
 #include <linux/bootmem.h>
 #include <linux/pfn.h>
 #include <linux/suspend.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 32bd87cbf982..5a0484a95ad6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -619,28 +619,6 @@ void __init reserve_standard_io_resources(void)
 
 }
 
-/*
- * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
- * is_kdump_kernel() to determine if we are booting after a panic. Hence
- * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
- */
-
-#ifdef CONFIG_CRASH_DUMP
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
-static int __init setup_elfcorehdr(char *arg)
-{
-	char *end;
-	if (!arg)
-		return -EINVAL;
-	elfcorehdr_addr = memparse(arg, &end);
-	return end > arg ? 0 : -EINVAL;
-}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif
-
 static __init void reserve_ibft_region(void)
 {
 	unsigned long addr, size = 0;
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 499dfe982a0e..b8613e806aa9 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -19,10 +19,6 @@ extern unsigned long min_low_pfn;
  */
 extern unsigned long max_pfn;
 
-#ifdef CONFIG_CRASH_DUMP
-extern unsigned long saved_max_pfn;
-#endif
-
 #ifndef CONFIG_NO_BOOTMEM
 /*
  * node_bootmem_map is a map pointer - the bits represent all physical 
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba33..85cbfb31e73e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+	char *end;
+	if (!arg)
+		return -EINVAL;
+	elfcorehdr_addr = memparse(arg, &end);
+	return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 07aeb89e396e..01d5a4b3dd0c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -34,14 +34,6 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-#ifdef CONFIG_CRASH_DUMP
-/*
- * If we have booted due to a crash, max_pfn will be a very low value. We need
- * to know the amount of memory that the previous kernel used.
- */
-unsigned long saved_max_pfn;
-#endif
-
 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
 
 static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index e2bdb07079ce..e99f6cd1da1f 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -32,14 +32,6 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-#ifdef CONFIG_CRASH_DUMP
-/*
- * If we have booted due to a crash, max_pfn will be a very low value. We need
- * to know the amount of memory that the previous kernel used.
- */
-unsigned long saved_max_pfn;
-#endif
-
 static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 					u64 goal, u64 limit)
 {
-- 
cgit v1.2.3


From 71f9e59800e5ad4e6b683348424c9fe54306cd43 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Thu, 24 Mar 2011 11:42:30 +0900
Subject: x86, dumpstack: Use %pB format specifier for stack trace

Improve noreturn function entries in call traces:

Before:

 Call Trace:
  [<ffffffff812a8502>] panic+0x8c/0x18d
  [<ffffffffa000012a>] deep01+0x0/0x38 [test_panic]  <--- bad
  [<ffffffff81104666>] proc_file_write+0x73/0x8d
  [<ffffffff811000b3>] proc_reg_write+0x8d/0xac
  [<ffffffff810c7d32>] vfs_write+0xa1/0xc5
  [<ffffffff810c7e0f>] sys_write+0x45/0x6c
  [<ffffffff8f02943b>] system_call_fastpath+0x16/0x1b

After:

 Call Trace:
  [<ffffffff812bce69>] panic+0x8c/0x18d
  [<ffffffffa000012a>] panic_write+0x20/0x20 [test_panic] <--- good
  [<ffffffff81115fab>] proc_file_write+0x73/0x8d
  [<ffffffff81111a5f>] proc_reg_write+0x8d/0xac
  [<ffffffff810d90ee>] vfs_write+0xa1/0xc5
  [<ffffffff810d91cb>] sys_write+0x45/0x6c
  [<ffffffff812c07fb>] system_call_fastpath+0x16/0x1b

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1300934550-21394-2-git-send-email-namhyung@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/dumpstack.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 999e2793590b..24d0479025f9 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,7 +27,7 @@ static int die_counter;
 
 void printk_address(unsigned long address, int reliable)
 {
-	printk(" [<%p>] %s%pS\n", (void *) address,
+	printk(" [<%p>] %s%pB\n", (void *) address,
 			reliable ? "" : "? ", (void *) address);
 }
 
-- 
cgit v1.2.3


From 242214f9c1eeaae40eca11e3b4d37bfce960a7cd Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Thu, 24 Mar 2011 23:36:25 +0300
Subject: perf, x86: P4 PMU - Read proper MSR register to catch unflagged
 overflows

The read of a proper MSR register was missed and instead of
counter the configration register was tested (it has
ARCH_P4_UNFLAGGED_BIT always cleared) leading to unknown NMI
hitting the system. As result the user may obtain "Dazed and
confused, but trying to continue" message. Fix it by reading a
proper MSR register.

When an NMI happens on a P4, the perf nmi handler checks the
configuration register to see if the overflow bit is set or not
before taking appropriate action.  Unfortunately, various P4
machines had a broken overflow bit, so a backup mechanism was
implemented.  This mechanism checked to see if the counter
rolled over or not.

A previous commit that implemented this backup mechanism was
broken. Instead of reading the counter register, it used the
configuration register to determine if the counter rolled over
or not. Reading that bit would give incorrect results.

This would lead to 'Dazed and confused' messages for the end
user when using the perf tool (or if the nmi watchdog is
running).

The fix is to read the counter register before determining if
the counter rolled over or not.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <4D8BAB49.3080701@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 3769ac822f96..d3d7b59841e5 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -777,6 +777,7 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
 	 * the counter has reached zero value and continued counting before
 	 * real NMI signal was received:
 	 */
+	rdmsrl(hwc->event_base, v);
 	if (!(v & ARCH_P4_UNFLAGGED_BIT))
 		return 1;
 
-- 
cgit v1.2.3


From 00a30b254b88d2d4f5af00835a9b7f70326def9b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 24 Mar 2011 22:53:10 +0100
Subject: x86: DT: Fix return condition in irq_create_of_mapping()

The xlate() function returns 0 or a negative error code. Returning the
error code blindly will be seen as an huge irq number by the calling
function because irq_create_of_mapping() returns an unsigned value.

Return 0 (NO_IRQ) as required.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/x86/kernel/devicetree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 7a8cebc9ff29..9c91badb6ca9 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -65,7 +65,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
 		return 0;
 	ret = ih->xlate(ih, intspec, intsize, &virq, &type);
 	if (ret)
-		return ret;
+		return 0;
 	if (type == IRQ_TYPE_NONE)
 		return virq;
 	/* set the mask if it is different from current */
-- 
cgit v1.2.3


From 07611dbda5ccbd9a628f29686d62bafdd007db7b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 24 Mar 2011 21:41:57 +0100
Subject: x86: DT: Cleanup namespace and call irq_set_irq_type() unconditional

That call escaped the name space cleanup. Fix it up.

We really want to call there. The chip might have changed since the
irq was setup initially. So let the core code and the chip decide what
to do. The status is just an unreliable snapshot.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/x86/kernel/devicetree.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 9c91badb6ca9..706a9fb46a58 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -68,9 +68,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
 		return 0;
 	if (type == IRQ_TYPE_NONE)
 		return virq;
-	/* set the mask if it is different from current */
-	if (type == (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK))
-		set_irq_type(virq, type);
+	irq_set_irq_type(virq, type);
 	return virq;
 }
 EXPORT_SYMBOL_GPL(irq_create_of_mapping);
-- 
cgit v1.2.3


From 45daae575e08bbf7405c5a3633e956fa364d1b4f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Mar 2011 10:24:23 +0100
Subject: perf, x86: Complain louder about BIOSen corrupting CPU/PMU state and
 continue

Eric Dumazet reported that hardware PMU events do not work on his
system, due to the BIOS corrupting PMU state:

    Performance Events: PEBS fmt0+, Core2 events, Broken BIOS detected, using software events only.
    [Firmware Bug]: the BIOS has corrupted hw-PMU resources (MSR 186 is 43003c)

Linus suggested that we continue in the face of such BIOS-induced CPU
state corruption:

   http://lkml.org/lkml/2011/3/24/608

Such BIOSes will have to be fixed - Linux developers rely on a working and
fully capable PMU and the BIOS interfering with the CPU's PMU state is simply
not acceptable.

So this patch changes perf to continue when it detects such BIOS
interaction, some hardware events may be unreliable due to the BIOS
writing and re-writing them - there's not much the kernel can do
about that but to detect the corruption and report it.

Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ec46eea0c4ed..eb00677ee2ae 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -500,12 +500,17 @@ static bool check_hw_exists(void)
 	return true;
 
 bios_fail:
-	printk(KERN_CONT "Broken BIOS detected, using software events only.\n");
+	/*
+	 * We still allow the PMU driver to operate:
+	 */
+	printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
 	printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
-	return false;
+
+	return true;
 
 msr_fail:
 	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
+
 	return false;
 }
 
-- 
cgit v1.2.3


From 21431c2900a0b669080b5bfaae2a7d9d9c026e9b Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Mon, 15 Mar 2010 07:28:00 -0500
Subject: kgdb,x86_64: fix compile warning found with sparse

Fix sparse warning:

arch/x86/kernel/kgdb.c:123:9: warning: switch with no cases

Reported-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index a4130005028a..3c2fb0f25abd 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -121,8 +121,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
 		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
 		       dbg_reg_def[regno].size);
 
-	switch (regno) {
 #ifdef CONFIG_X86_32
+	switch (regno) {
 	case GDB_SS:
 		if (!user_mode_vm(regs))
 			*(unsigned long *)mem = __KERNEL_DS;
@@ -135,8 +135,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
 	case GDB_FS:
 		*(unsigned long *)mem = 0xFFFF;
 		break;
-#endif
 	}
+#endif
 	return dbg_reg_def[regno].name;
 }
 
-- 
cgit v1.2.3