From c86cce2a20207cbf2b3dfe97c985a1f5aa5d3798 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger
Date: Tue, 27 Dec 2011 11:25:47 +0100
Subject: [S390] kvm: fix sleeping function ... at mm/page_alloc.c:2260

commit cc772456ac9b460693492b3a3d89e8c81eda5874
    [S390] fix list corruption in gmap reverse mapping
added a potential deadlock:

BUG: sleeping function called from invalid context at mm/page_alloc.c:2260
in_atomic(): 1, irqs_disabled(): 0, pid: 1108, name: qemu-system-s39
3 locks held by qemu-system-s39/1108:
 #0:  (&kvm->slots_lock){+.+.+.}, at: [<000003e004866542>] kvm_set_memory_region+0x3a/0x6c [kvm]
 #1:  (&mm->mmap_sem){++++++}, at: [<0000000000123790>] gmap_map_segment+0x9c/0x298
 #2:  (&(&mm->page_table_lock)->rlock){+.+.+.}, at: [<00000000001237a8>] gmap_map_segment+0xb4/0x298
CPU: 0 Not tainted 3.1.3 #45
Process qemu-system-s39 (pid: 1108, task: 00000004f8b3cb30, ksp: 00000004fd5978d0)
00000004fd5979a0 00000004fd597920 0000000000000002 0000000000000000
00000004fd5979c0 00000004fd597938 00000004fd597938 0000000000617e96
0000000000000000 00000004f8b3cf58 0000000000000000 0000000000000000
000000000000000d 000000000000000c 00000004fd597988 0000000000000000
0000000000000000 0000000000100a18 00000004fd597920 00000004fd597960
Call Trace:
([<0000000000100926>] show_trace+0xee/0x144)
 [<0000000000131f3a>] __might_sleep+0x12a/0x158
 [<0000000000217fb4>] __alloc_pages_nodemask+0x224/0xadc
 [<0000000000123086>] gmap_alloc_table+0x46/0x114
 [<000000000012395c>] gmap_map_segment+0x268/0x298
 [<000003e00486b014>] kvm_arch_commit_memory_region+0x44/0x6c [kvm]
 [<000003e004866414>] __kvm_set_memory_region+0x3b0/0x4a4 [kvm]
 [<000003e004866554>] kvm_set_memory_region+0x4c/0x6c [kvm]
 [<000003e004867c7a>] kvm_vm_ioctl+0x14a/0x314 [kvm]
 [<0000000000292100>] do_vfs_ioctl+0x94/0x588
 [<0000000000292688>] SyS_ioctl+0x94/0xac
 [<000000000061e124>] sysc_noemu+0x22/0x28
 [<000003fffcd5e7ca>] 0x3fffcd5e7ca
3 locks held by qemu-system-s39/1108:
 #0:  (&kvm->slots_lock){+.+.+.}, at: [<000003e004866542>] kvm_set_memory_region+0x3a/0x6c [kvm]
 #1:  (&mm->mmap_sem){++++++}, at: [<0000000000123790>] gmap_map_segment+0x9c/0x298
 #2:  (&(&mm->page_table_lock)->rlock){+.+.+.}, at: [<00000000001237a8>] gmap_map_segment+0xb4/0x298

Fix this by dropping the lock on the allocation path. This is ok, since
the gmap table is never freed until we call gmap_free, so the table we
are walking cannot go away.

Signed-off-by: Christian Borntraeger
Signed-off-by: Martin Schwidefsky
---
 arch/s390/mm/pgtable.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/s390/mm/pgtable.c')

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 301c84d3b542..dc2269f1821c 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -267,7 +267,10 @@ static int gmap_alloc_table(struct gmap *gmap,
 	struct page *page;
 	unsigned long *new;
 
+	/* since we dont free the gmap table until gmap_free we can unlock */
+	spin_unlock(&gmap->mm->page_table_lock);
 	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
+	spin_lock(&gmap->mm->page_table_lock);
 	if (!page)
 		return -ENOMEM;
 	new = (unsigned long *) page_to_phys(page);
-- 
cgit v1.2.3
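The pattern the fix relies on - dropping a spinlock around an allocation
that may sleep, then retaking the lock and re-checking the protected state -
can be shown in isolation. The sketch below is a userspace analogue using
pthreads, not the s390 kernel code; the names (table_lock, grow_table) and
the 4 KiB size are invented for the illustration, and it assumes, as the
commit message argues for the gmap table, that the structure being filled
cannot be freed while the lock is dropped.

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Fill *slot with a new 4 KiB block; called and returns with table_lock held. */
    static int grow_table(void **slot)
    {
            void *new;

            /* the allocation may block, so drop the lock around it */
            pthread_mutex_unlock(&table_lock);
            new = malloc(4096);
            pthread_mutex_lock(&table_lock);
            if (!new)
                    return -1;
            if (*slot) {
                    /* someone else filled the slot while the lock was dropped */
                    free(new);
                    return 0;
            }
            *slot = new;
            return 0;
    }

    int main(void)
    {
            void *slot = NULL;
            int ret;

            pthread_mutex_lock(&table_lock);
            ret = grow_table(&slot);
            pthread_mutex_unlock(&table_lock);
            free(slot);
            return ret;
    }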
From 14045ebf1e1156d966a796cacad91028e01797e5 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Tue, 27 Dec 2011 11:27:07 +0100
Subject: [S390] add support for physical memory > 4TB

The kernel address space of a 64 bit kernel currently uses a three level
page table and the vmemmap array has a fixed address and a fixed maximum
size. A three level page table is good enough for systems with less than
3.8TB of memory; for bigger systems, four page table levels need to be
used. Each page table level costs a bit of performance, so use 3 levels
for normal systems and 4 levels only for the really big systems. To avoid
bloating sparse.o too much, set MAX_PHYSMEM_BITS to 46 for a maximum of
64TB of memory.

Signed-off-by: Martin Schwidefsky
---
 arch/s390/include/asm/pgtable.h   | 23 ++------------
 arch/s390/include/asm/sparsemem.h |  4 +--
 arch/s390/kernel/setup.c          | 67 +++++++++++++++++++++++++++++++--------
 arch/s390/mm/init.c               | 16 ++++++----
 arch/s390/mm/pgtable.c            | 11 -------
 5 files changed, 68 insertions(+), 53 deletions(-)

(limited to 'arch/s390/mm/pgtable.c')

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 4f289ff0b7fe..011358c1b18e 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -128,28 +128,11 @@ static inline int is_zero_pfn(unsigned long pfn)
  * effect, this also makes sure that 64 bit module code cannot be used
  * as system call address.
  */
-
 extern unsigned long VMALLOC_START;
+extern unsigned long VMALLOC_END;
+extern struct page *vmemmap;
 
-#ifndef __s390x__
-#define VMALLOC_SIZE	(96UL << 20)
-#define VMALLOC_END	0x7e000000UL
-#define VMEM_MAP_END	0x80000000UL
-#else /* __s390x__ */
-#define VMALLOC_SIZE	(128UL << 30)
-#define VMALLOC_END	0x3e000000000UL
-#define VMEM_MAP_END	0x40000000000UL
-#endif /* __s390x__ */
-
-/*
- * VMEM_MAX_PHYS is the highest physical address that can be added to the 1:1
- * mapping. This needs to be calculated at compile time since the size of the
- * VMEM_MAP is static but the size of struct page can change.
- */
-#define VMEM_MAX_PAGES	((VMEM_MAP_END - VMALLOC_END) / sizeof(struct page))
-#define VMEM_MAX_PFN	min(VMALLOC_START >> PAGE_SHIFT, VMEM_MAX_PAGES)
-#define VMEM_MAX_PHYS	((VMEM_MAX_PFN << PAGE_SHIFT) & ~((16 << 20) - 1))
-#define vmemmap		((struct page *) VMALLOC_END)
+#define VMEM_MAX_PHYS	((unsigned long) vmemmap)
 
 /*
  * A 31 bit pagetable entry of S390 has following format:
diff --git a/arch/s390/include/asm/sparsemem.h b/arch/s390/include/asm/sparsemem.h
index 545d219e6a2d..0fb34027d3f6 100644
--- a/arch/s390/include/asm/sparsemem.h
+++ b/arch/s390/include/asm/sparsemem.h
@@ -4,8 +4,8 @@
 #ifdef CONFIG_64BIT
 
 #define SECTION_SIZE_BITS	28
-#define MAX_PHYSADDR_BITS	42
-#define MAX_PHYSMEM_BITS	42
+#define MAX_PHYSADDR_BITS	46
+#define MAX_PHYSMEM_BITS	46
 
 #else
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 26b601c2b137..66903eed36e6 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -94,6 +94,15 @@ struct mem_chunk __initdata memory_chunk[MEMORY_CHUNKS];
 int __initdata memory_end_set;
 unsigned long __initdata memory_end;
 
+unsigned long VMALLOC_START;
+EXPORT_SYMBOL(VMALLOC_START);
+
+unsigned long VMALLOC_END;
+EXPORT_SYMBOL(VMALLOC_END);
+
+struct page *vmemmap;
+EXPORT_SYMBOL(vmemmap);
+
 /* An array with a pointer to the lowcore of every CPU. */
 struct _lowcore *lowcore_ptr[NR_CPUS];
 EXPORT_SYMBOL(lowcore_ptr);
@@ -277,6 +286,15 @@ static int __init early_parse_mem(char *p)
 }
 early_param("mem", early_parse_mem);
 
+static int __init parse_vmalloc(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+	VMALLOC_END = (memparse(arg, &arg) + PAGE_SIZE - 1) & PAGE_MASK;
+	return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
 unsigned int user_mode = HOME_SPACE_MODE;
 EXPORT_SYMBOL_GPL(user_mode);
 
@@ -478,8 +496,7 @@ EXPORT_SYMBOL_GPL(real_memory_size);
 
 static void __init setup_memory_end(void)
 {
-	unsigned long memory_size;
-	unsigned long max_mem;
+	unsigned long vmax, vmalloc_size, tmp;
 	int i;
 
@@ -489,12 +506,9 @@ static void __init setup_memory_end(void)
 		memory_end_set = 1;
 	}
 #endif
-	memory_size = 0;
+	real_memory_size = 0;
 	memory_end &= PAGE_MASK;
 
-	max_mem = memory_end ? min(VMEM_MAX_PHYS, memory_end) : VMEM_MAX_PHYS;
-	memory_end = min(max_mem, memory_end);
-
 	/*
 	 * Make sure all chunks are MAX_ORDER aligned so we don't need the
 	 * extra checks that HOLES_IN_ZONE would require.
@@ -514,23 +528,48 @@ static void __init setup_memory_end(void)
 			chunk->addr = start;
 			chunk->size = end - start;
 		}
+		real_memory_size = max(real_memory_size,
+				       chunk->addr + chunk->size);
 	}
 
+	/* Choose kernel address space layout: 2, 3, or 4 levels. */
+#ifdef CONFIG_64BIT
+	vmalloc_size = VMALLOC_END ?: 128UL << 30;
+	tmp = (memory_end ?: real_memory_size) / PAGE_SIZE;
+	tmp = tmp * (sizeof(struct page) + PAGE_SIZE) + vmalloc_size;
+	if (tmp <= (1UL << 42))
+		vmax = 1UL << 42;	/* 3-level kernel page table */
+	else
+		vmax = 1UL << 53;	/* 4-level kernel page table */
+#else
+	vmalloc_size = VMALLOC_END ?: 96UL << 20;
+	vmax = 1UL << 31;		/* 2-level kernel page table */
+#endif
+	/* vmalloc area is at the end of the kernel address space. */
+	VMALLOC_END = vmax;
+	VMALLOC_START = vmax - vmalloc_size;
+
+	/* Split remaining virtual space between 1:1 mapping & vmemmap array */
+	tmp = VMALLOC_START / (PAGE_SIZE + sizeof(struct page));
+	tmp = VMALLOC_START - tmp * sizeof(struct page);
+	tmp &= ~((vmax >> 11) - 1);	/* align to page table level */
+	tmp = min(tmp, 1UL << MAX_PHYSMEM_BITS);
+	vmemmap = (struct page *) tmp;
+
+	/* Take care that memory_end is set and <= vmemmap */
+	memory_end = min(memory_end ?: real_memory_size, tmp);
+
+	/* Fixup memory chunk array to fit into 0..memory_end */
 	for (i = 0; i < MEMORY_CHUNKS; i++) {
 		struct mem_chunk *chunk = &memory_chunk[i];
 
-		real_memory_size = max(real_memory_size,
-				       chunk->addr + chunk->size);
-		if (chunk->addr >= max_mem) {
+		if (chunk->addr >= memory_end) {
 			memset(chunk, 0, sizeof(*chunk));
 			continue;
 		}
-		if (chunk->addr + chunk->size > max_mem)
-			chunk->size = max_mem - chunk->addr;
-		memory_size = max(memory_size, chunk->addr + chunk->size);
+		if (chunk->addr + chunk->size > memory_end)
+			chunk->size = memory_end - chunk->addr;
 	}
-	if (!memory_end)
-		memory_end = memory_size;
 }
 
 void *restart_stack __attribute__((__section__(".data")));
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index d4b9fb4d0042..5d633019d8f3 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -93,18 +93,22 @@ static unsigned long setup_zero_pages(void)
 void __init paging_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
-	unsigned long pgd_type;
+	unsigned long pgd_type, asce_bits;
 
 	init_mm.pgd = swapper_pg_dir;
-	S390_lowcore.kernel_asce = __pa(init_mm.pgd) & PAGE_MASK;
 #ifdef CONFIG_64BIT
-	/* A three level page table (4TB) is enough for the kernel space. */
-	S390_lowcore.kernel_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
-	pgd_type = _REGION3_ENTRY_EMPTY;
+	if (VMALLOC_END > (1UL << 42)) {
+		asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+		pgd_type = _REGION2_ENTRY_EMPTY;
+	} else {
+		asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+		pgd_type = _REGION3_ENTRY_EMPTY;
+	}
 #else
-	S390_lowcore.kernel_asce |= _ASCE_TABLE_LENGTH;
+	asce_bits = _ASCE_TABLE_LENGTH;
 	pgd_type = _SEGMENT_ENTRY_EMPTY;
 #endif
+	S390_lowcore.kernel_asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
 	clear_table((unsigned long *) init_mm.pgd, pgd_type,
 		    sizeof(unsigned long)*2048);
 	vmem_map_init();
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index dc2269f1821c..9a4d02f64f16 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -33,17 +33,6 @@
 #define FRAG_MASK	0x03
 #endif
 
-unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
-EXPORT_SYMBOL(VMALLOC_START);
-
-static int __init parse_vmalloc(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
-	return 0;
-}
-early_param("vmalloc", parse_vmalloc);
 
 unsigned long *crst_table_alloc(struct mm_struct *mm)
 {
-- 
cgit v1.2.3
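The interesting part of this patch is the arithmetic in setup_memory_end()
that picks the number of page table levels and splits the virtual space
between the 1:1 mapping, the vmemmap array, and the vmalloc area. The
standalone program below re-runs that arithmetic so the resulting layout can
be inspected; the 4 KiB page size mirrors s390, but the 64-byte
sizeof(struct page) and the 8 GiB example memory size are assumptions of
this sketch, not values taken from the patch.

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_size = 4096;
            unsigned long struct_page = 64;           /* assumed sizeof(struct page) */
            unsigned long vmalloc_size = 128UL << 30; /* default vmalloc area: 128 GiB */
            unsigned long memory_end = 8UL << 30;     /* assumed machine: 8 GiB of RAM */
            unsigned long vmax, tmp, vmalloc_start, vmemmap;

            /* 1:1 map + vmemmap + vmalloc still below 4 TB? Then 3 levels suffice. */
            tmp = memory_end / page_size;
            tmp = tmp * (struct_page + page_size) + vmalloc_size;
            vmax = (tmp <= (1UL << 42)) ? (1UL << 42) : (1UL << 53);

            /* vmalloc area sits at the very top of the kernel address space. */
            vmalloc_start = vmax - vmalloc_size;

            /* Split what is left between the 1:1 mapping and the vmemmap array. */
            tmp = vmalloc_start / (page_size + struct_page);
            tmp = vmalloc_start - tmp * struct_page;
            tmp &= ~((vmax >> 11) - 1);               /* align to a page table level */
            vmemmap = tmp;

            printf("page table levels: %d\n", vmax == (1UL << 42) ? 3 : 4);
            printf("VMALLOC_END      = %#lx\n", vmax);
            printf("VMALLOC_START    = %#lx\n", vmalloc_start);
            printf("vmemmap          = %#lx (= VMEM_MAX_PHYS)\n", vmemmap);
            return 0;
    }

For the assumed 8 GiB machine this selects the 3-level layout with
VMALLOC_END at 4 TB, which matches the commit message's statement that
three levels are good enough below roughly 3.8 TB of memory.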
From 2320c5793790fcda80e6dcc088dbda86040235e5 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Fri, 17 Feb 2012 10:29:21 +0100
Subject: [S390] incorrect PageTables counter for kvm page tables

The page_table_free_pgste function is used for kvm processes to free page
tables that have the pgste extension. It calls pgtable_page_ctor instead
of pgtable_page_dtor, which increases NR_PAGETABLE instead of decreasing
it.

Signed-off-by: Martin Schwidefsky
---
 arch/s390/mm/pgtable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/s390/mm/pgtable.c')

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 9a4d02f64f16..51b0738e13d1 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -574,7 +574,7 @@ static inline void page_table_free_pgste(unsigned long *table)
 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 	mp = (struct gmap_pgtable *) page->index;
 	BUG_ON(!list_empty(&mp->mapper));
-	pgtable_page_ctor(page);
+	pgtable_page_dtor(page);
 	atomic_set(&page->_mapcount, -1);
 	kfree(mp);
 	__free_page(page);
-- 
cgit v1.2.3
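The one-character change restores the usual pairing: the constructor on the
allocation path increments the NR_PAGETABLE statistic and the destructor on
the free path decrements it, so calling the constructor from the free path
makes the counter drift upwards instead of returning to zero. A toy
userspace model of that accounting (all names invented for the sketch):

    #include <stdio.h>

    static long nr_pagetable;                       /* stand-in for NR_PAGETABLE */

    static void page_ctor(void) { nr_pagetable++; } /* allocation path */
    static void page_dtor(void) { nr_pagetable--; } /* free path */

    static void free_table(int buggy)
    {
            if (buggy)
                    page_ctor();    /* old code: counter moves the wrong way on free */
            else
                    page_dtor();    /* fixed code: counter is balanced again */
    }

    int main(void)
    {
            page_ctor();            /* allocate one page table */
            free_table(0);          /* ...and free it */
            printf("fixed: NR_PAGETABLE = %ld\n", nr_pagetable);   /* prints 0 */

            page_ctor();
            free_table(1);
            printf("buggy: NR_PAGETABLE = %ld\n", nr_pagetable);   /* prints 2 */
            return 0;
    }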