From fa62aafea9e415cd1efd8c4054106112fe809f19 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:38 -0800 Subject: x86, mm: Add global page_size_mask and probe one time only Now we pass around use_gbpages and use_pse for calculating page table size, Later we will need to call init_memory_mapping for every ram range one by one, that mean those calculation will be done several times. Those information are the same for all ram range and could be stored in page_size_mask and could be probed it one time only. Move that probing code out of init_memory_mapping into separated function probe_page_size_mask(), and call it before all init_memory_mapping. Suggested-by: Ingo Molnar Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-2-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 55 +++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 30 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d7aea41563b3..aa5b0da03f6d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -35,6 +35,7 @@ struct map_range { unsigned page_size_mask; }; +static int page_size_mask; /* * First calculate space needed for kernel direct mapping page tables to cover * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB @@ -94,6 +95,30 @@ static void __init find_early_table_space(struct map_range *mr, int nr_range) (pgt_buf_top << PAGE_SHIFT) - 1); } +void probe_page_size_mask(void) +{ +#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + if (direct_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; + if (cpu_has_pse) + page_size_mask |= 1 << PG_LEVEL_2M; +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } +} void __init native_pagetable_reserve(u64 start, u64 end) { memblock_reserve(start, end - start); @@ -129,45 +154,15 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { - unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; unsigned long ret = 0; unsigned long pos; - struct map_range mr[NR_RANGE_MR]; int nr_range, i; - int use_pse, use_gbpages; printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", start, end - 1); -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) - /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. - * This will simplify cpa(), which otherwise needs to support splitting - * large pages into small in interrupt context, etc. - */ - use_pse = use_gbpages = 0; -#else - use_pse = cpu_has_pse; - use_gbpages = direct_gbpages; -#endif - - /* Enable PSE if available */ - if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); - - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __supported_pte_mask |= _PAGE_GLOBAL; - } - - if (use_gbpages) - page_size_mask |= 1 << PG_LEVEL_1G; - if (use_pse) - page_size_mask |= 1 << PG_LEVEL_2M; - memset(mr, 0, sizeof(mr)); nr_range = 0; -- cgit v1.2.3 From 4e33e06555329e93523b3d2590b9210bf84120a3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:39 -0800 Subject: x86, mm: Split out split_mem_range from init_memory_mapping So make init_memory_mapping smaller and readable. -v2: use 0 instead of nr_range as input parameter found by Yasuaki Ishimatsu. Suggested-by: Ingo Molnar Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-3-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Cc: Yasuaki Ishimatsu Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index aa5b0da03f6d..6368b86b84e2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -146,25 +146,13 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, return nr_range; } -/* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. - */ -unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) +static int __meminit split_mem_range(struct map_range *mr, int nr_range, + unsigned long start, + unsigned long end) { unsigned long start_pfn, end_pfn; - unsigned long ret = 0; unsigned long pos; - struct map_range mr[NR_RANGE_MR]; - int nr_range, i; - - printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", - start, end - 1); - - memset(mr, 0, sizeof(mr)); - nr_range = 0; + int i; /* head if not big page alignment ? */ start_pfn = start >> PAGE_SHIFT; @@ -258,6 +246,27 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, (mr[i].page_size_mask & (1< Date: Fri, 16 Nov 2012 19:38:40 -0800 Subject: x86, mm: Move down find_early_table_space() It will need to call split_mem_range(). Move it down after that to avoid extra declaration. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-4-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 117 +++++++++++++++++++++++++++-------------------------- 1 file changed, 59 insertions(+), 58 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6368b86b84e2..701abbc24735 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -36,64 +36,6 @@ struct map_range { }; static int page_size_mask; -/* - * First calculate space needed for kernel direct mapping page tables to cover - * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB - * pages. Then find enough contiguous space for those page tables. - */ -static void __init find_early_table_space(struct map_range *mr, int nr_range) -{ - int i; - unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long start = 0, good_end; - phys_addr_t base; - - for (i = 0; i < nr_range; i++) { - unsigned long range, extra; - - range = mr[i].end - mr[i].start; - puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - - if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { - extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); - pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else { - pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; - } - - if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { - extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else { - ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; - } - } - - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - good_end = max_pfn_mapped << PAGE_SHIFT; - - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) - panic("Cannot find space for the kernel page tables"); - - pgt_buf_start = base >> PAGE_SHIFT; - pgt_buf_end = pgt_buf_start; - pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); -} void probe_page_size_mask(void) { @@ -249,6 +191,65 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } +/* + * First calculate space needed for kernel direct mapping page tables to cover + * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB + * pages. Then find enough contiguous space for those page tables. + */ +static void __init find_early_table_space(struct map_range *mr, int nr_range) +{ + int i; + unsigned long puds = 0, pmds = 0, ptes = 0, tables; + unsigned long start = 0, good_end; + phys_addr_t base; + + for (i = 0; i < nr_range; i++) { + unsigned long range, extra; + + range = mr[i].end - mr[i].start; + puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; + + if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { + extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); + pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else { + pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; + } + + if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { + extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif + ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + } + + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + good_end = max_pfn_mapped << PAGE_SHIFT; + + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); + if (!base) + panic("Cannot find space for the kernel page tables"); + + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", + mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, + (pgt_buf_top << PAGE_SHIFT) - 1); +} + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from -- cgit v1.2.3 From 22ddfcaa0dbae992332381d41b8a1fbc72269a13 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:41 -0800 Subject: x86, mm: Move init_memory_mapping calling out of setup.c Now init_memory_mapping is called two times, later will be called for every ram ranges. Could put all related init_mem calling together and out of setup.c. Actually, it reverts commit 1bbbbe7 x86: Exclude E820_RESERVED regions and memory holes above 4 GB from direct mapping. will address that later with complete solution include handling hole under 4g. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-5-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/init.h | 1 - arch/x86/include/asm/pgtable.h | 2 +- arch/x86/kernel/setup.c | 27 +-------------------------- arch/x86/mm/init.c | 19 ++++++++++++++++++- 4 files changed, 20 insertions(+), 29 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index adcc0ae73d09..4f13998be59a 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -12,7 +12,6 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); - extern unsigned long __initdata pgt_buf_start; extern unsigned long __meminitdata pgt_buf_end; extern unsigned long __meminitdata pgt_buf_top; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 98ac76dc4eae..dd1a88832d25 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -602,7 +602,7 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ extern int direct_gbpages; -void probe_page_size_mask(void); +void init_mem_mapping(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 01fb5f9baf90..23b079fb93fc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -913,34 +913,9 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); init_gbpages(); - probe_page_size_mask(); - /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - int i; - unsigned long start, end; - unsigned long start_pfn, end_pfn; - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, - NULL) { - - end = PFN_PHYS(end_pfn); - if (end <= (1UL<<32)) - continue; - - start = PFN_PHYS(start_pfn); - max_pfn_mapped = init_memory_mapping( - max((1UL<<32), start), end); - } - - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; - } -#endif memblock.current_limit = get_max_mapped(); dma_contiguous_reserve(0); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 701abbc24735..9e17f9e18a21 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -37,7 +37,7 @@ struct map_range { static int page_size_mask; -void probe_page_size_mask(void) +static void __init probe_page_size_mask(void) { #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) /* @@ -315,6 +315,23 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, return ret >> PAGE_SHIFT; } +void __init init_mem_mapping(void) +{ + probe_page_size_mask(); + + /* max_pfn_mapped is updated here */ + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn< Date: Fri, 16 Nov 2012 19:38:42 -0800 Subject: x86, mm: Revert back good_end setting for 64bit After | commit 8548c84da2f47e71bbbe300f55edb768492575f7 | Author: Takashi Iwai | Date: Sun Oct 23 23:19:12 2011 +0200 | | x86: Fix S4 regression | | Commit 4b239f458 ("x86-64, mm: Put early page table high") causes a S4 | regression since 2.6.39, namely the machine reboots occasionally at S4 | resume. It doesn't happen always, overall rate is about 1/20. But, | like other bugs, once when this happens, it continues to happen. | | This patch fixes the problem by essentially reverting the memory | assignment in the older way. Have some page table around 512M again, that will prevent kdump to find 512M under 768M. We need revert that reverting, so we could put page table high again for 64bit. Takashi agreed that S4 regression could be something else. https://lkml.org/lkml/2012/6/15/182 Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-6-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9e17f9e18a21..dbef4ffe8d31 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -234,8 +234,8 @@ static void __init find_early_table_space(struct map_range *mr, int nr_range) #ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif good_end = max_pfn_mapped << PAGE_SHIFT; +#endif base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (!base) -- cgit v1.2.3 From 84f1ae30bb68d8da98bca7ff2c2b825b2ac8c9a5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:43 -0800 Subject: x86, mm: Change find_early_table_space() paramters call split_mem_range inside the function. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-7-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index dbef4ffe8d31..51f919febf64 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -196,12 +196,18 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB * pages. Then find enough contiguous space for those page tables. */ -static void __init find_early_table_space(struct map_range *mr, int nr_range) +static void __init find_early_table_space(unsigned long start, unsigned long end) { int i; unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long start = 0, good_end; + unsigned long good_end; phys_addr_t base; + struct map_range mr[NR_RANGE_MR]; + int nr_range; + + memset(mr, 0, sizeof(mr)); + nr_range = 0; + nr_range = split_mem_range(mr, nr_range, start, end); for (i = 0; i < nr_range; i++) { unsigned long range, extra; @@ -276,7 +282,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_bootmem) - find_early_table_space(mr, nr_range); + find_early_table_space(start, end); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, -- cgit v1.2.3 From c14fa0b63b5b4234667c03fdc3314c0881caa514 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:44 -0800 Subject: x86, mm: Find early page table buffer together We should not do that in every calling of init_memory_mapping. At the same time need to move down early_memtest, and could remove after_bootmem checking. -v2: fix one early_memtest with 32bit by passing max_pfn_mapped instead. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-8-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 66 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 51f919febf64..1ce0d033fafc 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -274,16 +274,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, memset(mr, 0, sizeof(mr)); nr_range = split_mem_range(mr, 0, start, end); - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ - if (!after_bootmem) - find_early_table_space(start, end); - for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); @@ -296,6 +286,36 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + return ret >> PAGE_SHIFT; +} + +void __init init_mem_mapping(void) +{ + probe_page_size_mask(); + + /* + * Find space for the kernel direct mapping tables. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. + */ +#ifdef CONFIG_X86_64 + find_early_table_space(0, max_pfn< max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn< pgt_buf_start) + if (pgt_buf_end > pgt_buf_start) x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), PFN_PHYS(pgt_buf_end)); - if (!after_bootmem) - early_memtest(start, end); + /* stop the wrong using */ + pgt_buf_top = 0; - return ret >> PAGE_SHIFT; -} - -void __init init_mem_mapping(void) -{ - probe_page_size_mask(); - - /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn< Date: Fri, 16 Nov 2012 19:38:45 -0800 Subject: x86, mm: Separate out calculate_table_space_size() It should take physical address range that will need to be mapped. find_early_table_space should take range that pgt buff should be in. Separating page table size calculating and finding early page table to reduce confusing. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-9-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1ce0d033fafc..7b961d0b1389 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -196,12 +196,10 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB * pages. Then find enough contiguous space for those page tables. */ -static void __init find_early_table_space(unsigned long start, unsigned long end) +static unsigned long __init calculate_table_space_size(unsigned long start, unsigned long end) { int i; unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long good_end; - phys_addr_t base; struct map_range mr[NR_RANGE_MR]; int nr_range; @@ -240,9 +238,17 @@ static void __init find_early_table_space(unsigned long start, unsigned long end #ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); - good_end = max_pfn_mapped << PAGE_SHIFT; #endif + return tables; +} + +static void __init find_early_table_space(unsigned long start, + unsigned long good_end, + unsigned long tables) +{ + phys_addr_t base; + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (!base) panic("Cannot find space for the kernel page tables"); @@ -250,10 +256,6 @@ static void __init find_early_table_space(unsigned long start, unsigned long end pgt_buf_start = base >> PAGE_SHIFT; pgt_buf_end = pgt_buf_start; pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); } /* @@ -291,6 +293,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, void __init init_mem_mapping(void) { + unsigned long tables, good_end, end; + probe_page_size_mask(); /* @@ -301,10 +305,18 @@ void __init init_mem_mapping(void) * nodes are discovered. */ #ifdef CONFIG_X86_64 - find_early_table_space(0, max_pfn< pgt_buf_start) + if (pgt_buf_end > pgt_buf_start) { + printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n", + end - 1, pgt_buf_start << PAGE_SHIFT, + (pgt_buf_end << PAGE_SHIFT) - 1); x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), PFN_PHYS(pgt_buf_end)); + } /* stop the wrong using */ pgt_buf_top = 0; -- cgit v1.2.3 From 66520ebc2df3fe52eb4792f8101fac573b766baf Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Fri, 16 Nov 2012 19:38:52 -0800 Subject: x86, mm: Only direct map addresses that are marked as E820_RAM Currently direct mappings are created for [ 0 to max_low_pfn< Link: http://lkml.kernel.org/r/1353123563-3103-16-git-send-email-yinghai@kernel.org Signed-off-by: Yinghai Lu Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 8 +-- arch/x86/kernel/setup.c | 8 ++- arch/x86/mm/init.c | 120 ++++++++++++++++++++++++++++++++++---- arch/x86/mm/init_64.c | 6 +- 4 files changed, 117 insertions(+), 25 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 45aae6e5f649..54c97879195e 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,13 +51,7 @@ static inline phys_addr_t get_max_mapped(void) return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; } -static inline bool pfn_range_is_mapped(unsigned long start_pfn, - unsigned long end_pfn) -{ - return end_pfn <= max_low_pfn_mapped || - (end_pfn > (1UL << (32 - PAGE_SHIFT)) && - end_pfn <= max_pfn_mapped); -} +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bd52f9da17cc..68dffeceb193 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -116,9 +116,11 @@ #include /* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. + * max_low_pfn_mapped: highest direct mapped pfn under 4GB + * max_pfn_mapped: highest direct mapped pfn over 4GB + * + * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7b961d0b1389..bb44e9f2cc49 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -243,6 +243,38 @@ static unsigned long __init calculate_table_space_size(unsigned long start, unsi return tables; } +static unsigned long __init calculate_all_table_space_size(void) +{ + unsigned long start_pfn, end_pfn; + unsigned long tables; + int i; + + /* the ISA range is always mapped regardless of memory holes */ + tables = calculate_table_space_size(0, ISA_END_ADDRESS); + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = start_pfn << PAGE_SHIFT; + u64 end = end_pfn << PAGE_SHIFT; + + if (end <= ISA_END_ADDRESS) + continue; + + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; +#ifdef CONFIG_X86_32 + /* on 32 bit, we only map up to max_low_pfn */ + if ((start >> PAGE_SHIFT) >= max_low_pfn) + continue; + + if ((end >> PAGE_SHIFT) > max_low_pfn) + end = max_low_pfn << PAGE_SHIFT; +#endif + tables += calculate_table_space_size(start, end); + } + + return tables; +} + static void __init find_early_table_space(unsigned long start, unsigned long good_end, unsigned long tables) @@ -258,6 +290,34 @@ static void __init find_early_table_space(unsigned long start, pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); } +static struct range pfn_mapped[E820_X_MAX]; +static int nr_pfn_mapped; + +static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX, + nr_pfn_mapped, start_pfn, end_pfn); + nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX); + + max_pfn_mapped = max(max_pfn_mapped, end_pfn); + + if (start_pfn < (1UL<<(32-PAGE_SHIFT))) + max_low_pfn_mapped = max(max_low_pfn_mapped, + min(end_pfn, 1UL<<(32-PAGE_SHIFT))); +} + +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + + for (i = 0; i < nr_pfn_mapped; i++) + if ((start_pfn >= pfn_mapped[i].start) && + (end_pfn <= pfn_mapped[i].end)) + return true; + + return false; +} + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -288,9 +348,55 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); + return ret >> PAGE_SHIFT; } +/* + * Iterate through E820 memory map and create direct mappings for only E820_RAM + * regions. We cannot simply create direct mappings for all pfns from + * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in + * high addresses that cannot be marked as UC by fixed/variable range MTRRs. + * Depending on the alignment of E820 ranges, this may possibly result in using + * smaller size (i.e. 4K instead of 2M or 1G) page tables. + */ +static void __init init_all_memory_mapping(void) +{ + unsigned long start_pfn, end_pfn; + int i; + + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = (u64)start_pfn << PAGE_SHIFT; + u64 end = (u64)end_pfn << PAGE_SHIFT; + + if (end <= ISA_END_ADDRESS) + continue; + + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; +#ifdef CONFIG_X86_32 + /* on 32 bit, we only map up to max_low_pfn */ + if ((start >> PAGE_SHIFT) >= max_low_pfn) + continue; + + if ((end >> PAGE_SHIFT) > max_low_pfn) + end = max_low_pfn << PAGE_SHIFT; +#endif + init_memory_mapping(start, end); + } + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#endif +} + void __init init_mem_mapping(void) { unsigned long tables, good_end, end; @@ -311,23 +417,15 @@ void __init init_mem_mapping(void) end = max_low_pfn << PAGE_SHIFT; good_end = max_pfn_mapped << PAGE_SHIFT; #endif - tables = calculate_table_space_size(0, end); + tables = calculate_all_table_space_size(); find_early_table_space(0, good_end, tables); printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n", end - 1, pgt_buf_start << PAGE_SHIFT, (pgt_buf_top << PAGE_SHIFT) - 1); - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn<node_zones + ZONE_NORMAL; - unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - last_mapped_pfn = init_memory_mapping(start, start + size); - if (last_mapped_pfn > max_pfn_mapped) - max_pfn_mapped = last_mapped_pfn; + init_memory_mapping(start, start + size); ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); -- cgit v1.2.3 From aeebe84cc96cde4181807bc67c300c550d0ef123 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:55 -0800 Subject: x86, mm: Use big page size for small memory range We could map small range in the middle of big range at first, so should use big page size at first to avoid using small page size to break down page table. Only can set big page bit when that range has ram area around it. -v2: fix 32bit boundary checking. We can not count ram above max_low_pfn for 32 bit. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-19-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bb44e9f2cc49..da591ebc8d12 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -88,6 +88,40 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, return nr_range; } +/* + * adjust the page_size_mask for small range to go with + * big page size instead small one if nearby are ram too. + */ +static void __init_refok adjust_range_page_size_mask(struct map_range *mr, + int nr_range) +{ + int i; + + for (i = 0; i < nr_range; i++) { + if ((page_size_mask & (1<> PAGE_SHIFT) > max_low_pfn) + continue; +#endif + + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1< Date: Fri, 16 Nov 2012 19:38:57 -0800 Subject: x86, mm: Break down init_all_memory_mapping Will replace that with top-down page table initialization. New API need to take range: init_range_memory_mapping() Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-21-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index da591ebc8d12..c688ea3887f2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -398,40 +398,30 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * Depending on the alignment of E820 ranges, this may possibly result in using * smaller size (i.e. 4K instead of 2M or 1G) page tables. */ -static void __init init_all_memory_mapping(void) +static void __init init_range_memory_mapping(unsigned long range_start, + unsigned long range_end) { unsigned long start_pfn, end_pfn; int i; - /* the ISA range is always mapped regardless of memory holes */ - init_memory_mapping(0, ISA_END_ADDRESS); - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { u64 start = (u64)start_pfn << PAGE_SHIFT; u64 end = (u64)end_pfn << PAGE_SHIFT; - if (end <= ISA_END_ADDRESS) + if (end <= range_start) continue; - if (start < ISA_END_ADDRESS) - start = ISA_END_ADDRESS; -#ifdef CONFIG_X86_32 - /* on 32 bit, we only map up to max_low_pfn */ - if ((start >> PAGE_SHIFT) >= max_low_pfn) + if (start < range_start) + start = range_start; + + if (start >= range_end) continue; - if ((end >> PAGE_SHIFT) > max_low_pfn) - end = max_low_pfn << PAGE_SHIFT; -#endif - init_memory_mapping(start, end); - } + if (end > range_end) + end = range_end; -#ifdef CONFIG_X86_64 - if (max_pfn > max_low_pfn) { - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; + init_memory_mapping(start, end); } -#endif } void __init init_mem_mapping(void) @@ -461,8 +451,15 @@ void __init init_mem_mapping(void) (pgt_buf_top << PAGE_SHIFT) - 1); max_pfn_mapped = 0; /* will get exact value next */ - init_all_memory_mapping(); - + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + init_range_memory_mapping(ISA_END_ADDRESS, end); +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#endif /* * Reserve the kernel pagetable pages we used (pgt_buf_start - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) -- cgit v1.2.3 From 8d57470d8f859635deffe3919d7d4867b488b85a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:58 -0800 Subject: x86, mm: setup page table in top-down Get pgt_buf early from BRK, and use it to map PMD_SIZE from top at first. Then use mapped pages to map more ranges below, and keep looping until all pages get mapped. alloc_low_page will use page from BRK at first, after that buffer is used up, will use memblock to find and reserve pages for page table usage. Introduce min_pfn_mapped to make sure find new pages from mapped ranges, that will be updated when lower pages get mapped. Also add step_size to make sure that don't try to map too big range with limited mapped pages initially, and increase the step_size when we have more mapped pages on hand. We don't need to call pagetable_reserve anymore, reserve work is done in alloc_low_page() directly. At last we can get rid of calculation and find early pgt related code. -v2: update to after fix_xen change, also use MACRO for initial pgt_buf size and add comments with it. -v3: skip big reserved range in memblock.reserved near end. -v4: don't need fix_xen change now. -v5: add changelog about moving about reserving pagetable to alloc_low_page. Suggested-by: "H. Peter Anvin" Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-22-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 1 + arch/x86/include/asm/pgtable.h | 1 + arch/x86/kernel/setup.c | 3 + arch/x86/mm/init.c | 210 +++++++++++--------------------------- arch/x86/mm/init_32.c | 17 ++- arch/x86/mm/init_64.c | 17 ++- 6 files changed, 94 insertions(+), 155 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 54c97879195e..9f6f3e66e84d 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -45,6 +45,7 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern unsigned long min_pfn_mapped; static inline phys_addr_t get_max_mapped(void) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index dd1a88832d25..6991a3e1bf81 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -603,6 +603,7 @@ static inline int pgd_none(pgd_t pgd) extern int direct_gbpages; void init_mem_mapping(void); +void early_alloc_pgt_buf(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 94f922a73c54..f7634092931b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -124,6 +124,7 @@ */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +unsigned long min_pfn_mapped; #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); @@ -900,6 +901,8 @@ void __init setup_arch(char **cmdline_p) reserve_ibft_region(); + early_alloc_pgt_buf(); + /* * Need to conclude brk, before memblock_x86_fill() * it could use memblock_find_in_range, could overlap with diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c688ea3887f2..2393d0099e7f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -21,6 +21,21 @@ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); +void __init early_alloc_pgt_buf(void) +{ + unsigned long tables = INIT_PGT_BUF_SIZE; + phys_addr_t base; + + base = __pa(extend_brk(tables, PAGE_SIZE)); + + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); +} + int after_bootmem; int direct_gbpages @@ -228,105 +243,6 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } -/* - * First calculate space needed for kernel direct mapping page tables to cover - * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB - * pages. Then find enough contiguous space for those page tables. - */ -static unsigned long __init calculate_table_space_size(unsigned long start, unsigned long end) -{ - int i; - unsigned long puds = 0, pmds = 0, ptes = 0, tables; - struct map_range mr[NR_RANGE_MR]; - int nr_range; - - memset(mr, 0, sizeof(mr)); - nr_range = 0; - nr_range = split_mem_range(mr, nr_range, start, end); - - for (i = 0; i < nr_range; i++) { - unsigned long range, extra; - - range = mr[i].end - mr[i].start; - puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - - if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { - extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); - pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else { - pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; - } - - if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { - extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else { - ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; - } - } - - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - - return tables; -} - -static unsigned long __init calculate_all_table_space_size(void) -{ - unsigned long start_pfn, end_pfn; - unsigned long tables; - int i; - - /* the ISA range is always mapped regardless of memory holes */ - tables = calculate_table_space_size(0, ISA_END_ADDRESS); - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - u64 start = start_pfn << PAGE_SHIFT; - u64 end = end_pfn << PAGE_SHIFT; - - if (end <= ISA_END_ADDRESS) - continue; - - if (start < ISA_END_ADDRESS) - start = ISA_END_ADDRESS; -#ifdef CONFIG_X86_32 - /* on 32 bit, we only map up to max_low_pfn */ - if ((start >> PAGE_SHIFT) >= max_low_pfn) - continue; - - if ((end >> PAGE_SHIFT) > max_low_pfn) - end = max_low_pfn << PAGE_SHIFT; -#endif - tables += calculate_table_space_size(start, end); - } - - return tables; -} - -static void __init find_early_table_space(unsigned long start, - unsigned long good_end, - unsigned long tables) -{ - phys_addr_t base; - - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) - panic("Cannot find space for the kernel page tables"); - - pgt_buf_start = base >> PAGE_SHIFT; - pgt_buf_end = pgt_buf_start; - pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); -} - static struct range pfn_mapped[E820_X_MAX]; static int nr_pfn_mapped; @@ -391,17 +307,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, } /* - * Iterate through E820 memory map and create direct mappings for only E820_RAM - * regions. We cannot simply create direct mappings for all pfns from - * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in - * high addresses that cannot be marked as UC by fixed/variable range MTRRs. - * Depending on the alignment of E820 ranges, this may possibly result in using - * smaller size (i.e. 4K instead of 2M or 1G) page tables. + * would have hole in the middle or ends, and only ram parts will be mapped. */ -static void __init init_range_memory_mapping(unsigned long range_start, +static unsigned long __init init_range_memory_mapping( + unsigned long range_start, unsigned long range_end) { unsigned long start_pfn, end_pfn; + unsigned long mapped_ram_size = 0; int i; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { @@ -421,71 +334,70 @@ static void __init init_range_memory_mapping(unsigned long range_start, end = range_end; init_memory_mapping(start, end); + + mapped_ram_size += end - start; } + + return mapped_ram_size; } +/* (PUD_SHIFT-PMD_SHIFT)/2 */ +#define STEP_SIZE_SHIFT 5 void __init init_mem_mapping(void) { - unsigned long tables, good_end, end; + unsigned long end, real_end, start, last_start; + unsigned long step_size; + unsigned long addr; + unsigned long mapped_ram_size = 0; + unsigned long new_mapped_ram_size; probe_page_size_mask(); - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ #ifdef CONFIG_X86_64 end = max_pfn << PAGE_SHIFT; - good_end = end; #else end = max_low_pfn << PAGE_SHIFT; - good_end = max_pfn_mapped << PAGE_SHIFT; #endif - tables = calculate_all_table_space_size(); - find_early_table_space(0, good_end, tables); - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n", - end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); - max_pfn_mapped = 0; /* will get exact value next */ /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS); - init_range_memory_mapping(ISA_END_ADDRESS, end); + + /* xen has big range in reserved near end of ram, skip it at first */ + addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, + PAGE_SIZE); + real_end = addr + PMD_SIZE; + + /* step_size need to be small so pgt_buf from BRK could cover it */ + step_size = PMD_SIZE; + max_pfn_mapped = 0; /* will get exact value next */ + min_pfn_mapped = real_end >> PAGE_SHIFT; + last_start = start = real_end; + while (last_start > ISA_END_ADDRESS) { + if (last_start > step_size) { + start = round_down(last_start - 1, step_size); + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; + } else + start = ISA_END_ADDRESS; + new_mapped_ram_size = init_range_memory_mapping(start, + last_start); + last_start = start; + min_pfn_mapped = last_start >> PAGE_SHIFT; + /* only increase step_size after big range get mapped */ + if (new_mapped_ram_size > mapped_ram_size) + step_size <<= STEP_SIZE_SHIFT; + mapped_ram_size += new_mapped_ram_size; + } + + if (real_end < end) + init_range_memory_mapping(real_end, end); + #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { /* can we preseve max_low_pfn ?*/ max_low_pfn = max_pfn; } #endif - /* - * Reserve the kernel pagetable pages we used (pgt_buf_start - - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) - * so that they can be reused for other purposes. - * - * On native it just means calling memblock_reserve, on Xen it also - * means marking RW the pagetable pages that we allocated before - * but that haven't been used. - * - * In fact on xen we mark RO the whole range pgt_buf_start - - * pgt_buf_top, because we have to make sure that when - * init_memory_mapping reaches the pagetable pages area, it maps - * RO all the pagetable pages, including the ones that are beyond - * pgt_buf_end at that time. - */ - if (pgt_buf_end > pgt_buf_start) { - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n", - end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_end << PAGE_SHIFT) - 1); - x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), - PFN_PHYS(pgt_buf_end)); - } - - /* stop the wrong using */ - pgt_buf_top = 0; - early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 27f7fc69cf8a..7bb11064a9e1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -61,11 +61,22 @@ bool __read_mostly __vmalloc_start_set = false; static __init void *alloc_low_page(void) { - unsigned long pfn = pgt_buf_end++; + unsigned long pfn; void *adr; - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; adr = __va(pfn * PAGE_SIZE); clear_page(adr); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index fa28e3e29741..eefaea637b3d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -316,7 +316,7 @@ void __init cleanup_highmap(void) static __ref void *alloc_low_page(unsigned long *phys) { - unsigned long pfn = pgt_buf_end++; + unsigned long pfn; void *adr; if (after_bootmem) { @@ -326,8 +326,19 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); clear_page(adr); -- cgit v1.2.3 From 5c51bdbe4c74dce7996d0bbfa39974775cc3f13c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:01 -0800 Subject: x86, mm: Merge alloc_low_page between 64bit and 32bit They are almost same except 64 bit need to handle after_bootmem case. Add mm_internal.h to make that alloc_low_page() only to be accessible from arch/x86/mm/init*.c Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-25-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 26 ++------------------------ arch/x86/mm/init_64.c | 32 ++------------------------------ arch/x86/mm/mm_internal.h | 6 ++++++ 4 files changed, 44 insertions(+), 54 deletions(-) create mode 100644 arch/x86/mm/mm_internal.h (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2393d0099e7f..848189229b2d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -17,10 +17,44 @@ #include #include /* for MAX_DMA_PFN */ +#include "mm_internal.h" + unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +__ref void *alloc_low_page(void) +{ + unsigned long pfn; + void *adr; + +#ifdef CONFIG_X86_64 + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + + return adr; + } +#endif + + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; + + adr = __va(pfn * PAGE_SIZE); + clear_page(adr); + return adr; +} + /* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ #define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7bb11064a9e1..a7f2df1cdcfd 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -53,36 +53,14 @@ #include #include +#include "mm_internal.h" + unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); bool __read_mostly __vmalloc_start_set = false; -static __init void *alloc_low_page(void) -{ - unsigned long pfn; - void *adr; - - if ((pgt_buf_end + 1) >= pgt_buf_top) { - unsigned long ret; - if (min_pfn_mapped >= max_pfn_mapped) - panic("alloc_low_page: ran out of memory"); - ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE, PAGE_SIZE); - if (!ret) - panic("alloc_low_page: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE); - pfn = ret >> PAGE_SHIFT; - } else - pfn = pgt_buf_end++; - - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; -} - /* * Creates a middle page table and puts a pointer to it in the * given global directory entry. This only returns the gd entry diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 19608203fa5d..1d53defc99c9 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -54,6 +54,8 @@ #include #include +#include "mm_internal.h" + static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -314,36 +316,6 @@ void __init cleanup_highmap(void) } } -static __ref void *alloc_low_page(void) -{ - unsigned long pfn; - void *adr; - - if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); - - return adr; - } - - if ((pgt_buf_end + 1) >= pgt_buf_top) { - unsigned long ret; - if (min_pfn_mapped >= max_pfn_mapped) - panic("alloc_low_page: ran out of memory"); - ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE, PAGE_SIZE); - if (!ret) - panic("alloc_low_page: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE); - pfn = ret >> PAGE_SHIFT; - } else - pfn = pgt_buf_end++; - - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; -} - static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, pgprot_t prot) diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h new file mode 100644 index 000000000000..b3f993a2555e --- /dev/null +++ b/arch/x86/mm/mm_internal.h @@ -0,0 +1,6 @@ +#ifndef __X86_MM_INTERNAL_H +#define __X86_MM_INTERNAL_H + +void *alloc_low_page(void); + +#endif /* __X86_MM_INTERNAL_H */ -- cgit v1.2.3 From 9985b4c6fa7d660f685918a58282275e9e35d8e0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:02 -0800 Subject: x86, mm: Move min_pfn_mapped back to mm/init.c Also change it to static. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-26-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 1 - arch/x86/kernel/setup.c | 1 - arch/x86/mm/init.c | 2 ++ 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 9f6f3e66e84d..54c97879195e 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -45,7 +45,6 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; -extern unsigned long min_pfn_mapped; static inline phys_addr_t get_max_mapped(void) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7634092931b..20151941cce8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -124,7 +124,6 @@ */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; -unsigned long min_pfn_mapped; #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 848189229b2d..6392bf9a3947 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -23,6 +23,8 @@ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +static unsigned long min_pfn_mapped; + __ref void *alloc_low_page(void) { unsigned long pfn; -- cgit v1.2.3 From 6f80b68e9e515547edbacb0c37491730bf766db5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:03 -0800 Subject: x86, mm, Xen: Remove mapping_pagetable_reserve() Page table area are pre-mapped now after x86, mm: setup page table in top-down x86, mm: Remove early_memremap workaround for page table accessing on 64bit mapping_pagetable_reserve is not used anymore, so remove it. Also remove operation in mask_rw_pte(), as modified allow_low_page always return pages that are already mapped, moreover xen_alloc_pte_init, xen_alloc_pmd_init, etc, will mark the page RO before hooking it into the pagetable automatically. -v2: add changelog about mask_rw_pte() from Stefano. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-27-git-send-email-yinghai@kernel.org Cc: Stefano Stabellini Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_types.h | 1 - arch/x86/include/asm/x86_init.h | 12 ------------ arch/x86/kernel/x86_init.c | 4 ---- arch/x86/mm/init.c | 4 ---- arch/x86/xen/mmu.c | 28 ---------------------------- 5 files changed, 49 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ec8a1fc9505d..79738f20aaf5 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -301,7 +301,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); -extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); #else diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 57693498519c..3b2ce8fc995a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -68,17 +68,6 @@ struct x86_init_oem { void (*banner)(void); }; -/** - * struct x86_init_mapping - platform specific initial kernel pagetable setup - * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage - * - * For more details on the purpose of this hook, look in - * init_memory_mapping and the commit that added it. - */ -struct x86_init_mapping { - void (*pagetable_reserve)(u64 start, u64 end); -}; - /** * struct x86_init_paging - platform specific paging functions * @pagetable_init: platform specific paging initialization call to setup @@ -136,7 +125,6 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; - struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 7a3d075a814a..50cf83ecd32e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -62,10 +62,6 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, - .mapping = { - .pagetable_reserve = native_pagetable_reserve, - }, - .paging = { .pagetable_init = native_pagetable_init, }, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6392bf9a3947..21173fcdb4a1 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -112,10 +112,6 @@ static void __init probe_page_size_mask(void) __supported_pte_mask |= _PAGE_GLOBAL; } } -void __init native_pagetable_reserve(u64 start, u64 end) -{ - memblock_reserve(start, end - start); -} #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dcf5f2dd91ec..bbb883f58bc4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); -static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) -{ - /* reserve the range used */ - native_pagetable_reserve(start, end); - - /* set as RW the rest */ - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, - PFN_PHYS(pgt_buf_top)); - while (end < PFN_PHYS(pgt_buf_top)) { - make_lowmem_page_readwrite(__va(end)); - end += PAGE_SIZE; - } -} - #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { - unsigned long pfn = pte_pfn(pte); - - /* - * If the new pfn is within the range of the newly allocated - * kernel pagetable, and it isn't being mapped into an - * early_ioremap fixmap slot as a freshly allocated page, make sure - * it is RO. - */ - if (((!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_top)) || - (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) - pte = pte_wrprotect(pte); - return pte; } #endif /* CONFIG_X86_64 */ @@ -2197,7 +2170,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { - x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_init = xen_pagetable_init; pv_mmu_ops = xen_mmu_ops; -- cgit v1.2.3 From 22c8ca2ac256bb681be791858b35502b5d37e73b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:04 -0800 Subject: x86, mm: Add alloc_low_pages(num) 32bit kmap mapping needs pages to be used for low to high. At this point those pages are still from pgt_buf_* from BRK, so it is ok now. But we want to move early_ioremap_page_table_range_init() out of init_memory_mapping() and only call it one time later, that will make page_table_range_init/page_table_kmap_check/alloc_low_page to use memblock to get page. memblock allocation for pages are from high to low. So will get panic from page_table_kmap_check() that has BUG_ON to do ordering checking. This patch add alloc_low_pages to make it possible to allocate serveral pages at first, and hand out pages one by one from low to high. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-28-git-send-email-yinghai@kernel.org Cc: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 33 +++++++++++++++++++++------------ arch/x86/mm/mm_internal.h | 6 +++++- 2 files changed, 26 insertions(+), 13 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 21173fcdb4a1..02cea14c6d0c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -25,36 +25,45 @@ unsigned long __meminitdata pgt_buf_top; static unsigned long min_pfn_mapped; -__ref void *alloc_low_page(void) +__ref void *alloc_low_pages(unsigned int num) { unsigned long pfn; - void *adr; + int i; #ifdef CONFIG_X86_64 if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + unsigned int order; - return adr; + order = get_order((unsigned long)num << PAGE_SHIFT); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | + __GFP_ZERO, order); } #endif - if ((pgt_buf_end + 1) >= pgt_buf_top) { + if ((pgt_buf_end + num) >= pgt_buf_top) { unsigned long ret; if (min_pfn_mapped >= max_pfn_mapped) panic("alloc_low_page: ran out of memory"); ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE, PAGE_SIZE); + PAGE_SIZE * num , PAGE_SIZE); if (!ret) panic("alloc_low_page: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE); + memblock_reserve(ret, PAGE_SIZE * num); pfn = ret >> PAGE_SHIFT; - } else - pfn = pgt_buf_end++; + } else { + pfn = pgt_buf_end; + pgt_buf_end += num; + } + + for (i = 0; i < num; i++) { + void *adr; + + adr = __va((pfn + i) << PAGE_SHIFT); + clear_page(adr); + } - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; + return __va(pfn << PAGE_SHIFT); } /* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index b3f993a2555e..7e3b88ee078a 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -1,6 +1,10 @@ #ifndef __X86_MM_INTERNAL_H #define __X86_MM_INTERNAL_H -void *alloc_low_page(void); +void *alloc_low_pages(unsigned int num); +static inline void *alloc_low_page(void) +{ + return alloc_low_pages(1); +} #endif /* __X86_MM_INTERNAL_H */ -- cgit v1.2.3 From ddd3509df8f8d4f1cf4784f559d702ce00dc8846 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 16 Nov 2012 19:39:05 -0800 Subject: x86, mm: Add pointer about Xen mmu requirement for alloc_low_pages Add link for more information 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve -v2: updated to commets from hpa to include commit name. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-29-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 02cea14c6d0c..cb4f8ba70ecc 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -25,6 +25,15 @@ unsigned long __meminitdata pgt_buf_top; static unsigned long min_pfn_mapped; +/* + * Pages returned are already directly mapped. + * + * Changing that is likely to break Xen, see commit: + * + * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve + * + * for detailed information. + */ __ref void *alloc_low_pages(unsigned int num) { unsigned long pfn; -- cgit v1.2.3 From 719272c45b821d38608fc333700bde1a89c56c59 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:06 -0800 Subject: x86, mm: only call early_ioremap_page_table_range_init() once On 32bit, before patcheset that only set page table for ram, we only call that one time. Now, we are calling that during every init_memory_mapping if we have holes under max_low_pfn. We should only call it one time after all ranges under max_low_page get mapped just like we did before. Also that could avoid the risk to run out of pgt_buf in BRK. Need to update page_table_range_init() to count the pages for kmap page table at first, and use new added alloc_low_pages() to get pages in sequence. That will conform to the requirement that pages need to be in low to high order. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-30-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 13 +++++-------- arch/x86/mm/init_32.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 14 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cb4f8ba70ecc..bed4888c6f4f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -343,14 +343,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); -#ifdef CONFIG_X86_32 - early_ioremap_page_table_range_init(); - - load_cr3(swapper_pg_dir); -#endif - - __flush_tlb_all(); - add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); return ret >> PAGE_SHIFT; @@ -447,7 +439,12 @@ void __init init_mem_mapping(void) /* can we preseve max_low_pfn ?*/ max_low_pfn = max_pfn; } +#else + early_ioremap_page_table_range_init(); + load_cr3(swapper_pg_dir); + __flush_tlb_all(); #endif + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a7f2df1cdcfd..0ae1ba8bc1b9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -135,8 +135,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr) return one_page_table_init(pmd) + pte_idx; } +static unsigned long __init +page_table_range_init_count(unsigned long start, unsigned long end) +{ + unsigned long count = 0; +#ifdef CONFIG_HIGHMEM + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + if (pmd_idx_kmap_begin == pmd_idx_kmap_end) + return 0; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd_idx++) { + if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && + (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) + count++; + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +#endif + return count; +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, - unsigned long vaddr, pte_t *lastpte) + unsigned long vaddr, pte_t *lastpte, + void **adr) { #ifdef CONFIG_HIGHMEM /* @@ -150,16 +181,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin - && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start - || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) { pte_t *newpte; int i; BUG_ON(after_bootmem); - newpte = alloc_low_page(); + newpte = *adr; for (i = 0; i < PTRS_PER_PTE; i++) set_pte(newpte + i, pte[i]); + *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE); paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); @@ -193,6 +223,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) pgd_t *pgd; pmd_t *pmd; pte_t *pte = NULL; + unsigned long count = page_table_range_init_count(start, end); + void *adr = NULL; + + if (count) + adr = alloc_low_pages(count); vaddr = start; pgd_idx = pgd_index(vaddr); @@ -205,7 +240,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { pte = page_table_kmap_check(one_page_table_init(pmd), - pmd, vaddr, pte); + pmd, vaddr, pte, &adr); vaddr += PMD_SIZE; } -- cgit v1.2.3 From cf47065961b48727b4e47bc3e2e67f4996878437 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:07 -0800 Subject: x86, mm: Move back pgt_buf_* to mm/init.c Also change them to static. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-31-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/init.h | 4 ---- arch/x86/mm/init.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 4f13998be59a..626ea8d31b6d 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -12,8 +12,4 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); -extern unsigned long __initdata pgt_buf_start; -extern unsigned long __meminitdata pgt_buf_end; -extern unsigned long __meminitdata pgt_buf_top; - #endif /* _ASM_X86_INIT_32_H */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bed4888c6f4f..3cadf1013cea 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -19,9 +19,9 @@ #include "mm_internal.h" -unsigned long __initdata pgt_buf_start; -unsigned long __meminitdata pgt_buf_end; -unsigned long __meminitdata pgt_buf_top; +static unsigned long __initdata pgt_buf_start; +static unsigned long __initdata pgt_buf_end; +static unsigned long __initdata pgt_buf_top; static unsigned long min_pfn_mapped; -- cgit v1.2.3 From 148b20989e0b83cb301e1fcd9e987c7abde05333 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:08 -0800 Subject: x86, mm: Move init_gbpages() out of setup.c Put it in mm/init.c, and call it from probe_page_mask(). init_mem_mapping is calling probe_page_mask at first. So calling sequence is not changed. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-32-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 15 +-------------- arch/x86/mm/init.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 20151941cce8..85b62f1c8071 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -282,18 +282,7 @@ void * __init extend_brk(size_t size, size_t align) return ret; } -#ifdef CONFIG_X86_64 -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} -#else -static inline void init_gbpages(void) -{ -} +#ifdef CONFIG_X86_32 static void __init cleanup_highmap(void) { } @@ -933,8 +922,6 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); - init_gbpages(); - init_mem_mapping(); memblock.current_limit = get_max_mapped(); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3cadf1013cea..8168bf8fcda7 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -98,6 +98,16 @@ int direct_gbpages #endif ; +static void __init init_gbpages(void) +{ +#ifdef CONFIG_X86_64 + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +#endif +} + struct map_range { unsigned long start; unsigned long end; @@ -108,6 +118,8 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { + init_gbpages(); + #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. -- cgit v1.2.3 From 5a0d3aeeeffbd1534a510fc10c4ab7c99c45afce Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:12 -0800 Subject: x86, mm: use round_up/down in split_mem_range() to replace own inline version for those roundup and rounddown. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-36-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8168bf8fcda7..0e625e606e5d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -218,13 +218,11 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * slowdowns. */ if (pos == 0) - end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + end_pfn = PMD_SIZE >> PAGE_SHIFT; else - end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; #endif if (end_pfn > (end >> PAGE_SHIFT)) end_pfn = end >> PAGE_SHIFT; @@ -234,15 +232,13 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, } /* big page (2M) range */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; #ifdef CONFIG_X86_32 - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); + end_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT; + if (end_pfn > (round_down(end, PMD_SIZE) >> PAGE_SHIFT)) + end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; #endif if (start_pfn < end_pfn) { @@ -253,9 +249,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, #ifdef CONFIG_X86_64 /* big page (1G) range */ - start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT; + end_pfn = round_down(end, PUD_SIZE) >> PAGE_SHIFT; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & @@ -264,9 +259,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, } /* tail is not big page (1G) alignment */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1< Date: Fri, 16 Nov 2012 19:39:13 -0800 Subject: x86, mm: use PFN_DOWN in split_mem_range() to replace own inline version for shifting. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-37-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 0e625e606e5d..1cca052b2cbd 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -208,8 +208,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, int i; /* head if not big page alignment ? */ - start_pfn = start >> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; + start_pfn = PFN_DOWN(start); + pos = PFN_PHYS(start_pfn); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory @@ -218,59 +218,59 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * slowdowns. */ if (pos == 0) - end_pfn = PMD_SIZE >> PAGE_SHIFT; + end_pfn = PFN_DOWN(PMD_SIZE); else - end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); #endif - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; + if (end_pfn > PFN_DOWN(end)) + end_pfn = PFN_DOWN(end); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; + pos = PFN_PHYS(end_pfn); } /* big page (2M) range */ - start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); #ifdef CONFIG_X86_32 - end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT; - if (end_pfn > (round_down(end, PMD_SIZE) >> PAGE_SHIFT)) - end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_up(pos, PUD_SIZE)); + if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE))) + end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #endif if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<> PAGE_SHIFT; - end_pfn = round_down(end, PUD_SIZE) >> PAGE_SHIFT; + start_pfn = PFN_DOWN(round_up(pos, PUD_SIZE)); + end_pfn = PFN_DOWN(round_down(end, PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<> PAGE_SHIFT; - end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; + start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; + start_pfn = PFN_DOWN(pos); + end_pfn = PFN_DOWN(end); nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); /* try to merge same page size and continuous */ -- cgit v1.2.3 From 1829ae9ad7380bf17333ab9ad1610631d9cb8664 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:14 -0800 Subject: x86, mm: use pfn instead of pos in split_mem_range could save some bit shifting operations. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-38-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1cca052b2cbd..4bf1c5374928 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -204,12 +204,11 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long end) { unsigned long start_pfn, end_pfn; - unsigned long pos; + unsigned long pfn; int i; /* head if not big page alignment ? */ - start_pfn = PFN_DOWN(start); - pos = PFN_PHYS(start_pfn); + pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory @@ -217,26 +216,26 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * and overlapping MTRRs into large pages can cause * slowdowns. */ - if (pos == 0) + if (pfn == 0) end_pfn = PFN_DOWN(PMD_SIZE); else - end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif if (end_pfn > PFN_DOWN(end)) end_pfn = PFN_DOWN(end); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = PFN_PHYS(end_pfn); + pfn = end_pfn; } /* big page (2M) range */ - start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = PFN_DOWN(round_up(pos, PUD_SIZE)); + end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE))) end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #endif @@ -244,32 +243,32 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1< Date: Fri, 16 Nov 2012 19:39:15 -0800 Subject: x86, mm: use limit_pfn for end pfn instead of shifting end to get that. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-39-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4bf1c5374928..f410dc6f843e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -203,10 +203,12 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long start, unsigned long end) { - unsigned long start_pfn, end_pfn; + unsigned long start_pfn, end_pfn, limit_pfn; unsigned long pfn; int i; + limit_pfn = PFN_DOWN(end); + /* head if not big page alignment ? */ pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 @@ -223,8 +225,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif - if (end_pfn > PFN_DOWN(end)) - end_pfn = PFN_DOWN(end); + if (end_pfn > limit_pfn) + end_pfn = limit_pfn; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pfn = end_pfn; @@ -233,11 +235,11 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, /* big page (2M) range */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 - end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); - if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE))) - end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); + if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #endif if (start_pfn < end_pfn) { @@ -249,7 +251,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, #ifdef CONFIG_X86_64 /* big page (1G) range */ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); - end_pfn = PFN_DOWN(round_down(end, PUD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & @@ -259,7 +261,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, /* tail is not big page (1G) alignment */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); - end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1< Date: Fri, 16 Nov 2012 19:39:16 -0800 Subject: x86, mm: Unifying after_bootmem for 32bit and 64bit after_bootmem has different meaning in 32bit and 64bit. 32bit: after bootmem is ready 64bit: after bootmem is distroyed Let's merget them make 32bit the same as 64bit. for 32bit, it is mixing alloc_bootmem_pages, and alloc_low_page under after_bootmem is set or not set. alloc_bootmem is just wrapper for memblock for x86. Now we have alloc_low_page() with memblock too. We can drop bootmem path now, and only alloc_low_page only. At the same time, we make alloc_low_page could handle real after_bootmem for 32bit, because alloc_bootmem_pages could fallback to use slab too. At last move after_bootmem set position for 32bit the same as 64bit. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-40-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 2 -- arch/x86/mm/init_32.c | 21 ++++----------------- 2 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f410dc6f843e..2a27e5ac1e3c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -39,7 +39,6 @@ __ref void *alloc_low_pages(unsigned int num) unsigned long pfn; int i; -#ifdef CONFIG_X86_64 if (after_bootmem) { unsigned int order; @@ -47,7 +46,6 @@ __ref void *alloc_low_pages(unsigned int num) return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | __GFP_ZERO, order); } -#endif if ((pgt_buf_end + num) >= pgt_buf_top) { unsigned long ret; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 19ef9f018012..f4fc4a28393a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -73,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); - else - pmd_table = (pmd_t *)alloc_low_page(); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -98,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) static pte_t * __init one_page_table_init(pmd_t *pmd) { if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { - pte_t *page_table = NULL; - - if (after_bootmem) { -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); -#endif - if (!page_table) - page_table = - (pte_t *)alloc_bootmem_pages(PAGE_SIZE); - } else - page_table = (pte_t *)alloc_low_page(); + pte_t *page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -708,8 +695,6 @@ void __init setup_bootmem_allocator(void) printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped< Date: Fri, 16 Nov 2012 19:39:18 -0800 Subject: x86, mm: Use clamp_t() in init_range_memory_mapping save some lines, and make code more readable. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-42-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2a27e5ac1e3c..6f85de8a1f28 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -357,31 +357,20 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * would have hole in the middle or ends, and only ram parts will be mapped. */ static unsigned long __init init_range_memory_mapping( - unsigned long range_start, - unsigned long range_end) + unsigned long r_start, + unsigned long r_end) { unsigned long start_pfn, end_pfn; unsigned long mapped_ram_size = 0; int i; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - u64 start = (u64)start_pfn << PAGE_SHIFT; - u64 end = (u64)end_pfn << PAGE_SHIFT; - - if (end <= range_start) - continue; - - if (start < range_start) - start = range_start; - - if (start >= range_end) + u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); + u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); + if (start >= end) continue; - if (end > range_end) - end = range_end; - init_memory_mapping(start, end); - mapped_ram_size += end - start; } -- cgit v1.2.3 From c9b3234a6abadaa12684083d39552939baaed1f4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:42 -0800 Subject: x86, mm: Fix page table early allocation offset checking During debugging loading kernel above 4G, found that one page is not used in pre-allocated BRK area for early page allocation. pgt_buf_top is address that can not be used, so should check if that new end is above that top, otherwise last page will not be used. Fix that checking and also add print out for allocation from pre-allocated BRK area to catch possible bugs later. But after we get back that page for pgt, it tiggers one bug in pgt allocation with xen: We need to avoid to use page as pgt to map range that is overlapping with that pgt page. Add checking about overlapping, when it happens, use memblock allocation instead. That fixes crash on Xen PV guest with 2G that Stefan found. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-2-git-send-email-yinghai@kernel.org Acked-by: Stefano Stabellini Tested-by: Stefano Stabellini Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6f85de8a1f28..78d1ef3eab66 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -25,6 +25,8 @@ static unsigned long __initdata pgt_buf_top; static unsigned long min_pfn_mapped; +static bool __initdata can_use_brk_pgt = true; + /* * Pages returned are already directly mapped. * @@ -47,7 +49,7 @@ __ref void *alloc_low_pages(unsigned int num) __GFP_ZERO, order); } - if ((pgt_buf_end + num) >= pgt_buf_top) { + if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { unsigned long ret; if (min_pfn_mapped >= max_pfn_mapped) panic("alloc_low_page: ran out of memory"); @@ -61,6 +63,8 @@ __ref void *alloc_low_pages(unsigned int num) } else { pfn = pgt_buf_end; pgt_buf_end += num; + printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", + pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); } for (i = 0; i < num; i++) { @@ -370,8 +374,15 @@ static unsigned long __init init_range_memory_mapping( if (start >= end) continue; + /* + * if it is overlapping with brk pgt, we need to + * alloc pgt buf from memblock instead. + */ + can_use_brk_pgt = max(start, (u64)pgt_buf_end<= + min(end, (u64)pgt_buf_top< Date: Thu, 24 Jan 2013 12:19:52 -0800 Subject: x86, 64bit: Use a #PF handler to materialize early mappings on demand Linear mode (CR0.PG = 0) is mutually exclusive with 64-bit mode; all 64-bit code has to use page tables. This makes it awkward before we have first set up properly all-covering page tables to access objects that are outside the static kernel range. So far we have dealt with that simply by mapping a fixed amount of low memory, but that fails in at least two upcoming use cases: 1. We will support load and run kernel, struct boot_params, ramdisk, command line, etc. above the 4 GiB mark. 2. need to access ramdisk early to get microcode to update that as early possible. We could use early_iomap to access them too, but it will make code to messy and hard to be unified with 32 bit. Hence, set up a #PF table and use a fixed number of buffers to set up page tables on demand. If the buffers fill up then we simply flush them and start over. These buffers are all in __initdata, so it does not increase RAM usage at runtime. Thus, with the help of the #PF handler, we can set the final kernel mapping from blank, and switch to init_level4_pgt later. During the switchover in head_64.S, before #PF handler is available, we use three pages to handle kernel crossing 1G, 512G boundaries with sharing page by playing games with page aliasing: the same page is mapped twice in the higher-level tables with appropriate wraparound. The kernel region itself will be properly mapped; other mappings may be spurious. early_make_pgtable is using kernel high mapping address to access pages to set page table. -v4: Add phys_base offset to make kexec happy, and add init_mapping_kernel() - Yinghai -v5: fix compiling with xen, and add back ident level3 and level2 for xen also move back init_level4_pgt from BSS to DATA again. because we have to clear it anyway. - Yinghai -v6: switch to init_level4_pgt in init_mem_mapping. - Yinghai -v7: remove not needed clear_page for init_level4_page it is with fill 512,8,0 already in head_64.S - Yinghai -v8: we need to keep that handler alive until init_mem_mapping and don't let early_trap_init to trash that early #PF handler. So split early_trap_pf_init out and move it down. - Yinghai -v9: switchover only cover kernel space instead of 1G so could avoid touch possible mem holes. - Yinghai -v11: change far jmp back to far return to initial_code, that is needed to fix failure that is reported by Konrad on AMD systems. - Yinghai Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-12-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_64_types.h | 4 + arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/head64.c | 81 ++++++++++-- arch/x86/kernel/head_64.S | 210 +++++++++++++++++++------------- arch/x86/kernel/setup.c | 2 + arch/x86/kernel/traps.c | 9 ++ arch/x86/mm/init.c | 3 +- 7 files changed, 219 insertions(+), 91 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 766ea16fbbbd..2d883440cb9a 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PGTABLE_64_DEFS_H #define _ASM_X86_PGTABLE_64_DEFS_H +#include + #ifndef __ASSEMBLY__ #include @@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define EARLY_DYNAMIC_PAGE_TABLES 64 + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 888184b2fc85..bdee8bd318ea 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -731,6 +731,7 @@ extern void enable_sep_cpu(void); extern int sysenter_setup(void); extern void early_trap_init(void); +void early_trap_pf_init(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7785e66840a4..f57df05ea126 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -27,11 +27,73 @@ #include #include -static void __init zap_identity_mappings(void) +/* + * Manage page tables very early on. + */ +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +static unsigned int __initdata next_early_pgt = 2; + +/* Wipe all early page tables except for the kernel symbol map */ +static void __init reset_early_page_tables(void) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - __flush_tlb_all(); + unsigned long i; + + for (i = 0; i < PTRS_PER_PGD-1; i++) + early_level4_pgt[i].pgd = 0; + + next_early_pgt = 0; + + write_cr3(__pa(early_level4_pgt)); +} + +/* Create a new PMD entry */ +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + unsigned long i; + pgdval_t pgd, *pgd_p; + pudval_t *pud_p; + pmdval_t pmd, *pmd_p; + + /* Invalid address or early pgt is done ? */ + if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + return -1; + + i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1); + pgd_p = &early_level4_pgt[i].pgd; + pgd = *pgd_p; + + /* + * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is + * critical -- __PAGE_OFFSET would point us back into the dynamic + * range and we might end up looping forever... + */ + if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) { + pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + } else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1) + reset_early_page_tables(); + + pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PUD; i++) + pud_p[i] = 0; + + *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); + pud_p += i; + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL); + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd_p[i] = pmd; + pmd += PMD_SIZE; + } + + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + + return 0; } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -72,12 +134,13 @@ void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + /* Kill off the identity-map trampoline */ + reset_early_page_tables(); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); - /* Make NULL pointers segfault */ - zap_identity_mappings(); - + /* XXX - this is wrong... we need to build page tables from scratch */ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { @@ -94,6 +157,10 @@ void __init x86_64_start_kernel(char * real_mode_data) if (console_loglevel == 10) early_printk("Kernel alive\n"); + clear_page(init_level4_pgt); + /* set init_level4_pgt kernel high mapping*/ + init_level4_pgt[511] = early_level4_pgt[511]; + x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 980053c4b9cc..d94f6d68be2a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: - /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded an identity mapped page table * for us. These identity mapped page tables map all of the * kernel pages and possibly all of memory. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either directly from a 64bit bootloader, or from * arch/x86_64/boot/compressed/head.S. @@ -66,7 +65,8 @@ startup_64: * tables and then reload them. */ - /* Compute the delta between the address I am compiled to run at and the + /* + * Compute the delta between the address I am compiled to run at and the * address I am actually running at. */ leaq _text(%rip), %rbp @@ -78,45 +78,62 @@ startup_64: testl %eax, %eax jnz bad_address - /* Is the address too large? */ - leaq _text(%rip), %rdx - movq $PGDIR_SIZE, %rax - cmpq %rax, %rdx - jae bad_address - - /* Fixup the physical addresses in the page table + /* + * Is the address too large? */ - addq %rbp, init_level4_pgt + 0(%rip) - addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) - addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) + leaq _text(%rip), %rax + shrq $MAX_PHYSMEM_BITS, %rax + jnz bad_address - addq %rbp, level3_ident_pgt + 0(%rip) + /* + * Fixup the physical addresses in the page table + */ + addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) addq %rbp, level3_kernel_pgt + (510*8)(%rip) addq %rbp, level3_kernel_pgt + (511*8)(%rip) addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - /* Add an Identity mapping if I am above 1G */ + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ leaq _text(%rip), %rdi - andq $PMD_PAGE_MASK, %rdi + leaq early_level4_pgt(%rip), %rbx movq %rdi, %rax - shrq $PUD_SHIFT, %rax - andq $(PTRS_PER_PUD - 1), %rax - jz ident_complete + shrq $PGDIR_SHIFT, %rax - leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx - leaq level3_ident_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) + leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + movq %rdx, 0(%rbx,%rax,8) + movq %rdx, 8(%rbx,%rax,8) + addq $4096, %rdx movq %rdi, %rax - shrq $PMD_SHIFT, %rax - andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx - leaq level2_spare_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) -ident_complete: + shrq $PUD_SHIFT, %rax + andl $(PTRS_PER_PUD-1), %eax + movq %rdx, (4096+0)(%rbx,%rax,8) + movq %rdx, (4096+8)(%rbx,%rax,8) + + addq $8192, %rbx + movq %rdi, %rax + shrq $PMD_SHIFT, %rdi + addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax + leaq (_end - 1)(%rip), %rcx + shrq $PMD_SHIFT, %rcx + subq %rdi, %rcx + incl %ecx + +1: + andq $(PTRS_PER_PMD - 1), %rdi + movq %rax, (%rbx,%rdi,8) + incq %rdi + addq $PMD_SIZE, %rax + decl %ecx + jnz 1b /* * Fixup the kernel text+data virtual addresses. Note that @@ -124,7 +141,6 @@ ident_complete: * cleanup_highmap() fixes this up along with the mappings * beyond _end. */ - leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ @@ -139,17 +155,14 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) - /* Due to ENTRY(), sometimes the empty space gets filled with - * zeros. Better take a jmp than relying on empty space being - * filled with 0x90 (nop) - */ - jmp secondary_startup_64 + movq $(early_level4_pgt - __START_KERNEL_map), %rax + jmp 1f ENTRY(secondary_startup_64) /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded a mapped page table. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either from startup_64 (using physical addresses) * or from trampoline.S (using virtual addresses). @@ -159,12 +172,14 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + movq $(init_level4_pgt - __START_KERNEL_map), %rax +1: + /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %eax - movq %rax, %cr4 + movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + movq %rcx, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax addq phys_base(%rip), %rax movq %rax, %cr3 @@ -196,7 +211,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip),%rsp + movq stack_start(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -236,15 +251,33 @@ ENTRY(secondary_startup_64) movl initial_gs+4(%rip),%edx wrmsr - /* esi is pointer to real mode structure with interesting info. + /* rsi is pointer to real mode structure with interesting info. pass it to C */ - movl %esi, %edi + movq %rsi, %rdi /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this * a far return. + * + * Note: do not change to far jump indirect with 64bit offset. + * + * AMD does not support far jump indirect with 64bit offset. + * AMD64 Architecture Programmer's Manual, Volume 3: states only + * JMP FAR mem16:16 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * JMP FAR mem16:32 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * + * Intel64 does support 64bit offset. + * Software Developer Manual Vol 2: states: + * FF /5 JMP m16:16 Jump far, absolute indirect, + * address given in m16:16 + * FF /5 JMP m16:32 Jump far, absolute indirect, + * address given in m16:32. + * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, + * address given in m16:64. */ movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder @@ -270,13 +303,13 @@ ENDPROC(start_cpu0) /* SMP bootup changes these two */ __REFDATA - .align 8 - ENTRY(initial_code) + .balign 8 + GLOBAL(initial_code) .quad x86_64_start_kernel - ENTRY(initial_gs) + GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - ENTRY(stack_start) + GLOBAL(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 __FINITDATA @@ -284,7 +317,7 @@ ENDPROC(start_cpu0) bad_address: jmp bad_address - .section ".init.text","ax" + __INIT .globl early_idt_handlers early_idt_handlers: # 104(%rsp) %rflags @@ -321,14 +354,22 @@ ENTRY(early_idt_handler) pushq %r11 # 0(%rsp) cmpl $__KERNEL_CS,96(%rsp) - jne 10f + jne 11f + + cmpl $14,72(%rsp) # Page fault? + jnz 10f + GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + call early_make_pgtable + andl %eax,%eax + jz 20f # All good +10: leaq 88(%rsp),%rdi # Pointer to %rip call early_fixup_exception andl %eax,%eax jnz 20f # Found an exception entry -10: +11: #ifdef CONFIG_EARLY_PRINTK GET_CR2_INTO(%r9) # can clobber any volatile register if pv movl 80(%rsp),%r8d # error code @@ -350,7 +391,7 @@ ENTRY(early_idt_handler) 1: hlt jmp 1b -20: # Exception table entry found +20: # Exception table entry found or page table generated popq %r11 popq %r10 popq %r9 @@ -364,6 +405,8 @@ ENTRY(early_idt_handler) decl early_recursion_flag(%rip) INTERRUPT_RETURN + __INITDATA + .balign 4 early_recursion_flag: .long 0 @@ -374,11 +417,10 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" #endif /* CONFIG_EARLY_PRINTK */ - .previous #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ -ENTRY(name) +GLOBAL(name) /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ @@ -388,24 +430,37 @@ ENTRY(name) i = i + 1 ; \ .endr + __INITDATA +NEXT_PAGE(early_level4_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 + .data - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ + +#ifndef CONFIG_XEN NEXT_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_START_KERNEL*8, 0 + .fill 512,8,0 +#else +NEXT_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 511,8,0 + .fill 511, 8, 0 +NEXT_PAGE(level2_ident_pgt) + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 @@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE -NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 - -NEXT_PAGE(level1_fixmap_pgt) - .fill 512,8,0 - -NEXT_PAGE(level2_ident_pgt) - /* Since I easily can, map the first 1G. - * Don't set NX because code runs from these pages. - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) - NEXT_PAGE(level2_kernel_pgt) /* * 512 MB kernel mapping. We spend a full page on this pagetable @@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt) PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) -NEXT_PAGE(level2_spare_pgt) - .fill 512, 8, 0 +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 #undef PMDS -#undef NEXT_PAGE .data .align 16 @@ -472,6 +517,5 @@ ENTRY(nmi_idt_table) .skip IDT_ENTRIES * 16 __PAGE_ALIGNED_BSS - .align PAGE_SIZE -ENTRY(empty_zero_page) +NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 85a8290801df..db9c41dae8d7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1005,6 +1005,8 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); + early_trap_pf_init(); + setup_real_mode(); memblock.current_limit = get_max_mapped(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ecffca11f4e9..68bda7a84159 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -688,10 +688,19 @@ void __init early_trap_init(void) set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); /* int3 can be called from all */ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); +#ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, &page_fault); +#endif load_idt(&idt_descr); } +void __init early_trap_pf_init(void) +{ +#ifdef CONFIG_X86_64 + set_intr_gate(X86_TRAP_PF, &page_fault); +#endif +} + void __init trap_init(void) { int i; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 78d1ef3eab66..3364a7643a4c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -446,9 +446,10 @@ void __init init_mem_mapping(void) } #else early_ioremap_page_table_range_init(); +#endif + load_cr3(swapper_pg_dir); __flush_tlb_all(); -#endif early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } -- cgit v1.2.3 From 0e691cf824f76adefb4498fe39c300aba2c2575a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:05 -0800 Subject: x86, kexec, 64bit: Only set ident mapping for ram. We should set mappings only for usable memory ranges under max_pfn Otherwise causes same problem that is fixed by x86, mm: Only direct map addresses that are marked as E820_RAM This patch exposes pfn_mapped array, and only sets ident mapping for ranges in that array. This patch relies on new kernel_ident_mapping_init that could handle existing pgd/pud between different calls. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-25-git-send-email-yinghai@kernel.org Cc: Alexander Duyck Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page.h | 4 ++++ arch/x86/kernel/machine_kexec_64.c | 13 +++++++++---- arch/x86/mm/init.c | 4 ++-- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 8ca82839288a..100a20c7b98d 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -17,6 +17,10 @@ struct page; +#include +extern struct range pfn_mapped[]; +extern int nr_pfn_mapped; + static inline void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index d2d7e023a8c8..4eabc160696f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -100,10 +100,15 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); - result = kernel_ident_mapping_init(&info, level4p, - 0, max_pfn << PAGE_SHIFT); - if (result) - return result; + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + if (result) + return result; + } /* * segments's mem ranges could be outside 0 ~ max_pfn, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3364a7643a4c..d41815265a0b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -302,8 +302,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } -static struct range pfn_mapped[E820_X_MAX]; -static int nr_pfn_mapped; +struct range pfn_mapped[E820_X_MAX]; +int nr_pfn_mapped; static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) { -- cgit v1.2.3 From cd745be89e1580e8a1b47454a39f97f9c5c4b1e0 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 20 Dec 2012 23:44:31 -0800 Subject: x86/mm/init.c: Copy ucode from initrd image to kernel memory Before initrd image is freed, copy valid ucode patches from initrd image to kernel memory. The saved ucode will be used to update AP in resume or hotplug. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1356075872-3054-12-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d41815265a0b..4903a03ae876 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -16,6 +16,7 @@ #include #include #include /* for MAX_DMA_PFN */ +#include #include "mm_internal.h" @@ -534,6 +535,15 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { +#ifdef CONFIG_MICROCODE_EARLY + /* + * Remember, initrd memory may contain microcode or other useful things. + * Before we lose initrd mem, we need to find a place to hold them + * now that normal virtual memory is enabled. + */ + save_microcode_in_initrd(); +#endif + /* * end could be not aligned, and We can not align that, * decompresser could be confused by aligned initrd_end -- cgit v1.2.3 From 98e7a989979b185f49e86ddaed2ad6890299d9f0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 Mar 2013 20:18:21 -0800 Subject: x86, mm: Make sure to find a 2M free block for the first mapped area Henrik reported that his MacAir 3.1 would not boot with | commit 8d57470d8f859635deffe3919d7d4867b488b85a | Date: Fri Nov 16 19:38:58 2012 -0800 | | x86, mm: setup page table in top-down It turns out that we do not calculate the real_end properly: We try to get 2M size with 4K alignment, and later will round down to 2M, so we will get less then 2M for first mapping, in extreme case could be only 4K only. In Henrik's system it has (1M-32K) as last usable rage is [mem 0x7f9db000-0x7fef8fff]. The problem is exposed when EFI booting have several holes and it will force mapping to use PTE instead as we only map usable areas. To fix it, just make it be 2M aligned, so we can be guaranteed to be able to use large pages to map it. Reported-by: Henrik Rydberg Bisected-by: Henrik Rydberg Tested-by: Henrik Rydberg Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQX4nQ7_1kg5RL_vh56rmcSHXUi1ExrZX7CwED4NGMnHfg@mail.gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm/init.c') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4903a03ae876..59b7fc453277 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -410,9 +410,8 @@ void __init init_mem_mapping(void) /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS); - /* xen has big range in reserved near end of ram, skip it at first */ - addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, - PAGE_SIZE); + /* xen has big range in reserved near end of ram, skip it at first.*/ + addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ -- cgit v1.2.3