From bd00cd5f8c8c3c282bb1e1eac6a6679a4f808091 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 24 Jun 2014 16:51:35 +0100 Subject: arm64: place initial page tables above the kernel Currently we place swapper_pg_dir and idmap_pg_dir below the kernel image, between PHYS_OFFSET and (PHYS_OFFSET + TEXT_OFFSET). However, bootloaders may use portions of this memory below the kernel and we do not parse the memory reservation list until after the MMU has been enabled. As such we may clobber some memory a bootloader wishes to have preserved. To enable the use of all of this memory by bootloaders (when the required memory reservations are communicated to the kernel) it is necessary to move our initial page tables elsewhere. As we currently have an effectively unbound requirement for memory at the end of the kernel image for .bss, we can place the page tables here. This patch moves the initial page table to the end of the kernel image, after the BSS. As they do not consist of any initialised data they will be stripped from the kernel Image as with the BSS. The BSS clearing routine is updated to stop at __bss_stop rather than _end so as to not clobber the page tables, and memory reservations made redundant by the new organisation are removed. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/init.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/arm64/mm/init.c') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f43db8a69262..7f68804814a1 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -128,20 +128,16 @@ void __init arm64_memblock_init(void) { phys_addr_t dma_phys_limit = 0; - /* Register the kernel text, kernel data and initrd with memblock */ + /* + * Register the kernel text, kernel data, initrd, and initial + * pagetables with memblock. 
+ */ memblock_reserve(__pa(_text), _end - _text); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start) memblock_reserve(__virt_to_phys(initrd_start), initrd_end - initrd_start); #endif - /* - * Reserve the page tables. These are already in use, - * and can only be in node 0. - */ - memblock_reserve(__pa(swapper_pg_dir), SWAPPER_DIR_SIZE); - memblock_reserve(__pa(idmap_pg_dir), IDMAP_DIR_SIZE); - early_init_fdt_scan_reserved_mem(); /* 4GB maximum for 32-bit only capable devices */ -- cgit v1.2.3 From 08375198b01001c0e43bdd580104b16b019a3754 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 16 Jul 2014 17:42:43 +0100 Subject: arm64: Determine the vmalloc/vmemmap space at build time based on VA_BITS Rather than guessing what the maximum vmemmap space should be, this patch allows the calculation based on the VA_BITS and sizeof(struct page). The vmalloc space extends to the beginning of the vmemmap space. Since the virtual kernel memory layout now depends on the build configuration, this patch removes the detailed description in Documentation/arm64/memory.txt in favour of information printed during kernel booting. Signed-off-by: Catalin Marinas Tested-by: Jungseok Lee --- Documentation/arm64/memory.txt | 98 ++++++---------------------------------- arch/arm64/include/asm/pgtable.h | 13 ++++-- arch/arm64/mm/init.c | 22 ++++++--- 3 files changed, 38 insertions(+), 95 deletions(-) (limited to 'arch/arm64/mm/init.c') diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt index 4c720d698e8e..8845d0847a66 100644 --- a/Documentation/arm64/memory.txt +++ b/Documentation/arm64/memory.txt @@ -2,19 +2,18 @@ ============================== Author: Catalin Marinas -Date : 20 February 2012 This document describes the virtual memory layout used by the AArch64 Linux kernel. The architecture allows up to 4 levels of translation tables with a 4KB page size and up to 3 levels with a 64KB page size. 
-AArch64 Linux uses either 3 levels or 4 levels of translation tables with -the 4KB page configuration, allowing 39-bit (512GB) or 48-bit (256TB) -virtual addresses, respectively, for both user and kernel. With 64KB -pages, only 2 levels of translation tables, allowing 42-bit (4TB) +AArch64 Linux uses either 3 levels or 4 levels of translation tables +with the 4KB page configuration, allowing 39-bit (512GB) or 48-bit +(256TB) virtual addresses, respectively, for both user and kernel. With +64KB pages, only 2 levels of translation tables, allowing 42-bit (4TB) virtual address, are used but the memory layout is the same. -User addresses have bits 63:39 set to 0 while the kernel addresses have +User addresses have bits 63:48 set to 0 while the kernel addresses have the same bits set to 1. TTBRx selection is given by bit 63 of the virtual address. The swapper_pg_dir contains only kernel (global) mappings while the user pgd contains only user (non-global) mappings. @@ -27,26 +26,7 @@ AArch64 Linux memory layout with 4KB pages + 3 levels: Start End Size Use ----------------------------------------------------------------------- 0000000000000000 0000007fffffffff 512GB user - -ffffff8000000000 ffffffbbfffeffff ~240GB vmalloc - -ffffffbbffff0000 ffffffbbffffffff 64KB [guard page] - -ffffffbc00000000 ffffffbdffffffff 8GB vmemmap - -ffffffbe00000000 ffffffbffbbfffff ~8GB [guard, future vmmemap] - -ffffffbffa000000 ffffffbffaffffff 16MB PCI I/O space - -ffffffbffb000000 ffffffbffbbfffff 12MB [guard] - -ffffffbffbc00000 ffffffbffbdfffff 2MB fixed mappings - -ffffffbffbe00000 ffffffbffbffffff 2MB [guard] - -ffffffbffc000000 ffffffbfffffffff 64MB modules - -ffffffc000000000 ffffffffffffffff 256GB kernel logical memory map +ffffff8000000000 ffffffffffffffff 512GB kernel AArch64 Linux memory layout with 4KB pages + 4 levels: @@ -54,26 +34,7 @@ AArch64 Linux memory layout with 4KB pages + 4 levels: Start End Size Use 
----------------------------------------------------------------------- 0000000000000000 0000ffffffffffff 256TB user - -ffff000000000000 ffff7bfffffeffff ~124TB vmalloc - -ffff7bffffff0000 ffff7bffffffffff 64KB [guard page] - -ffff7c0000000000 ffff7dffffffffff 2TB vmemmap - -ffff7e0000000000 ffff7ffffbbfffff ~2TB [guard, future vmmemap] - -ffff7ffffa000000 ffff7ffffaffffff 16MB PCI I/O space - -ffff7ffffb000000 ffff7ffffbbfffff 12MB [guard] - -ffff7ffffbc00000 ffff7ffffbdfffff 2MB fixed mappings - -ffff7ffffbe00000 ffff7ffffbffffff 2MB [guard] - -ffff7ffffc000000 ffff7fffffffffff 64MB modules - -ffff800000000000 ffffffffffffffff 128TB kernel logical memory map +ffff000000000000 ffffffffffffffff 256TB kernel AArch64 Linux memory layout with 64KB pages + 2 levels: @@ -81,44 +42,14 @@ AArch64 Linux memory layout with 64KB pages + 2 levels: Start End Size Use ----------------------------------------------------------------------- 0000000000000000 000003ffffffffff 4TB user +fffffc0000000000 ffffffffffffffff 4TB kernel -fffffc0000000000 fffffdfbfffeffff ~2TB vmalloc - -fffffdfbffff0000 fffffdfbffffffff 64KB [guard page] -fffffdfc00000000 fffffdfdffffffff 8GB vmemmap +For details of the virtual kernel memory layout please see the kernel +booting log. 
-fffffdfe00000000 fffffdfffbbfffff ~8GB [guard, future vmmemap] -fffffdfffa000000 fffffdfffaffffff 16MB PCI I/O space - -fffffdfffb000000 fffffdfffbbfffff 12MB [guard] - -fffffdfffbc00000 fffffdfffbdfffff 2MB fixed mappings - -fffffdfffbe00000 fffffdfffbffffff 2MB [guard] - -fffffdfffc000000 fffffdffffffffff 64MB modules - -fffffe0000000000 ffffffffffffffff 2TB kernel logical memory map - - -Translation table lookup with 4KB pages + 3 levels: - -+--------+--------+--------+--------+--------+--------+--------+--------+ -|63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| -+--------+--------+--------+--------+--------+--------+--------+--------+ - | | | | | | - | | | | | v - | | | | | [11:0] in-page offset - | | | | +-> [20:12] L3 index - | | | +-----------> [29:21] L2 index - | | +---------------------> [38:30] L1 index - | +-------------------------------> [47:39] L0 index (not used) - +-------------------------------------------------> [63] TTBR0/1 - - -Translation table lookup with 4KB pages + 4 levels: +Translation table lookup with 4KB pages: +--------+--------+--------+--------+--------+--------+--------+--------+ |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| @@ -133,7 +64,7 @@ Translation table lookup with 4KB pages + 4 levels: +-------------------------------------------------> [63] TTBR0/1 -Translation table lookup with 64KB pages + 2 levels: +Translation table lookup with 64KB pages: +--------+--------+--------+--------+--------+--------+--------+--------+ |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| @@ -142,10 +73,11 @@ Translation table lookup with 64KB pages + 2 levels: | | | | v | | | | [15:0] in-page offset | | | +----------> [28:16] L3 index - | | +--------------------------> [41:29] L2 index (only 38:29 used) - | +-------------------------------> [47:42] L1 index (not used) + | | +--------------------------> [41:29] L2 index + | +-------------------------------> [47:42] L1 index +-------------------------------------------------> [63] TTBR0/1 + When 
using KVM, the hypervisor maps kernel pages in EL2, at a fixed offset from the kernel VA (top 24bits of the kernel VA set to zero): diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9f862e6e9286..ec82789d03c3 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -33,13 +33,16 @@ /* * VMALLOC and SPARSEMEM_VMEMMAP ranges. + * + * VMEMMAP_SIZE: allows the whole VA space to be covered by a struct page array + * (rounded up to PUD_SIZE). + * VMALLOC_START: beginning of the kernel VA space + * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space, + * fixed mappings and modules */ +#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) #define VMALLOC_START (UL(0xffffffffffffffff) << VA_BITS) -#if CONFIG_ARM64_PGTABLE_LEVELS != 4 -#define VMALLOC_END (PAGE_OFFSET - UL(0x400000000) - SZ_64K) -#else -#define VMALLOC_END (PAGE_OFFSET - UL(0x40000000000) - SZ_64K) -#endif +#define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 7f68804814a1..0b32504e280f 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -256,26 +257,33 @@ void __init mem_init(void) #define MLK(b, t) b, t, ((t) - (b)) >> 10 #define MLM(b, t) b, t, ((t) - (b)) >> 20 +#define MLG(b, t) b, t, ((t) - (b)) >> 30 #define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K) pr_notice("Virtual kernel memory layout:\n" - " vmalloc : 0x%16lx - 0x%16lx (%6ld MB)\n" + " vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n" #ifdef CONFIG_SPARSEMEM_VMEMMAP - " vmemmap : 0x%16lx - 0x%16lx (%6ld MB)\n" + " vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" + " 0x%16lx - 0x%16lx (%6ld MB actual)\n" #endif + " PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n" + " fixed : 0x%16lx - 0x%16lx (%6ld KB)\n" " 
modules : 0x%16lx - 0x%16lx (%6ld MB)\n" " memory : 0x%16lx - 0x%16lx (%6ld MB)\n" - " .init : 0x%p" " - 0x%p" " (%6ld kB)\n" - " .text : 0x%p" " - 0x%p" " (%6ld kB)\n" - " .data : 0x%p" " - 0x%p" " (%6ld kB)\n", - MLM(VMALLOC_START, VMALLOC_END), + " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .data : 0x%p" " - 0x%p" " (%6ld KB)\n", + MLG(VMALLOC_START, VMALLOC_END), #ifdef CONFIG_SPARSEMEM_VMEMMAP + MLG((unsigned long)vmemmap, + (unsigned long)vmemmap + VMEMMAP_SIZE), MLM((unsigned long)virt_to_page(PAGE_OFFSET), (unsigned long)virt_to_page(high_memory)), #endif + MLM((unsigned long)PCI_IOBASE, (unsigned long)PCI_IOBASE + SZ_16M), + MLK(FIXADDR_START, FIXADDR_TOP), MLM(MODULES_VADDR, MODULES_END), MLM(PAGE_OFFSET, (unsigned long)high_memory), - MLK_ROUNDUP(__init_begin, __init_end), MLK_ROUNDUP(_text, _etext), MLK_ROUNDUP(_sdata, _edata)); -- cgit v1.2.3 From 86c8b27a01cf6c16fc159ade223cb2ccc70dc4b5 Mon Sep 17 00:00:00 2001 From: Leif Lindholm Date: Mon, 28 Jul 2014 19:03:03 +0100 Subject: arm64: ignore DT memreserve entries when booting in UEFI mode UEFI provides its own method for marking regions to reserve, via the memory map which is also used to initialise memblock. So when using the UEFI memory map, ignore any memreserve entries present in the DT. 
Reported-by: Mark Rutland Reviewed-by: Mark Rutland Acked-by: Catalin Marinas Signed-off-by: Leif Lindholm Signed-off-by: Will Deacon --- arch/arm64/kernel/efi.c | 2 ++ arch/arm64/mm/init.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/arm64/mm/init.c') diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index e72f3100958f..24f0c6fb61d8 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -188,6 +188,8 @@ static __init void reserve_regions(void) if (uefi_debug) pr_cont("\n"); } + + set_bit(EFI_MEMMAP, &efi.flags); } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 5b4526ee3a01..5472c2401876 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -148,7 +149,8 @@ void __init arm64_memblock_init(void) memblock_reserve(__virt_to_phys(initrd_start), initrd_end - initrd_start); #endif - early_init_fdt_scan_reserved_mem(); + if (!efi_enabled(EFI_MEMMAP)) + early_init_fdt_scan_reserved_mem(); /* 4GB maximum for 32-bit only capable devices */ if (IS_ENABLED(CONFIG_ZONE_DMA)) -- cgit v1.2.3 From 0ceac9e094b065fe3fec19669740f338d3480498 Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Mon, 8 Sep 2014 13:01:08 -0400 Subject: efi/arm64: Fix fdt-related memory reservation Commit 86c8b27a01cf: "arm64: ignore DT memreserve entries when booting in UEFI mode" prevents early_init_fdt_scan_reserved_mem() from being called for arm64 kernels booting via UEFI. This was done because the kernel will use the UEFI memory map to determine reserved memory regions. That approach has problems in that early_init_fdt_scan_reserved_mem() also reserves the FDT itself and any node-specific reserved memory. By chance of some kernel configs, the FDT may be overwritten before it can be unflattened and the kernel will fail to boot. More subtle problems will result if the FDT has node-specific reserved memory which is not really reserved. 
This patch has the UEFI stub remove the memory reserve map entries from the FDT as it does with the memory nodes. This allows early_init_fdt_scan_reserved_mem() to be called unconditionally so that the other needed reservations are made. Signed-off-by: Mark Salter Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Signed-off-by: Matt Fleming --- arch/arm64/mm/init.c | 3 +-- drivers/firmware/efi/libstub/fdt.c | 10 +++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/arm64/mm/init.c') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 5472c2401876..a83061f37e43 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -149,8 +149,7 @@ void __init arm64_memblock_init(void) memblock_reserve(__virt_to_phys(initrd_start), initrd_end - initrd_start); #endif - if (!efi_enabled(EFI_MEMMAP)) - early_init_fdt_scan_reserved_mem(); + early_init_fdt_scan_reserved_mem(); /* 4GB maximum for 32-bit only capable devices */ if (IS_ENABLED(CONFIG_ZONE_DMA)) diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index a56bb3528755..c846a9608cbd 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -22,7 +22,7 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, unsigned long map_size, unsigned long desc_size, u32 desc_ver) { - int node, prev; + int node, prev, num_rsv; int status; u32 fdt_val32; u64 fdt_val64; @@ -73,6 +73,14 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, prev = node; } + /* + * Delete all memory reserve map entries. When booting via UEFI, + * kernel will use the UEFI memory map to find reserved regions. + */ + num_rsv = fdt_num_mem_rsv(fdt); + while (num_rsv-- > 0) + fdt_del_mem_rsv(fdt, num_rsv); + node = fdt_subnode_offset(fdt, 0, "chosen"); if (node < 0) { node = fdt_add_subnode(fdt, 0, "chosen"); -- cgit v1.2.3