summaryrefslogtreecommitdiff
path: root/arch/s390/boot
diff options
context:
space:
mode:
authorAlexander Gordeev <agordeev@linux.ibm.com>2022-12-13 11:35:11 +0100
committerHeiko Carstens <hca@linux.ibm.com>2023-01-13 14:15:05 +0100
commitbb1520d581a3a46e2d6e12bb74604ace33404de5 (patch)
treee30707db7e1375a9b4bb8bd40102e57aee8c968a /arch/s390/boot
parentbd50b7436217b4123911c2bca1efd74718654f06 (diff)
s390/mm: start kernel with DAT enabled
The setup of the kernel virtual address space is spread throughout the sources, boot stages and config options like this: 1. The available physical memory regions are queried and stored as mem_detect information for later use in the decompressor. 2. Based on the physical memory availability the virtual memory layout is established in the decompressor; 3. If CONFIG_KASAN is disabled the kernel paging setup code populates kernel pgtables and turns DAT mode on. It uses the information stored at step [1]. 4. If CONFIG_KASAN is enabled the kernel early boot kasan setup populates kernel pgtables and turns DAT mode on. It uses the information stored at step [1]. The kasan setup creates early_pg_dir directory and directly overwrites swapper_pg_dir entries to make shadow memory pages available. Move the kernel virtual memory setup to the decompressor and start the kernel with DAT turned on right from the very first istruction. That completely eliminates the boot phase when the kernel runs in DAT-off mode, simplies the overall design and consolidates pgtables setup. The identity mapping is created in the decompressor, while kasan shadow mappings are still created by the early boot kernel code. Share with decompressor the existing kasan memory allocator. It decreases the size of a newly requested memory block from pgalloc_pos and ensures that kernel image is not overwritten. pgalloc_low and pgalloc_pos pointers are made preserved boot variables for that. Use the bootdata infrastructure to setup swapper_pg_dir and invalid_pg_dir directories used by the kernel later. The interim early_pg_dir directory established by the kasan initialization code gets eliminated as result. As the kernel runs in DAT-on mode only the PSW_KERNEL_BITS define gets PSW_MASK_DAT bit by default. Additionally, the setup_lowcore_dat_off() and setup_lowcore_dat_on() routines get merged, since there is no DAT-off mode stage anymore. The memory mappings are created with RW+X protection that allows the early boot code setting up all necessary data and services for the kernel being booted. Just before the paging is enabled the memory protection is changed to RO+X for text, RO+NX for read-only data and RW+NX for kernel data and the identity mapping. Reviewed-by: Heiko Carstens <hca@linux.ibm.com> Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com> Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Diffstat (limited to 'arch/s390/boot')
-rw-r--r--arch/s390/boot/Makefile2
-rw-r--r--arch/s390/boot/boot.h6
-rw-r--r--arch/s390/boot/startup.c45
-rw-r--r--arch/s390/boot/vmem.c254
4 files changed, 298 insertions, 9 deletions
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index d52c3e2e16bc..47a397da0498 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -35,7 +35,7 @@ endif
CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
-obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
+obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o vmem.o
obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
obj-y += version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o
obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index 286441cf3bf0..547614496e7e 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -16,7 +16,7 @@ struct machine_info {
struct vmlinux_info {
unsigned long default_lma;
- void (*entry)(void);
+ unsigned long entry;
unsigned long image_size; /* does not include .bss */
unsigned long bss_size; /* uncompressed image .bss size */
unsigned long bootdata_off;
@@ -27,6 +27,9 @@ struct vmlinux_info {
unsigned long rela_dyn_start;
unsigned long rela_dyn_end;
unsigned long amode31_size;
+ unsigned long init_mm_off;
+ unsigned long swapper_pg_dir_off;
+ unsigned long invalid_pg_dir_off;
};
void startup_kernel(void);
@@ -41,6 +44,7 @@ void print_missing_facilities(void);
void sclp_early_setup_buffer(void);
void print_pgm_check_info(void);
unsigned long get_random_base(unsigned long safe_addr);
+void setup_vmem(unsigned long online_end, unsigned long asce_limit);
void __printf(1, 2) decompressor_printk(const char *fmt, ...);
void error(char *m);
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index da6ee587fe9a..c5d59df9fa62 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -11,6 +11,7 @@
#include <asm/diag.h>
#include <asm/uv.h>
#include <asm/abs_lowcore.h>
+#include <asm/mem_detect.h>
#include "decompressor.h"
#include "boot.h"
#include "uv.h"
@@ -166,9 +167,10 @@ static void setup_ident_map_size(unsigned long max_physmem_end)
#endif
}
-static void setup_kernel_memory_layout(void)
+static unsigned long setup_kernel_memory_layout(void)
{
unsigned long vmemmap_start;
+ unsigned long asce_limit;
unsigned long rte_size;
unsigned long pages;
unsigned long vmax;
@@ -183,10 +185,10 @@ static void setup_kernel_memory_layout(void)
vmalloc_size > _REGION2_SIZE ||
vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN >
_REGION2_SIZE) {
- vmax = _REGION1_SIZE;
+ asce_limit = _REGION1_SIZE;
rte_size = _REGION2_SIZE;
} else {
- vmax = _REGION2_SIZE;
+ asce_limit = _REGION2_SIZE;
rte_size = _REGION3_SIZE;
}
/*
@@ -194,7 +196,7 @@ static void setup_kernel_memory_layout(void)
* secure storage limit, so that any vmalloc allocation
* we do could be used to back secure guest storage.
*/
- vmax = adjust_to_uv_max(vmax);
+ vmax = adjust_to_uv_max(asce_limit);
#ifdef CONFIG_KASAN
/* force vmalloc and modules below kasan shadow */
vmax = min(vmax, KASAN_SHADOW_START);
@@ -223,6 +225,8 @@ static void setup_kernel_memory_layout(void)
/* make sure vmemmap doesn't overlay with vmalloc area */
VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START);
vmemmap = (struct page *)vmemmap_start;
+
+ return asce_limit;
}
/*
@@ -256,6 +260,9 @@ static void offset_vmlinux_info(unsigned long offset)
vmlinux.rela_dyn_start += offset;
vmlinux.rela_dyn_end += offset;
vmlinux.dynsym_start += offset;
+ vmlinux.init_mm_off += offset;
+ vmlinux.swapper_pg_dir_off += offset;
+ vmlinux.invalid_pg_dir_off += offset;
}
static unsigned long reserve_amode31(unsigned long safe_addr)
@@ -268,7 +275,10 @@ void startup_kernel(void)
{
unsigned long random_lma;
unsigned long safe_addr;
+ unsigned long asce_limit;
+ unsigned long online_end;
void *img;
+ psw_t psw;
detect_facilities();
@@ -290,7 +300,8 @@ void startup_kernel(void)
sanitize_prot_virt_host();
setup_ident_map_size(detect_memory());
setup_vmalloc_size();
- setup_kernel_memory_layout();
+ asce_limit = setup_kernel_memory_layout();
+ online_end = min(get_mem_detect_end(), ident_map_size);
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) {
random_lma = get_random_base(safe_addr);
@@ -307,9 +318,23 @@ void startup_kernel(void)
} else if (__kaslr_offset)
memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size);
+ /*
+ * The order of the following operations is important:
+ *
+ * - handle_relocs() must follow clear_bss_section() to establish static
+ * memory references to data in .bss to be used by setup_vmem()
+ * (i.e init_mm.pgd)
+ *
+ * - setup_vmem() must follow handle_relocs() to be able using
+ * static memory references to data in .bss (i.e init_mm.pgd)
+ *
+ * - copy_bootdata() must follow setup_vmem() to propagate changes to
+ * bootdata made by setup_vmem()
+ */
clear_bss_section();
- copy_bootdata();
handle_relocs(__kaslr_offset);
+ setup_vmem(online_end, asce_limit);
+ copy_bootdata();
if (__kaslr_offset) {
/*
@@ -321,5 +346,11 @@ void startup_kernel(void)
if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED))
memset(img, 0, vmlinux.image_size);
}
- vmlinux.entry();
+
+ /*
+ * Jump to the decompressed kernel entry point and switch DAT mode on.
+ */
+ psw.addr = vmlinux.entry;
+ psw.mask = PSW_KERNEL_BITS;
+ __load_psw(psw);
}
diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c
new file mode 100644
index 000000000000..db1469c17289
--- /dev/null
+++ b/arch/s390/boot/vmem.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched/task.h>
+#include <linux/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/facility.h>
+#include <asm/sections.h>
+#include <asm/mem_detect.h>
+#include "decompressor.h"
+#include "boot.h"
+
+#define init_mm (*(struct mm_struct *)vmlinux.init_mm_off)
+#define swapper_pg_dir vmlinux.swapper_pg_dir_off
+#define invalid_pg_dir vmlinux.invalid_pg_dir_off
+
+unsigned long __bootdata_preserved(s390_invalid_asce);
+unsigned long __bootdata(pgalloc_pos);
+unsigned long __bootdata(pgalloc_end);
+unsigned long __bootdata(pgalloc_low);
+
+static void boot_check_oom(void)
+{
+ if (pgalloc_pos < pgalloc_low)
+ error("out of memory on boot\n");
+}
+
+static void pgtable_populate_begin(unsigned long online_end)
+{
+ unsigned long initrd_end;
+ unsigned long kernel_end;
+
+ kernel_end = vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size;
+ pgalloc_low = round_up(kernel_end, PAGE_SIZE);
+ if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) {
+ initrd_end = round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE);
+ pgalloc_low = max(pgalloc_low, initrd_end);
+ }
+
+ pgalloc_end = round_down(online_end, PAGE_SIZE);
+ pgalloc_pos = pgalloc_end;
+
+ boot_check_oom();
+}
+
+static void *boot_alloc_pages(unsigned int order)
+{
+ unsigned long size = PAGE_SIZE << order;
+
+ pgalloc_pos -= size;
+ pgalloc_pos = round_down(pgalloc_pos, size);
+
+ boot_check_oom();
+
+ return (void *)pgalloc_pos;
+}
+
+static void *boot_crst_alloc(unsigned long val)
+{
+ unsigned long *table;
+
+ table = boot_alloc_pages(CRST_ALLOC_ORDER);
+ if (table)
+ crst_table_init(table, val);
+ return table;
+}
+
+static pte_t *boot_pte_alloc(void)
+{
+ static void *pte_leftover;
+ pte_t *pte;
+
+ BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE);
+
+ if (!pte_leftover) {
+ pte_leftover = boot_alloc_pages(0);
+ pte = pte_leftover + _PAGE_TABLE_SIZE;
+ } else {
+ pte = pte_leftover;
+ pte_leftover = NULL;
+ }
+ memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
+ return pte;
+}
+
+static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end)
+{
+ return machine.has_edat2 &&
+ IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE;
+}
+
+static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end)
+{
+ return machine.has_edat1 &&
+ IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE;
+}
+
+static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pte_t *pte, entry;
+
+ pte = pte_offset_kernel(pmd, addr);
+ for (; addr < end; addr += PAGE_SIZE, pte++) {
+ if (pte_none(*pte)) {
+ entry = __pte(__pa(addr));
+ entry = set_pte_bit(entry, PAGE_KERNEL_EXEC);
+ set_pte(pte, entry);
+ }
+ }
+}
+
+static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pmd_t *pmd, entry;
+ pte_t *pte;
+
+ pmd = pmd_offset(pud, addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(*pmd)) {
+ if (can_large_pmd(pmd, addr, next)) {
+ entry = __pmd(__pa(addr));
+ entry = set_pmd_bit(entry, SEGMENT_KERNEL_EXEC);
+ set_pmd(pmd, entry);
+ continue;
+ }
+ pte = boot_pte_alloc();
+ pmd_populate(&init_mm, pmd, pte);
+ } else if (pmd_large(*pmd)) {
+ continue;
+ }
+ pgtable_pte_populate(pmd, addr, next);
+ }
+}
+
+static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pud_t *pud, entry;
+ pmd_t *pmd;
+
+ pud = pud_offset(p4d, addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+ if (pud_none(*pud)) {
+ if (can_large_pud(pud, addr, next)) {
+ entry = __pud(__pa(addr));
+ entry = set_pud_bit(entry, REGION3_KERNEL_EXEC);
+ set_pud(pud, entry);
+ continue;
+ }
+ pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ pud_populate(&init_mm, pud, pmd);
+ } else if (pud_large(*pud)) {
+ continue;
+ }
+ pgtable_pmd_populate(pud, addr, next);
+ }
+}
+
+static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ p4d = p4d_offset(pgd, addr);
+ for (; addr < end; addr = next, p4d++) {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none(*p4d)) {
+ pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY);
+ p4d_populate(&init_mm, p4d, pud);
+ }
+ pgtable_pud_populate(p4d, addr, next);
+ }
+}
+
+static void pgtable_populate(unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ pgd = pgd_offset(&init_mm, addr);
+ for (; addr < end; addr = next, pgd++) {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(*pgd)) {
+ p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY);
+ pgd_populate(&init_mm, pgd, p4d);
+ }
+ pgtable_p4d_populate(pgd, addr, next);
+ }
+}
+
+/*
+ * The pgtables are located in the range [pgalloc_pos, pgalloc_end).
+ * That range must stay intact and is later reserved in the memblock.
+ * Therefore pgtable_populate(pgalloc_pos, pgalloc_end) is needed to
+ * finalize pgalloc_pos pointer. However that call can decrease the
+ * value of pgalloc_pos pointer itself. Therefore, pgtable_populate()
+ * needs to be called repeatedly until pgtables are complete and
+ * pgalloc_pos does not grow left anymore.
+ */
+static void pgtable_populate_end(void)
+{
+ unsigned long pgalloc_end_curr = pgalloc_end;
+ unsigned long pgalloc_pos_prev;
+
+ do {
+ pgalloc_pos_prev = pgalloc_pos;
+ pgtable_populate(pgalloc_pos, pgalloc_end_curr);
+ pgalloc_end_curr = pgalloc_pos_prev;
+ } while (pgalloc_pos < pgalloc_pos_prev);
+}
+
+void setup_vmem(unsigned long online_end, unsigned long asce_limit)
+{
+ unsigned long asce_type;
+ unsigned long asce_bits;
+
+ if (asce_limit == _REGION1_SIZE) {
+ asce_type = _REGION2_ENTRY_EMPTY;
+ asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+ } else {
+ asce_type = _REGION3_ENTRY_EMPTY;
+ asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ }
+ s390_invalid_asce = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+
+ crst_table_init((unsigned long *)swapper_pg_dir, asce_type);
+ crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY);
+
+ /*
+ * To allow prefixing the lowcore must be mapped with 4KB pages.
+ * To prevent creation of a large page at address 0 first map
+ * the lowcore and create the identity mapping only afterwards.
+ *
+ * No further pgtable_populate() calls are allowed after the value
+ * of pgalloc_pos finalized with a call to pgtable_populate_end().
+ */
+ pgtable_populate_begin(online_end);
+ pgtable_populate(0, sizeof(struct lowcore));
+ pgtable_populate(0, online_end);
+ pgtable_populate_end();
+
+ S390_lowcore.kernel_asce = swapper_pg_dir | asce_bits;
+ S390_lowcore.user_asce = s390_invalid_asce;
+
+ __ctl_load(S390_lowcore.kernel_asce, 1, 1);
+ __ctl_load(S390_lowcore.user_asce, 7, 7);
+ __ctl_load(S390_lowcore.kernel_asce, 13, 13);
+
+ init_mm.context.asce = S390_lowcore.kernel_asce;
+}