From 3c61df3885e91f8737bbbbaba79b908da0e1919f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:48 -0700 Subject: kcov: cleanup debug messages Patch series "kcov: collect coverage from usb soft interrupts", v4. This patchset extends kcov to allow collecting coverage from soft interrupts and then uses the new functionality to collect coverage from USB code. This has allowed us to find at least one new HID bug [1], which was recently fixed by Alan [2]. [1] https://syzkaller.appspot.com/bug?extid=09ef48aa58261464b621 [2] https://patchwork.kernel.org/patch/11283319/ Any subsystem that uses softirqs (e.g. timers) can make use of this in the future. Looking at the recent syzbot reports, an obvious candidate is the networking subsystem [3, 4, 5 and many more]. [3] https://syzkaller.appspot.com/bug?extid=522ab502c69badc66ab7 [4] https://syzkaller.appspot.com/bug?extid=57f89d05946c53dbbb31 [5] https://syzkaller.appspot.com/bug?extid=df358e65d9c1b9d3f5f4 This patch (of 7): Previous commit left a lot of excessive debug messages, clean them up. Link: http://lkml.kernel.org/r/cover.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/ab5e2885ce674ba6e04368551e51eeb6a2c11baf.1585233617.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: Alexander Potapenko Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/4a497134b2cf7a9d306d28e3dd2746f5446d1605.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 8accc9722a81..e6bb2b50569f 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -98,6 +98,7 @@ static struct kcov_remote *kcov_remote_find(u64 handle) return NULL; } +/* Must be called with kcov_remote_lock locked. 
*/ static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle) { struct kcov_remote *remote; @@ -119,16 +120,13 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) struct kcov_remote_area *area; struct list_head *pos; - kcov_debug("size = %u\n", size); list_for_each(pos, &kcov_remote_areas) { area = list_entry(pos, struct kcov_remote_area, list); if (area->size == size) { list_del(&area->list); - kcov_debug("rv = %px\n", area); return area; } } - kcov_debug("rv = NULL\n"); return NULL; } @@ -136,7 +134,6 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) static void kcov_remote_area_put(struct kcov_remote_area *area, unsigned int size) { - kcov_debug("area = %px, size = %u\n", area, size); INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); @@ -366,7 +363,6 @@ static void kcov_remote_reset(struct kcov *kcov) hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; - kcov_debug("removing handle %llx\n", remote->handle); hash_del(&remote->hnode); kfree(remote); } @@ -553,7 +549,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, switch (cmd) { case KCOV_INIT_TRACE: - kcov_debug("KCOV_INIT_TRACE\n"); /* * Enable kcov in trace mode and setup buffer size. * Must happen before anything else. @@ -572,7 +567,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: - kcov_debug("KCOV_ENABLE\n"); /* * Enable coverage for the current task. * At this point user must have been enabled trace mode, @@ -598,7 +592,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov_get(kcov); return 0; case KCOV_DISABLE: - kcov_debug("KCOV_DISABLE\n"); /* Disable coverage for the current task. 
*/ unused = arg; if (unused != 0 || current->kcov != kcov) @@ -610,7 +603,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov_put(kcov); return 0; case KCOV_REMOTE_ENABLE: - kcov_debug("KCOV_REMOTE_ENABLE\n"); if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; t = current; @@ -629,7 +621,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->remote_size = remote_arg->area_size; spin_lock(&kcov_remote_lock); for (i = 0; i < remote_arg->num_handles; i++) { - kcov_debug("handle %llx\n", remote_arg->handles[i]); if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { spin_unlock(&kcov_remote_lock); @@ -644,8 +635,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, } } if (remote_arg->common_handle) { - kcov_debug("common handle %llx\n", - remote_arg->common_handle); if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { spin_unlock(&kcov_remote_lock); @@ -782,7 +771,6 @@ void kcov_remote_start(u64 handle) spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - kcov_debug("no remote found"); spin_unlock(&kcov_remote_lock); return; } @@ -810,8 +798,6 @@ void kcov_remote_start(u64 handle) /* Reset coverage size. */ *(u64 *)area = 0; - kcov_debug("area = %px, size = %u", area, size); - kcov_start(t, size, area, mode, sequence); } @@ -881,10 +867,8 @@ void kcov_remote_stop(void) unsigned int size = t->kcov_size; int sequence = t->kcov_sequence; - if (!kcov) { - kcov_debug("no kcov found\n"); + if (!kcov) return; - } kcov_stop(t); t->kcov = NULL; @@ -894,8 +878,6 @@ void kcov_remote_stop(void) * KCOV_DISABLE could have been called between kcov_remote_start() * and kcov_remote_stop(), hence the check. 
*/ - kcov_debug("move if: %d == %d && %d\n", - sequence, kcov->sequence, (int)kcov->remote); if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); -- cgit v1.2.3 From 67b3d3cca385507c4c8b6ad97b823415e038e3c8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:51 -0700 Subject: kcov: fix potential use-after-free in kcov_remote_start If vmalloc() fails in kcov_remote_start() we'll access remote->kcov without holding kcov_remote_lock, so remote might potentially be freed at that point. Cache kcov pointer in a local variable. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/9d9134359725a965627b7e8f2652069f86f1d1fa.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/de0d3d30ff90776a2a509cc34c7c1c7521bda125.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index e6bb2b50569f..14e7208c5291 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -748,6 +748,7 @@ static const struct file_operations kcov_fops = { void kcov_remote_start(u64 handle) { struct kcov_remote *remote; + struct kcov *kcov; void *area; struct task_struct *t; unsigned int size; @@ -774,16 +775,17 @@ void kcov_remote_start(u64 handle) spin_unlock(&kcov_remote_lock); return; } + kcov = remote->kcov; /* Put in kcov_remote_stop(). */ - kcov_get(remote->kcov); - t->kcov = remote->kcov; + kcov_get(kcov); + t->kcov = kcov; /* * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). 
*/ - size = remote->kcov->remote_size; - mode = remote->kcov->mode; - sequence = remote->kcov->sequence; + size = kcov->remote_size; + mode = kcov->mode; + sequence = kcov->sequence; area = kcov_remote_area_get(size); spin_unlock(&kcov_remote_lock); @@ -791,7 +793,7 @@ void kcov_remote_start(u64 handle) area = vmalloc(size * sizeof(unsigned long)); if (!area) { t->kcov = NULL; - kcov_put(remote->kcov); + kcov_put(kcov); return; } } -- cgit v1.2.3 From 76484b1c77242b737f8fd001d6e00af7518221f3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:55 -0700 Subject: kcov: move t->kcov assignments into kcov_start/stop Every time kcov_start/stop() is called, t->kcov is also assigned, so move the assignment into the functions. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/6644839d3567df61ade3c4b246a46cacbe4f9e11.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/82625ef3ff878f0b585763cc31d09d9b08ca37d6.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 14e7208c5291..96dbc198d166 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -309,10 +309,12 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) EXPORT_SYMBOL(__sanitizer_cov_trace_switch); #endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ -static void kcov_start(struct task_struct *t, unsigned int size, - void *area, enum kcov_mode mode, int sequence) +static void kcov_start(struct task_struct *t, struct kcov *kcov, + unsigned int size, void *area, enum kcov_mode mode, + int sequence) { kcov_debug("t = %px, size = %u, area = %px\n", t, size, area); + t->kcov = kcov; /* Cache in task struct for performance. 
*/ t->kcov_size = size; t->kcov_area = area; @@ -326,6 +328,7 @@ static void kcov_stop(struct task_struct *t) { WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); barrier(); + t->kcov = NULL; t->kcov_size = 0; t->kcov_area = NULL; } @@ -333,7 +336,6 @@ static void kcov_stop(struct task_struct *t) static void kcov_task_reset(struct task_struct *t) { kcov_stop(t); - t->kcov = NULL; t->kcov_sequence = 0; t->kcov_handle = 0; } @@ -584,9 +586,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return mode; kcov_fault_in_area(kcov); kcov->mode = mode; - kcov_start(t, kcov->size, kcov->area, kcov->mode, + kcov_start(t, kcov, kcov->size, kcov->area, kcov->mode, kcov->sequence); - t->kcov = kcov; kcov->t = t; /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); @@ -778,7 +779,6 @@ void kcov_remote_start(u64 handle) kcov = remote->kcov; /* Put in kcov_remote_stop(). */ kcov_get(kcov); - t->kcov = kcov; /* * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). @@ -792,7 +792,6 @@ void kcov_remote_start(u64 handle) if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { - t->kcov = NULL; kcov_put(kcov); return; } @@ -800,7 +799,7 @@ void kcov_remote_start(u64 handle) /* Reset coverage size. */ *(u64 *)area = 0; - kcov_start(t, size, area, mode, sequence); + kcov_start(t, kcov, size, area, mode, sequence); } EXPORT_SYMBOL(kcov_remote_start); @@ -873,7 +872,6 @@ void kcov_remote_stop(void) return; kcov_stop(t); - t->kcov = NULL; spin_lock(&kcov->lock); /* -- cgit v1.2.3 From eeb91f9a2e3e9766ae9fd1117bd19d87538f21bf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:45:58 -0700 Subject: kcov: move t->kcov_sequence assignment Move t->kcov_sequence assignment before assigning t->kcov_mode for consistency. 
Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/5889efe35e0b300e69dba97216b1288d9c2428a8.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/f0283c676bab3335cb48bfe12d375a3da4719f59.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 96dbc198d166..7cd05bd1fada 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -318,10 +318,10 @@ static void kcov_start(struct task_struct *t, struct kcov *kcov, /* Cache in task struct for performance. */ t->kcov_size = size; t->kcov_area = area; + t->kcov_sequence = sequence; /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, mode); - t->kcov_sequence = sequence; } static void kcov_stop(struct task_struct *t) -- cgit v1.2.3 From 5fe7042dc0a2e80b4633df20dcd06b93e76e3c31 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:46:01 -0700 Subject: kcov: use t->kcov_mode as enabled indicator Currently kcov_remote_start() and kcov_remote_stop() check t->kcov to find out whether the coverage is already being collected by the current task. Use t->kcov_mode for that instead. This doesn't change the overall behavior in any way, but serves as a preparation for the following softirq coverage collection support patch. 
Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Andrey Konovalov Link: http://lkml.kernel.org/r/f70377945d1d8e6e4916cbce871a12303d6186b4.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/ee1a1dec43059da5d7664c85c1addc89c4cd58de.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 7cd05bd1fada..93b28ad2da28 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -746,26 +746,33 @@ static const struct file_operations kcov_fops = { * In turns kcov_remote_stop() clears those pointers from task_struct to stop * collecting coverage and copies all collected coverage into the kcov area. */ + +static inline bool kcov_mode_enabled(unsigned int mode) +{ + return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; +} + void kcov_remote_start(u64 handle) { + struct task_struct *t = current; struct kcov_remote *remote; struct kcov *kcov; + unsigned int mode; void *area; - struct task_struct *t; unsigned int size; - enum kcov_mode mode; int sequence; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; if (WARN_ON(!in_task())) return; - t = current; + /* * Check that kcov_remote_start is not called twice * nor called by user tasks (with enabled kcov). 
*/ - if (WARN_ON(t->kcov)) + mode = READ_ONCE(t->kcov_mode); + if (WARN_ON(kcov_mode_enabled(mode))) return; kcov_debug("handle = %llx\n", handle); @@ -863,13 +870,20 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, void kcov_remote_stop(void) { struct task_struct *t = current; - struct kcov *kcov = t->kcov; - void *area = t->kcov_area; - unsigned int size = t->kcov_size; - int sequence = t->kcov_sequence; + struct kcov *kcov; + unsigned int mode; + void *area; + unsigned int size; + int sequence; - if (!kcov) + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (!kcov_mode_enabled(mode)) return; + kcov = t->kcov; + area = t->kcov_area; + size = t->kcov_size; + sequence = t->kcov_sequence; kcov_stop(t); -- cgit v1.2.3 From 5ff3b30ab57da82d8db4f14662a2858cabfbc2c0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:46:04 -0700 Subject: kcov: collect coverage from interrupts This change extends kcov remote coverage support to allow collecting coverage from soft interrupts in addition to kernel background threads. To collect coverage from code that is executed in softirq context, a part of that code has to be annotated with kcov_remote_start/stop() in a similar way as how it is done for global kernel background threads. Then the handle used for the annotations has to be passed to the KCOV_REMOTE_ENABLE ioctl. Internally this patch adjusts the __sanitizer_cov_trace_pc() compiler inserted callback to not bail out when called from softirq context. kcov_remote_start/stop() are updated to save/restore the current per task kcov state in a per-cpu area (in case the softirq came when the kernel was already collecting coverage in task context). Coverage from softirqs is collected into pre-allocated per-cpu areas, whose size is controlled by the new CONFIG_KCOV_IRQ_AREA_SIZE. 
[andreyknvl@google.com: turn current->kcov_softirq into unsigned int to fix objtool warning] Link: http://lkml.kernel.org/r/841c778aa3849c5cb8c3761f56b87ce653a88671.1585233617.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Greg Kroah-Hartman Cc: Marco Elver Link: http://lkml.kernel.org/r/469bd385c431d050bc38a593296eff4baae50666.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kcov.rst | 17 ++-- include/linux/sched.h | 3 + kernel/kcov.c | 194 +++++++++++++++++++++++++++++++-------- lib/Kconfig.debug | 9 ++ 4 files changed, 176 insertions(+), 47 deletions(-) diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 1c4e1825d769..8548b0b04e43 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -217,14 +217,15 @@ This allows to collect coverage from two types of kernel background threads: the global ones, that are spawned during kernel boot in a limited number of instances (e.g. one USB hub_event() worker thread is spawned per USB HCD); and the local ones, that are spawned when a user interacts with -some kernel interface (e.g. vhost workers). +some kernel interface (e.g. vhost workers); as well as from soft +interrupts. -To enable collecting coverage from a global background thread, a unique -global handle must be assigned and passed to the corresponding -kcov_remote_start() call. Then a userspace process can pass a list of such -handles to the KCOV_REMOTE_ENABLE ioctl in the handles array field of the -kcov_remote_arg struct. This will attach the used kcov device to the code -sections, that are referenced by those handles. +To enable collecting coverage from a global background thread or from a +softirq, a unique global handle must be assigned and passed to the +corresponding kcov_remote_start() call. 
Then a userspace process can pass +a list of such handles to the KCOV_REMOTE_ENABLE ioctl in the handles +array field of the kcov_remote_arg struct. This will attach the used kcov +device to the code sections, that are referenced by those handles. Since there might be many local background threads spawned from different userspace processes, we can't use a single global handle per annotation. @@ -242,7 +243,7 @@ handles as they don't belong to a particular subsystem. The bytes 4-7 are currently reserved and must be zero. In the future the number of bytes used for the subsystem or handle ids might be increased. -When a particular userspace proccess collects coverage by via a common +When a particular userspace proccess collects coverage via a common handle, kcov will collect coverage for each code section that is annotated to use the common handle obtained as kcov_handle from the current task_struct. However non common handles allow to collect coverage diff --git a/include/linux/sched.h b/include/linux/sched.h index 57a5ce9f33c5..c5d96e3e7fff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1247,6 +1247,9 @@ struct task_struct { /* KCOV sequence number: */ int kcov_sequence; + + /* Collect coverage from softirq context: */ + unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG diff --git a/kernel/kcov.c b/kernel/kcov.c index 93b28ad2da28..55c5d883a93e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -86,6 +86,18 @@ static DEFINE_SPINLOCK(kcov_remote_lock); static DEFINE_HASHTABLE(kcov_remote_map, 4); static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); +struct kcov_percpu_data { + void *irq_area; + + unsigned int saved_mode; + unsigned int saved_size; + void *saved_area; + struct kcov *saved_kcov; + int saved_sequence; +}; + +DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); + /* Must be called with kcov_remote_lock locked. 
*/ static struct kcov_remote *kcov_remote_find(u64 handle) { @@ -145,9 +157,10 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru /* * We are interested in code coverage as a function of a syscall inputs, - * so we ignore code executed in interrupts. + * so we ignore code executed in interrupts, unless we are in a remote + * coverage collection section in a softirq. */ - if (!in_task()) + if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -360,8 +373,9 @@ static void kcov_remote_reset(struct kcov *kcov) int bkt; struct kcov_remote *remote; struct hlist_node *tmp; + unsigned long flags; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; @@ -370,7 +384,7 @@ static void kcov_remote_reset(struct kcov *kcov) } /* Do reset before unlock to prevent races with kcov_remote_start(). */ kcov_reset(kcov); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); } static void kcov_disable(struct task_struct *t, struct kcov *kcov) @@ -399,12 +413,13 @@ static void kcov_put(struct kcov *kcov) void kcov_task_exit(struct task_struct *t) { struct kcov *kcov; + unsigned long flags; kcov = t->kcov; if (kcov == NULL) return; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t); /* * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t, @@ -428,12 +443,12 @@ void kcov_task_exit(struct task_struct *t) * By combining all three checks into one we get: */ if (WARN_ON(kcov->t != t)) { - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); return; } /* Just to not leave dangling references behind. 
*/ kcov_disable(t, kcov); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kcov_put(kcov); } @@ -444,12 +459,13 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; + unsigned long flags; area = vmalloc_user(vma->vm_end - vma->vm_start); if (!area) return -ENOMEM; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { @@ -459,7 +475,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) if (!kcov->area) { kcov->area = area; vma->vm_flags |= VM_DONTEXPAND; - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); if (vm_insert_page(vma, vma->vm_start + off, page)) @@ -468,7 +484,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) return 0; } exit: - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); vfree(area); return res; } @@ -548,6 +564,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; + unsigned long flags; switch (cmd) { case KCOV_INIT_TRACE: @@ -620,17 +637,19 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); for (i = 0; i < remote_arg->num_handles; i++) { if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->handles[i]); if (IS_ERR(remote)) { - 
spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } @@ -638,20 +657,22 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, if (remote_arg->common_handle) { if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->common_handle); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } t->kcov_handle = remote_arg->common_handle; } - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; @@ -667,6 +688,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; + unsigned long flags; if (cmd == KCOV_REMOTE_ENABLE) { if (get_user(remote_num_handles, (unsigned __user *)(arg + @@ -687,9 +709,9 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) } kcov = filep->private_data; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kfree(remote_arg); @@ -706,8 +728,8 @@ static const struct file_operations kcov_fops = { /* * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section - * of code in a kernel background thread to allow kcov to be used to collect - * coverage from that part of code. + * of code in a kernel background thread or in a softirq to allow kcov to be + * used to collect coverage from that part of code. 
* * The handle argument of kcov_remote_start() identifies a code section that is * used for coverage collection. A userspace process passes this handle to @@ -718,9 +740,9 @@ static const struct file_operations kcov_fops = { * the type of the kernel thread whose code is being annotated. * * For global kernel threads that are spawned in a limited number of instances - * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each - * instance must be assigned a unique 4-byte instance id. The instance id is - * then combined with a 1-byte subsystem id to get a handle via + * (e.g. one USB hub_event() worker thread is spawned per USB HCD) and for + * softirqs, each instance must be assigned a unique 4-byte instance id. The + * instance id is then combined with a 1-byte subsystem id to get a handle via * kcov_remote_handle(subsystem_id, instance_id). * * For local kernel threads that are spawned from system calls handler when a @@ -739,7 +761,7 @@ static const struct file_operations kcov_fops = { * * See Documentation/dev-tools/kcov.rst for more details. 
* - * Internally, this function looks up the kcov device associated with the + * Internally, kcov_remote_start() looks up the kcov device associated with the * provided handle, allocates an area for coverage collection, and saves the * pointers to kcov and area into the current task_struct to allow coverage to * be collected via __sanitizer_cov_trace_pc() @@ -752,6 +774,39 @@ static inline bool kcov_mode_enabled(unsigned int mode) return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; } +void kcov_remote_softirq_start(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + unsigned int mode; + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (kcov_mode_enabled(mode)) { + data->saved_mode = mode; + data->saved_size = t->kcov_size; + data->saved_area = t->kcov_area; + data->saved_sequence = t->kcov_sequence; + data->saved_kcov = t->kcov; + kcov_stop(t); + } +} + +void kcov_remote_softirq_stop(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + + if (data->saved_kcov) { + kcov_start(t, data->saved_kcov, data->saved_size, + data->saved_area, data->saved_mode, + data->saved_sequence); + data->saved_mode = 0; + data->saved_size = 0; + data->saved_area = NULL; + data->saved_sequence = 0; + data->saved_kcov = NULL; + } +} + void kcov_remote_start(u64 handle) { struct task_struct *t = current; @@ -761,28 +816,42 @@ void kcov_remote_start(u64 handle) void *area; unsigned int size; int sequence; + unsigned long flags; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (WARN_ON(!in_task())) + if (!in_task() && !in_serving_softirq()) return; + local_irq_save(flags); + /* - * Check that kcov_remote_start is not called twice - * nor called by user tasks (with enabled kcov). + * Check that kcov_remote_start() is not called twice in background + * threads nor called by user tasks (with enabled kcov). 
*/ mode = READ_ONCE(t->kcov_mode); - if (WARN_ON(kcov_mode_enabled(mode))) + if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { + local_irq_restore(flags); return; - - kcov_debug("handle = %llx\n", handle); + } + /* + * Check that kcov_remote_start() is not called twice in softirqs. + * Note, that kcov_remote_start() can be called from a softirq that + * happened while collecting coverage from a background thread. + */ + if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); return; } + kcov_debug("handle = %llx, context: %s\n", handle, + in_task() ? "task" : "softirq"); kcov = remote->kcov; /* Put in kcov_remote_stop(). */ kcov_get(kcov); @@ -790,12 +859,18 @@ void kcov_remote_start(u64 handle) * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). */ - size = kcov->remote_size; mode = kcov->mode; sequence = kcov->sequence; - area = kcov_remote_area_get(size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + size = kcov->remote_size; + area = kcov_remote_area_get(size); + } else { + size = CONFIG_KCOV_IRQ_AREA_SIZE; + area = this_cpu_ptr(&kcov_percpu_data)->irq_area; + } + spin_unlock_irqrestore(&kcov_remote_lock, flags); + /* Can only happen when in_task(). */ if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { @@ -803,11 +878,20 @@ void kcov_remote_start(u64 handle) return; } } + + local_irq_save(flags); + /* Reset coverage size. 
*/ *(u64 *)area = 0; + if (in_serving_softirq()) { + kcov_remote_softirq_start(t); + t->kcov_softirq = 1; + } kcov_start(t, kcov, size, area, mode, sequence); + local_irq_restore(flags); + } EXPORT_SYMBOL(kcov_remote_start); @@ -875,31 +959,53 @@ void kcov_remote_stop(void) void *area; unsigned int size; int sequence; + unsigned long flags; + + if (!in_task() && !in_serving_softirq()) + return; + + local_irq_save(flags); mode = READ_ONCE(t->kcov_mode); barrier(); - if (!kcov_mode_enabled(mode)) + if (!kcov_mode_enabled(mode)) { + local_irq_restore(flags); return; + } kcov = t->kcov; area = t->kcov_area; size = t->kcov_size; sequence = t->kcov_sequence; + if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } + kcov_stop(t); + if (in_serving_softirq()) { + t->kcov_softirq = 0; + kcov_remote_softirq_stop(t); + } spin_lock(&kcov->lock); /* * KCOV_DISABLE could have been called between kcov_remote_start() - * and kcov_remote_stop(), hence the check. + * and kcov_remote_stop(), hence the sequence check. */ if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); - spin_lock(&kcov_remote_lock); - kcov_remote_area_put(area, size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + spin_lock(&kcov_remote_lock); + kcov_remote_area_put(area, size); + spin_unlock(&kcov_remote_lock); + } + local_irq_restore(flags); + + /* Get in kcov_remote_start(). */ kcov_put(kcov); } EXPORT_SYMBOL(kcov_remote_stop); @@ -913,6 +1019,16 @@ EXPORT_SYMBOL(kcov_common_handle); static int __init kcov_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long)); + if (!area) + return -ENOMEM; + per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; + } + /* * The kcov debugfs file won't ever get removed and thus, * there is no need to protect it against removal races. 
The diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0217ed126f77..d07ab3e056cd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1774,6 +1774,15 @@ config KCOV_INSTRUMENT_ALL filesystem fuzzing with AFL) then you will want to enable coverage for more specific subsets of files, and should say n here. +config KCOV_IRQ_AREA_SIZE + hex "Size of interrupt coverage collection area in words" + depends on KCOV + default 0x40000 + help + KCOV uses preallocated per-cpu areas to collect coverage from + soft interrupts. This specifies the size of those areas in the + number of unsigned long words. + menuconfig RUNTIME_TESTING_MENU bool "Runtime Testing" def_bool y -- cgit v1.2.3 From 76e278d6b50534092038d4117d7a2687af034107 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:46:08 -0700 Subject: usb: core: kcov: collect coverage from usb complete callback This patch adds kcov_remote_start/stop() callbacks around the urb complete() callback that is executed in softirq context when dummy_hcd is in use. As the result, kcov can be used to collect coverage from those callbacks, which is used to facilitate coverage-guided fuzzing with syzkaller. 
Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Greg Kroah-Hartman Cc: Marco Elver Link: http://lkml.kernel.org/r/4520671eeb604adbc2432c248b0c07fbaa5519ef.1585233617.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/2821d497ac1cdc0efb5e00df30271e4a67fc8009.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- drivers/usb/core/hcd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index aa45840d8273..de624c47e190 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -1645,7 +1646,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb) /* pass ownership to the completion handler */ urb->status = status; + kcov_remote_start_usb((u64)urb->dev->bus->busnum); urb->complete(urb); + kcov_remote_stop(); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); -- cgit v1.2.3 From c571686a92ffd30d9f6092ce3f697e125bf96fd5 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 4 Jun 2020 16:46:11 -0700 Subject: mm/util.c: remove the VM_WARN_ONCE for vm_committed_as underflow check This check was added by commit 82f71ae4a2b8 ("mm: catch memory commitment underflow") in 2014 to have a safety check for issues which have been fixed. And there have been few reports caught by it, as described in its commit log: : This shouldn't happen any more - the previous two patches fixed : the committed_as underflow issues. But it was really found by Qian Cai when he used the LTP memory stress suite to test a RFC patchset, which tries to improve scalability of per-cpu counter 'vm_committed_as', by choosing a bigger 'batch' number for loose overcommit policies (OVERCOMMIT_ALWAYS and OVERCOMMIT_GUESS), while keeping current number for OVERCOMMIT_NEVER.
With that patchset, when the system first uses a loose policy, the 'vm_committed_as' count could be a big negative value, as its big 'batch' number allows a big deviation, then when the policy is changed to OVERCOMMIT_NEVER, the 'batch' will be decreased to a much smaller value, thus hits this WARN check. To mitigate this, one proposed solution is to queue work on all online CPUs to do a local sync for 'vm_committed_as' when changing policy to OVERCOMMIT_NEVER, plus some global syncing to guarantee the case won't be hit. But this solution is costly and slow, given this check hasn't shown real trouble or benefit, simply drop it from one hot path of MM. And perf stats does show some tiny saving for removing it. Reported-by: Qian Cai Signed-off-by: Feng Tang Signed-off-by: Andrew Morton Reviewed-by: Qian Cai Acked-by: Michal Hocko Cc: Konstantin Khlebnikov Cc: Andi Kleen Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mel Gorman Cc: Kees Cook Link: http://lkml.kernel.org/r/20200603094804.GB89848@shbuild999.sh.intel.com Signed-off-by: Linus Torvalds --- mm/util.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/util.c b/mm/util.c index 41b47d8cae09..fd9efe6bd463 100644 --- a/mm/util.c +++ b/mm/util.c @@ -796,10 +796,6 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; - VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < - -(s64)vm_committed_as_batch * num_online_cpus(), - "memory commitment underflow"); - vm_acct_memory(pages); /* -- cgit v1.2.3 From f426f4edf46c1a188edc33c25c13728857da3152 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:46:15 -0700 Subject: h8300: remove usage of __ARCH_USE_5LEVEL_HACK Patch series "mm: remove __ARCH_HAS_5LEVEL_HACK", v4. These patches convert several architectures to use page table folding and remove __ARCH_HAS_5LEVEL_HACK along with include/asm-generic/5level-fixup.h and include/asm-generic/pgtable-nop4d-hack.h.
With that we'll have a single and consistent way of dealing with page table folding instead of a mix of three existing options. The changes are mostly about mechanical replacement of pgd accessors with p4d ones and the addition of higher levels to page table traversals. This patch (of 14): h8300 is a nommu architecture and does not require fixup for upper layers of the page tables because it is already handled by the generic nommu implementation. Remove definition of __ARCH_USE_5LEVEL_HACK in arch/h8300/include/asm/pgtable.h Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne [openrisc] Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Cc: Joerg Roedel Link: http://lkml.kernel.org/r/20200414153455.21744-1-rppt@kernel.org Link: http://lkml.kernel.org/r/20200414153455.21744-2-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/h8300/include/asm/pgtable.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h index 4d00152fab58..f00828720dc4 100644 --- a/arch/h8300/include/asm/pgtable.h +++ b/arch/h8300/include/asm/pgtable.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _H8300_PGTABLE_H #define _H8300_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include #include extern void paging_init(void); -- cgit v1.2.3 From 84e6ffb2c49c7901a9efb54b497d2eb84c3bef8c Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:46:19 -0700 Subject: arm: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate, and remove 
__ARCH_USE_5LEVEL_HACK. [rppt@linux.ibm.com: fix kexec] Link: http://lkml.kernel.org/r/20200508174232.GA759899@linux.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Marek Szyprowski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-3-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/arm/include/asm/pgtable.h | 1 - arch/arm/lib/uaccess_with_memcpy.c | 7 ++++++- arch/arm/mach-sa1100/assabet.c | 2 +- arch/arm/mm/dump.c | 29 +++++++++++++++++++++------ arch/arm/mm/fault-armv.c | 7 ++++++- arch/arm/mm/fault.c | 22 +++++++++++++-------- arch/arm/mm/idmap.c | 3 ++- arch/arm/mm/init.c | 2 +- arch/arm/mm/ioremap.c | 12 +++++++++--- arch/arm/mm/mm.h | 2 +- arch/arm/mm/mmu.c | 35 +++++++++++++++++++++++++++------ arch/arm/mm/pgd.c | 40 +++++++++++++++++++++++++++++++------- 12 files changed, 125 insertions(+), 37 deletions(-) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index befc8fcec98f..fba20607c53c 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -17,7 +17,6 @@ #else -#define __ARCH_USE_5LEVEL_HACK #include #include #include diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index c9450982a155..d72b14c96670 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -24,6 +24,7 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) { unsigned long addr = (unsigned long)_addr; pgd_t *pgd; + p4d_t *p4d; pmd_t *pmd; pte_t *pte; pud_t *pud; @@ -33,7 +34,11 @@ 
pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd))) return 0; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (unlikely(p4d_none(*p4d) || p4d_bad(*p4d))) + return 0; + + pud = pud_offset(p4d, addr); if (unlikely(pud_none(*pud) || pud_bad(*pud))) return 0; diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c index d96a101e5504..0631a7b02678 100644 --- a/arch/arm/mach-sa1100/assabet.c +++ b/arch/arm/mach-sa1100/assabet.c @@ -633,7 +633,7 @@ static void __init map_sa1100_gpio_regs( void ) int prot = PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_DOMAIN(DOMAIN_IO); pmd_t *pmd; - pmd = pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt); + pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(virt), virt), virt), virt); *pmd = __pmd(phys | prot); flush_pmd_entry(pmd); } diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c index 7d6291f23251..677549d6854c 100644 --- a/arch/arm/mm/dump.c +++ b/arch/arm/mm/dump.c @@ -207,6 +207,7 @@ struct pg_level { static struct pg_level pg_level[] = { { }, { /* pgd */ + }, { /* p4d */ }, { /* pud */ }, { /* pmd */ .bits = section_bits, @@ -308,7 +309,7 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start, for (i = 0; i < PTRS_PER_PTE; i++, pte++) { addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte), domain); + note_page(st, addr, 5, pte_val(*pte), domain); } } @@ -350,14 +351,14 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) addr += SECTION_SIZE; pmd++; domain = get_domain_name(pmd); - note_page(st, addr, 3, pmd_val(*pmd), domain); + note_page(st, addr, 4, pmd_val(*pmd), domain); } } } -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0); + pud_t *pud = pud_offset(p4d, 0); unsigned long addr; unsigned i; @@ -366,7 +367,23 @@ 
static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) if (!pud_none(*pud)) { walk_pmd(st, pud, addr); } else { - note_page(st, addr, 2, pud_val(*pud), NULL); + note_page(st, addr, 3, pud_val(*pud), NULL); + } + } +} + +static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start) +{ + p4d_t *p4d = p4d_offset(pgd, 0); + unsigned long addr; + unsigned i; + + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + addr = start + i * P4D_SIZE; + if (!p4d_none(*p4d)) { + walk_pud(st, p4d, addr); + } else { + note_page(st, addr, 2, p4d_val(*p4d), NULL); } } } @@ -381,7 +398,7 @@ static void walk_pgd(struct pg_state *st, struct mm_struct *mm, for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { addr = start + i * PGDIR_SIZE; if (!pgd_none(*pgd)) { - walk_pud(st, pgd, addr); + walk_p4d(st, pgd, addr); } else { note_page(st, addr, 1, pgd_val(*pgd), NULL); } diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index ae857f41f68d..489aaafa6ebd 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -91,6 +91,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, { spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -100,7 +101,11 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, if (pgd_none_or_clear_bad(pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none_or_clear_bad(p4d)) + return 0; + + pud = pud_offset(p4d, address); if (pud_none_or_clear_bad(pud)) return 0; diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 2dd5c41cbb8d..ff230e9affc4 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -43,19 +43,21 @@ void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr) printk("%s[%08lx] *pgd=%08llx", lvl, addr, (long long)pgd_val(*pgd)); do { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) break; - if 
(pgd_bad(*pgd)) { + if (p4d_bad(*p4d)) { pr_cont("(bad)"); break; } - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (PTRS_PER_PUD != 1) pr_cont(", *pud=%08llx", (long long)pud_val(*pud)); @@ -405,6 +407,7 @@ do_translation_fault(unsigned long addr, unsigned int fsr, { unsigned int index; pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -419,13 +422,16 @@ do_translation_fault(unsigned long addr, unsigned int fsr, pgd = cpu_get_pgd() + index; pgd_k = init_mm.pgd + index; - if (pgd_none(*pgd_k)) + p4d = p4d_offset(pgd, addr); + p4d_k = p4d_offset(pgd_k, addr); + + if (p4d_none(*p4d_k)) goto bad_area; - if (!pgd_present(*pgd)) - set_pgd(pgd, *pgd_k); + if (!p4d_present(*p4d)) + set_p4d(p4d, *p4d_k); - pud = pud_offset(pgd, addr); - pud_k = pud_offset(pgd_k, addr); + pud = pud_offset(p4d, addr); + pud_k = pud_offset(p4d_k, addr); if (pud_none(*pud_k)) goto bad_area; diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index a033f6134a64..cd54411ef1b8 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -68,7 +68,8 @@ static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end, static void idmap_add_pud(pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long prot) { - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); unsigned long next; do { diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 4e43455fab84..01e18e43b174 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -519,7 +519,7 @@ static inline void section_update(unsigned long addr, pmdval_t mask, { pmd_t *pmd; - pmd = pmd_offset(pud_offset(pgd_offset(mm, addr), addr), addr); + pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, addr), addr), addr), addr); #ifdef CONFIG_ARM_LPAE pmd[0] = __pmd((pmd_val(pmd[0]) & mask) | prot); diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 72286f9a4d30..75529d76d28c 100644 --- 
a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -142,12 +142,14 @@ static void unmap_area_sections(unsigned long virt, unsigned long size) { unsigned long addr = virt, end = virt + (size & ~(SZ_1M - 1)); pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmdp; flush_cache_vunmap(addr, end); pgd = pgd_offset_k(addr); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + pud = pud_offset(p4d, addr); pmdp = pmd_offset(pud, addr); do { pmd_t pmd = *pmdp; @@ -190,6 +192,7 @@ remap_area_sections(unsigned long virt, unsigned long pfn, { unsigned long addr = virt, end = virt + size; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -200,7 +203,8 @@ remap_area_sections(unsigned long virt, unsigned long pfn, unmap_area_sections(virt, size); pgd = pgd_offset_k(addr); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + pud = pud_offset(p4d, addr); pmd = pmd_offset(pud, addr); do { pmd[0] = __pmd(__pfn_to_phys(pfn) | type->prot_sect); @@ -222,6 +226,7 @@ remap_area_supersections(unsigned long virt, unsigned long pfn, { unsigned long addr = virt, end = virt + size; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -232,7 +237,8 @@ remap_area_supersections(unsigned long virt, unsigned long pfn, unmap_area_sections(virt, size); pgd = pgd_offset_k(virt); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + pud = pud_offset(p4d, addr); pmd = pmd_offset(pud, addr); do { unsigned long super_pmd_val, i; diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h index 88c121ac14b3..4f1f72b75890 100644 --- a/arch/arm/mm/mm.h +++ b/arch/arm/mm/mm.h @@ -38,7 +38,7 @@ static inline pte_t get_top_pte(unsigned long va) static inline pmd_t *pmd_off_k(unsigned long virt) { - return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt); + return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(virt), virt), virt), virt); } struct mem_type { diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index ec8d0008bfa1..c425288f1a86 100644 --- a/arch/arm/mm/mmu.c +++ 
b/arch/arm/mm/mmu.c @@ -357,7 +357,8 @@ static pte_t *pte_offset_late_fixmap(pmd_t *dir, unsigned long addr) static inline pmd_t * __init fixmap_pmd(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); return pmd; @@ -801,12 +802,12 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, } while (pmd++, addr = next, addr != end); } -static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, +static void __init alloc_init_pud(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys, const struct mem_type *type, void *(*alloc)(unsigned long sz), bool ng) { - pud_t *pud = pud_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); unsigned long next; do { @@ -816,6 +817,21 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, } while (pud++, addr = next, addr != end); } +static void __init alloc_init_p4d(pgd_t *pgd, unsigned long addr, + unsigned long end, phys_addr_t phys, + const struct mem_type *type, + void *(*alloc)(unsigned long sz), bool ng) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + unsigned long next; + + do { + next = p4d_addr_end(addr, end); + alloc_init_pud(p4d, addr, next, phys, type, alloc, ng); + phys += next - addr; + } while (p4d++, addr = next, addr != end); +} + #ifndef CONFIG_ARM_LPAE static void __init create_36bit_mapping(struct mm_struct *mm, struct map_desc *md, @@ -863,7 +879,8 @@ static void __init create_36bit_mapping(struct mm_struct *mm, pgd = pgd_offset(mm, addr); end = addr + length; do { - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); int i; @@ -914,7 +931,7 @@ static void __init __create_mapping(struct mm_struct *mm, struct map_desc *md, do { unsigned long next = pgd_addr_end(addr, end); - alloc_init_pud(pgd, addr, next, phys, type, alloc, 
ng); + alloc_init_p4d(pgd, addr, next, phys, type, alloc, ng); phys += next - addr; addr = next; @@ -950,7 +967,13 @@ void __init create_mapping_late(struct mm_struct *mm, struct map_desc *md, bool ng) { #ifdef CONFIG_ARM_LPAE - pud_t *pud = pud_alloc(mm, pgd_offset(mm, md->virtual), md->virtual); + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_alloc(mm, pgd_offset(mm, md->virtual), md->virtual); + if (WARN_ON(!p4d)) + return; + pud = pud_alloc(mm, p4d, md->virtual); if (WARN_ON(!pud)) return; pmd_alloc(mm, pud, 0); diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 478bd2c6aa50..c5e1b27046a8 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -30,6 +30,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *new_pgd, *init_pgd; + p4d_t *new_p4d, *init_p4d; pud_t *new_pud, *init_pud; pmd_t *new_pmd, *init_pmd; pte_t *new_pte, *init_pte; @@ -53,8 +54,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) /* * Allocate PMD table for modules and pkmap mappings. */ - new_pud = pud_alloc(mm, new_pgd + pgd_index(MODULES_VADDR), + new_p4d = p4d_alloc(mm, new_pgd + pgd_index(MODULES_VADDR), MODULES_VADDR); + if (!new_p4d) + goto no_p4d; + + new_pud = pud_alloc(mm, new_p4d, MODULES_VADDR); if (!new_pud) goto no_pud; @@ -69,7 +74,11 @@ pgd_t *pgd_alloc(struct mm_struct *mm) * contains the machine vectors. The vectors are always high * with LPAE.
*/ - new_pud = pud_alloc(mm, new_pgd, 0); + new_p4d = p4d_alloc(mm, new_pgd, 0); + if (!new_p4d) + goto no_p4d; + + new_pud = pud_alloc(mm, new_p4d, 0); if (!new_pud) goto no_pud; @@ -91,7 +100,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_val(*new_pmd) |= PMD_DOMAIN(DOMAIN_VECTORS); #endif - init_pud = pud_offset(init_pgd, 0); + init_p4d = p4d_offset(init_pgd, 0); + init_pud = pud_offset(init_p4d, 0); init_pmd = pmd_offset(init_pud, 0); init_pte = pte_offset_map(init_pmd, 0); set_pte_ext(new_pte + 0, init_pte[0], 0); @@ -108,6 +118,8 @@ no_pte: no_pmd: pud_free(mm, new_pud); no_pud: + p4d_free(mm, new_p4d); +no_p4d: __pgd_free(new_pgd); no_pgd: return NULL; @@ -116,6 +128,7 @@ no_pgd: void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgtable_t pte; @@ -127,7 +140,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) if (pgd_none_or_clear_bad(pgd)) goto no_pgd; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + if (p4d_none_or_clear_bad(p4d)) + goto no_p4d; + + pud = pud_offset(p4d, 0); if (pud_none_or_clear_bad(pud)) goto no_pud; @@ -144,8 +161,11 @@ no_pmd: pmd_free(mm, pmd); mm_dec_nr_pmds(mm); no_pud: - pgd_clear(pgd); + p4d_clear(p4d); pud_free(mm, pud); +no_p4d: + pgd_clear(pgd); + p4d_free(mm, p4d); no_pgd: #ifdef CONFIG_ARM_LPAE /* @@ -156,15 +176,21 @@ no_pgd: continue; if (pgd_val(*pgd) & L_PGD_SWAPPER) continue; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + if (p4d_none_or_clear_bad(p4d)) + continue; + pud = pud_offset(p4d, 0); if (pud_none_or_clear_bad(pud)) continue; pmd = pmd_offset(pud, 0); pud_clear(pud); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); - pgd_clear(pgd); + p4d_clear(p4d); pud_free(mm, pud); + mm_dec_nr_puds(mm); + pgd_clear(pgd); + p4d_free(mm, p4d); } #endif __pgd_free(pgd_base); -- cgit v1.2.3 From e9f6376858b9799148d07e58b72b681d4b8fa4c7 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:46:23 -0700 Subject: arm64: add support for folded p4d 
page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate, replace 5level-fixup.h with pgtable-nop4d.h and remove __ARCH_USE_5LEVEL_HACK. [arnd@arndb.de: fix gcc-10 shift warning] Link: http://lkml.kernel.org/r/20200429185657.4085975-1-arnd@arndb.de Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-4-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/kvm_mmu.h | 10 +- arch/arm64/include/asm/pgalloc.h | 10 +- arch/arm64/include/asm/pgtable-types.h | 5 +- arch/arm64/include/asm/pgtable.h | 37 +++--- arch/arm64/include/asm/stage2_pgtable.h | 48 ++++++-- arch/arm64/kernel/hibernate.c | 44 +++++-- arch/arm64/kvm/mmu.c | 209 +++++++++++++++++++++++++++----- arch/arm64/mm/fault.c | 9 +- arch/arm64/mm/hugetlbpage.c | 15 ++- arch/arm64/mm/kasan_init.c | 26 ++-- arch/arm64/mm/mmu.c | 52 +++++--- arch/arm64/mm/pageattr.c | 7 +- 12 files changed, 368 insertions(+), 104 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 324c8483d2b9..f1a74163d764 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -172,8 +172,8 @@ void kvm_clear_hyp_idmap(void); __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE) #define kvm_mk_pud(pmdp) \ __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE) -#define kvm_mk_pgd(pudp) \ - __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) +#define kvm_mk_p4d(pmdp) \ + __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE) 
#define kvm_set_pud(pudp, pud) set_pud(pudp, pud) @@ -299,6 +299,12 @@ static inline bool kvm_s2pud_young(pud_t pud) #define hyp_pud_table_empty(pudp) kvm_page_empty(pudp) #endif +#ifdef __PAGETABLE_P4D_FOLDED +#define hyp_p4d_table_empty(p4dp) (0) +#else +#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp) +#endif + struct kvm; #define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 172d76fa0245..58e93583ddb6 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -73,17 +73,17 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pudp) free_page((unsigned long)pudp); } -static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot) +static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) { - set_pgd(pgdp, __pgd(__phys_to_pgd_val(pudp) | prot)); + set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot)); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, pud_t *pudp) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) { - __pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE); + __p4d_populate(p4dp, __pa(pudp), PUD_TYPE_TABLE); } #else -static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot) +static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) { BUILD_BUG(); } diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h index acb0751a6606..b8f158ae2527 100644 --- a/arch/arm64/include/asm/pgtable-types.h +++ b/arch/arm64/include/asm/pgtable-types.h @@ -14,6 +14,7 @@ typedef u64 pteval_t; typedef u64 pmdval_t; typedef u64 pudval_t; +typedef u64 p4dval_t; typedef u64 pgdval_t; /* @@ -44,13 +45,11 @@ typedef struct { pteval_t pgprot; } pgprot_t; #define __pgprot(x) ((pgprot_t) { (x) } ) #if CONFIG_PGTABLE_LEVELS == 2 -#define __ARCH_USE_5LEVEL_HACK #include #elif 
CONFIG_PGTABLE_LEVELS == 3 -#define __ARCH_USE_5LEVEL_HACK #include #elif CONFIG_PGTABLE_LEVELS == 4 -#include +#include #endif #endif /* __ASM_PGTABLE_TYPES_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9ce000f22d9e..1f3218fc52fc 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -298,6 +298,11 @@ static inline pte_t pgd_pte(pgd_t pgd) return __pte(pgd_val(pgd)); } +static inline pte_t p4d_pte(p4d_t p4d) +{ + return __pte(p4d_val(p4d)); +} + static inline pte_t pud_pte(pud_t pud) { return __pte(pud_val(pud)); @@ -401,6 +406,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) +#define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d)) +#define __phys_to_p4d_val(phys) __phys_to_pte_val(phys) + #define __pgd_to_phys(pgd) __pte_to_phys(pgd_pte(pgd)) #define __phys_to_pgd_val(phys) __phys_to_pte_val(phys) @@ -592,49 +600,50 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud)) -#define pgd_none(pgd) (!pgd_val(pgd)) -#define pgd_bad(pgd) (!(pgd_val(pgd) & 2)) -#define pgd_present(pgd) (pgd_val(pgd)) +#define p4d_none(p4d) (!p4d_val(p4d)) +#define p4d_bad(p4d) (!(p4d_val(p4d) & 2)) +#define p4d_present(p4d) (p4d_val(p4d)) -static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { - if (in_swapper_pgdir(pgdp)) { - set_swapper_pgd(pgdp, pgd); + if (in_swapper_pgdir(p4dp)) { + set_swapper_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d))); return; } - WRITE_ONCE(*pgdp, pgd); + WRITE_ONCE(*p4dp, p4d); dsb(ishst); isb(); } -static inline void pgd_clear(pgd_t *pgdp) +static inline void p4d_clear(p4d_t *p4dp) { - set_pgd(pgdp, __pgd(0)); + set_p4d(p4dp, __p4d(0)); } -static inline phys_addr_t pgd_page_paddr(pgd_t pgd) +static inline phys_addr_t p4d_page_paddr(p4d_t p4d) { - return __pgd_to_phys(pgd); + return 
__p4d_to_phys(p4d); } /* Find an entry in the frst-level page table. */ #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) -#define pud_offset_phys(dir, addr) (pgd_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) +#define pud_offset_phys(dir, addr) (p4d_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) #define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) -#define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) +#define pud_set_fixmap_offset(p4d, addr) pud_set_fixmap(pud_offset_phys(p4d, addr)) #define pud_clear_fixmap() clear_fixmap(FIX_PUD) -#define pgd_page(pgd) phys_to_page(__pgd_to_phys(pgd)) +#define p4d_page(p4d) pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d))) /* use ONLY for statically allocated translation tables */ #define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) #else +#define p4d_page_paddr(p4d) ({ BUILD_BUG(); 0;}) #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) /* Match pud_offset folding in */ diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 326aac658b9d..9a364aeae5fb 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -68,41 +68,67 @@ static inline bool kvm_stage2_has_pud(struct kvm *kvm) #define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) -static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd) +#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) +#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) +#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) +#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d) + +static inline p4d_t *stage2_p4d_offset(struct kvm *kvm, + pgd_t *pgd, unsigned long address) +{ + return p4d_offset(pgd, address); +} + +static inline void stage2_p4d_free(struct 
kvm *kvm, p4d_t *p4d) +{ +} + +static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp) +{ + return false; +} + +static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm, + phys_addr_t addr, phys_addr_t end) +{ + return end; +} + +static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d) { if (kvm_stage2_has_pud(kvm)) - return pgd_none(pgd); + return p4d_none(p4d); else return 0; } -static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp) +static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp) { if (kvm_stage2_has_pud(kvm)) - pgd_clear(pgdp); + p4d_clear(p4dp); } -static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd) +static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d) { if (kvm_stage2_has_pud(kvm)) - return pgd_present(pgd); + return p4d_present(p4d); else return 1; } -static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud) +static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud) { if (kvm_stage2_has_pud(kvm)) - pgd_populate(NULL, pgd, pud); + p4d_populate(NULL, p4d, pud); } static inline pud_t *stage2_pud_offset(struct kvm *kvm, - pgd_t *pgd, unsigned long address) + p4d_t *p4d, unsigned long address) { if (kvm_stage2_has_pud(kvm)) - return pud_offset(pgd, address); + return pud_offset(p4d, address); else - return (pud_t *)pgd; + return (pud_t *)p4d; } static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 5b73e92c99e3..a8a4b55f3a09 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -184,6 +184,7 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pgprot_t pgprot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -196,7 +197,15 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pgd_populate(&init_mm, pgdp, pudp); } - pudp = pud_offset(pgdp, dst_addr); + p4dp = p4d_offset(pgdp, 
dst_addr); + if (p4d_none(READ_ONCE(*p4dp))) { + pudp = (void *)get_safe_page(GFP_ATOMIC); + if (!pudp) + return -ENOMEM; + p4d_populate(&init_mm, p4dp, pudp); + } + + pudp = pud_offset(p4dp, dst_addr); if (pud_none(READ_ONCE(*pudp))) { pmdp = (void *)get_safe_page(GFP_ATOMIC); if (!pmdp) @@ -419,7 +428,7 @@ static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, return 0; } -static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, +static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, unsigned long end) { pud_t *dst_pudp; @@ -427,15 +436,15 @@ static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, unsigned long next; unsigned long addr = start; - if (pgd_none(READ_ONCE(*dst_pgdp))) { + if (p4d_none(READ_ONCE(*dst_p4dp))) { dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); if (!dst_pudp) return -ENOMEM; - pgd_populate(&init_mm, dst_pgdp, dst_pudp); + p4d_populate(&init_mm, dst_p4dp, dst_pudp); } - dst_pudp = pud_offset(dst_pgdp, start); + dst_pudp = pud_offset(dst_p4dp, start); - src_pudp = pud_offset(src_pgdp, start); + src_pudp = pud_offset(src_p4dp, start); do { pud_t pud = READ_ONCE(*src_pudp); @@ -454,6 +463,27 @@ static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, return 0; } +static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, + unsigned long end) +{ + p4d_t *dst_p4dp; + p4d_t *src_p4dp; + unsigned long next; + unsigned long addr = start; + + dst_p4dp = p4d_offset(dst_pgdp, start); + src_p4dp = p4d_offset(src_pgdp, start); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(READ_ONCE(*src_p4dp))) + continue; + if (copy_pud(dst_p4dp, src_p4dp, addr, next)) + return -ENOMEM; + } while (dst_p4dp++, src_p4dp++, addr = next, addr != end); + + return 0; +} + static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, unsigned long end) { @@ -466,7 +496,7 @@ static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, next = 
pgd_addr_end(addr, end); if (pgd_none(READ_ONCE(*src_pgdp))) continue; - if (copy_pud(dst_pgdp, src_pgdp, addr, next)) + if (copy_p4d(dst_pgdp, src_pgdp, addr, next)) return -ENOMEM; } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a1f6bc70c4e4..290154e32c0b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -158,13 +158,22 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { - pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); + p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL); stage2_pgd_clear(kvm, pgd); kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pud_free(kvm, pud_table); + stage2_p4d_free(kvm, p4d_table); put_page(virt_to_page(pgd)); } +static void clear_stage2_p4d_entry(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr) +{ + pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0); + stage2_p4d_clear(kvm, p4d); + kvm_tlb_flush_vmid_ipa(kvm, addr); + stage2_pud_free(kvm, pud_table); + put_page(virt_to_page(p4d)); +} + static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); @@ -208,12 +217,20 @@ static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) dsb(ishst); } -static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp) +static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp) { - WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp)); + WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp)); dsb(ishst); } +static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) +{ +#ifndef __PAGETABLE_P4D_FOLDED + WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp)); + dsb(ishst); +#endif +} + /* * Unmapping vs dcache management: * @@ -293,13 +310,13 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, clear_stage2_pud_entry(kvm, pud, start_addr); } -static void 
unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, +static void unmap_stage2_puds(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { phys_addr_t next, start_addr = addr; pud_t *pud, *start_pud; - start_pud = pud = stage2_pud_offset(kvm, pgd, addr); + start_pud = pud = stage2_pud_offset(kvm, p4d, addr); do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { @@ -317,6 +334,23 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, } while (pud++, addr = next, addr != end); if (stage2_pud_table_empty(kvm, start_pud)) + clear_stage2_p4d_entry(kvm, p4d, start_addr); +} + +static void unmap_stage2_p4ds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next, start_addr = addr; + p4d_t *p4d, *start_p4d; + + start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr); + do { + next = stage2_p4d_addr_end(kvm, addr, end); + if (!stage2_p4d_none(kvm, *p4d)) + unmap_stage2_puds(kvm, p4d, addr, next); + } while (p4d++, addr = next, addr != end); + + if (stage2_p4d_table_empty(kvm, start_p4d)) clear_stage2_pgd_entry(kvm, pgd, start_addr); } @@ -351,7 +385,7 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) break; next = stage2_pgd_addr_end(kvm, addr, end); if (!stage2_pgd_none(kvm, *pgd)) - unmap_stage2_puds(kvm, pgd, addr, next); + unmap_stage2_p4ds(kvm, pgd, addr, next); /* * If the range is too large, release the kvm->mmu_lock * to prevent starvation and lockup detector warnings. 
@@ -391,13 +425,13 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, } while (pmd++, addr = next, addr != end); } -static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, +static void stage2_flush_puds(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(kvm, pgd, addr); + pud = stage2_pud_offset(kvm, p4d, addr); do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { @@ -409,6 +443,20 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, } while (pud++, addr = next, addr != end); } +static void stage2_flush_p4ds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + p4d_t *p4d; + phys_addr_t next; + + p4d = stage2_p4d_offset(kvm, pgd, addr); + do { + next = stage2_p4d_addr_end(kvm, addr, end); + if (!stage2_p4d_none(kvm, *p4d)) + stage2_flush_puds(kvm, p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + static void stage2_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -421,7 +469,7 @@ static void stage2_flush_memslot(struct kvm *kvm, do { next = stage2_pgd_addr_end(kvm, addr, end); if (!stage2_pgd_none(kvm, *pgd)) - stage2_flush_puds(kvm, pgd, addr, next); + stage2_flush_p4ds(kvm, pgd, addr, next); if (next != end) cond_resched_lock(&kvm->mmu_lock); @@ -454,12 +502,21 @@ static void stage2_flush_vm(struct kvm *kvm) static void clear_hyp_pgd_entry(pgd_t *pgd) { - pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); + p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL); pgd_clear(pgd); - pud_free(NULL, pud_table); + p4d_free(NULL, p4d_table); put_page(virt_to_page(pgd)); } +static void clear_hyp_p4d_entry(p4d_t *p4d) +{ + pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL); + VM_BUG_ON(p4d_huge(*p4d)); + p4d_clear(p4d); + pud_free(NULL, pud_table); + put_page(virt_to_page(p4d)); +} + static void clear_hyp_pud_entry(pud_t *pud) { pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); @@ 
-511,12 +568,12 @@ static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) clear_hyp_pud_entry(pud); } -static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { phys_addr_t next; pud_t *pud, *start_pud; - start_pud = pud = pud_offset(pgd, addr); + start_pud = pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); /* Hyp doesn't use huge puds */ @@ -525,6 +582,23 @@ static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) } while (pud++, addr = next, addr != end); if (hyp_pud_table_empty(start_pud)) + clear_hyp_p4d_entry(p4d); +} + +static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + p4d_t *p4d, *start_p4d; + + start_p4d = p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + /* Hyp doesn't use huge p4ds */ + if (!p4d_none(*p4d)) + unmap_hyp_puds(p4d, addr, next); + } while (p4d++, addr = next, addr != end); + + if (hyp_p4d_table_empty(start_p4d)) clear_hyp_pgd_entry(pgd); } @@ -548,7 +622,7 @@ static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, do { next = pgd_addr_end(addr, end); if (!pgd_none(*pgd)) - unmap_hyp_puds(pgd, addr, next); + unmap_hyp_p4ds(pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -658,7 +732,7 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, return 0; } -static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, +static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start, unsigned long end, unsigned long pfn, pgprot_t prot) { @@ -669,7 +743,7 @@ static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, addr = start; do { - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (pud_none_or_clear_bad(pud)) { pmd = pmd_alloc_one(NULL, addr); @@ -691,12 +765,45 @@ static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, return 0; } +static int 
create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + p4d_t *p4d; + pud_t *pud; + unsigned long addr, next; + int ret; + + addr = start; + do { + p4d = p4d_offset(pgd, addr); + + if (p4d_none(*p4d)) { + pud = pud_alloc_one(NULL, addr); + if (!pud) { + kvm_err("Cannot allocate Hyp pud\n"); + return -ENOMEM; + } + kvm_p4d_populate(p4d, pud); + get_page(virt_to_page(p4d)); + } + + next = p4d_addr_end(addr, end); + ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot); + if (ret) + return ret; + pfn += (next - addr) >> PAGE_SHIFT; + } while (addr = next, addr != end); + + return 0; +} + static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, unsigned long start, unsigned long end, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; unsigned long addr, next; int err = 0; @@ -707,18 +814,18 @@ static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); if (pgd_none(*pgd)) { - pud = pud_alloc_one(NULL, addr); - if (!pud) { - kvm_err("Cannot allocate Hyp pud\n"); + p4d = p4d_alloc_one(NULL, addr); + if (!p4d) { + kvm_err("Cannot allocate Hyp p4d\n"); err = -ENOMEM; goto out; } - kvm_pgd_populate(pgd, pud); + kvm_pgd_populate(pgd, p4d); get_page(virt_to_page(pgd)); } next = pgd_addr_end(addr, end); - err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot); + err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot); if (err) goto out; pfn += (next - addr) >> PAGE_SHIFT; @@ -1015,22 +1122,40 @@ void kvm_free_stage2_pgd(struct kvm *kvm) free_pages_exact(pgd, stage2_pgd_size(kvm)); } -static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, +static p4d_t *stage2_get_p4d(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, phys_addr_t addr) { pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); if (stage2_pgd_none(kvm, *pgd)) { if (!cache) 
return NULL; - pud = mmu_memory_cache_alloc(cache); - stage2_pgd_populate(kvm, pgd, pud); + p4d = mmu_memory_cache_alloc(cache); + stage2_pgd_populate(kvm, pgd, p4d); get_page(virt_to_page(pgd)); } - return stage2_pud_offset(kvm, pgd, addr); + return stage2_p4d_offset(kvm, pgd, addr); +} + +static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) +{ + p4d_t *p4d; + pud_t *pud; + + p4d = stage2_get_p4d(kvm, cache, addr); + if (stage2_p4d_none(kvm, *p4d)) { + if (!cache) + return NULL; + pud = mmu_memory_cache_alloc(cache); + stage2_p4d_populate(kvm, p4d, pud); + get_page(virt_to_page(p4d)); + } + + return stage2_pud_offset(kvm, p4d, addr); } static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -1423,18 +1548,18 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, } /** - * stage2_wp_puds - write protect PGD range + * stage2_wp_puds - write protect P4D range * @pgd: pointer to pgd entry * @addr: range start address * @end: range end address */ -static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, +static void stage2_wp_puds(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(kvm, pgd, addr); + pud = stage2_pud_offset(kvm, p4d, addr); do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { @@ -1448,6 +1573,26 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, } while (pud++, addr = next, addr != end); } +/** + * stage2_wp_p4ds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_p4ds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + p4d_t *p4d; + phys_addr_t next; + + p4d = stage2_p4d_offset(kvm, pgd, addr); + do { + next = stage2_p4d_addr_end(kvm, addr, end); + if (!stage2_p4d_none(kvm, *p4d)) + stage2_wp_puds(kvm, p4d, addr, next); + } while (p4d++, addr = next, addr 
!= end); +} + /** * stage2_wp_range() - write protect stage2 memory region range * @kvm: The KVM pointer @@ -1475,7 +1620,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) break; next = stage2_pgd_addr_end(kvm, addr, end); if (stage2_pgd_present(kvm, *pgd)) - stage2_wp_puds(kvm, pgd, addr, next); + stage2_wp_p4ds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index dff2d72b0883..df8ae73d950b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -145,6 +145,7 @@ static void show_pte(unsigned long addr) pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd)); do { + p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; @@ -152,7 +153,13 @@ static void show_pte(unsigned long addr) if (pgd_none(pgd) || pgd_bad(pgd)) break; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + p4d = READ_ONCE(*p4dp); + pr_cont(", p4d=%016llx", p4d_val(p4d)); + if (p4d_none(p4d) || p4d_bad(p4d)) + break; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); pr_cont(", pud=%016llx", pud_val(pud)); if (pud_none(pud) || pud_bad(pud)) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 07f154b8b84a..0a52ce46f020 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -67,11 +67,13 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, size_t *pgsize) { pgd_t *pgdp = pgd_offset(mm, addr); + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; *pgsize = PAGE_SIZE; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + pudp = pud_offset(p4dp, addr); pmdp = pmd_offset(pudp, addr); if ((pte_t *)pmdp == ptep) { *pgsize = PMD_SIZE; @@ -217,12 +219,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep = NULL; pgdp = pgd_offset(mm, addr); - pudp = pud_alloc(mm, pgdp, 
addr); + p4dp = p4d_offset(pgdp, addr); + pudp = pud_alloc(mm, p4dp, addr); if (!pudp) return NULL; @@ -261,6 +265,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp, pud; pmd_t *pmdp, pmd; @@ -268,7 +273,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, if (!pgd_present(READ_ONCE(*pgdp))) return NULL; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (!p4d_present(READ_ONCE(*p4dp))) + return NULL; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (sz != PUD_SIZE && pud_none(pud)) return NULL; diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index f87a32484ea8..2339811f317b 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -84,17 +84,17 @@ static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, return early ? pmd_offset_kimg(pudp, addr) : pmd_offset(pudp, addr); } -static pud_t *__init kasan_pud_offset(pgd_t *pgdp, unsigned long addr, int node, +static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, bool early) { - if (pgd_none(READ_ONCE(*pgdp))) { + if (p4d_none(READ_ONCE(*p4dp))) { phys_addr_t pud_phys = early ? __pa_symbol(kasan_early_shadow_pud) : kasan_alloc_zeroed_page(node); - __pgd_populate(pgdp, pud_phys, PMD_TYPE_TABLE); + __p4d_populate(p4dp, pud_phys, PMD_TYPE_TABLE); } - return early ? pud_offset_kimg(pgdp, addr) : pud_offset(pgdp, addr); + return early ? 
pud_offset_kimg(p4dp, addr) : pud_offset(p4dp, addr); } static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, @@ -126,11 +126,11 @@ static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, } while (pmdp++, addr = next, addr != end && pmd_none(READ_ONCE(*pmdp))); } -static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr, +static void __init kasan_pud_populate(p4d_t *p4dp, unsigned long addr, unsigned long end, int node, bool early) { unsigned long next; - pud_t *pudp = kasan_pud_offset(pgdp, addr, node, early); + pud_t *pudp = kasan_pud_offset(p4dp, addr, node, early); do { next = pud_addr_end(addr, end); @@ -138,6 +138,18 @@ static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr, } while (pudp++, addr = next, addr != end && pud_none(READ_ONCE(*pudp))); } +static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr, + unsigned long end, int node, bool early) +{ + unsigned long next; + p4d_t *p4dp = p4d_offset(pgdp, addr); + + do { + next = p4d_addr_end(addr, end); + kasan_pud_populate(p4dp, addr, next, node, early); + } while (p4dp++, addr = next, addr != end); +} + static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, int node, bool early) { @@ -147,7 +159,7 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, pgdp = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - kasan_pud_populate(pgdp, addr, next, node, early); + kasan_p4d_populate(pgdp, addr, next, node, early); } while (pgdp++, addr = next, addr != end); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index c299b73dd5e4..e7fbc6275329 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -290,18 +290,19 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, { unsigned long next; pud_t *pudp; - pgd_t pgd = READ_ONCE(*pgdp); + p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t p4d = READ_ONCE(*p4dp); - if (pgd_none(pgd)) { + if (p4d_none(p4d)) { 
phys_addr_t pud_phys; BUG_ON(!pgtable_alloc); pud_phys = pgtable_alloc(PUD_SHIFT); - __pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE); - pgd = READ_ONCE(*pgdp); + __p4d_populate(p4dp, pud_phys, PUD_TYPE_TABLE); + p4d = READ_ONCE(*p4dp); } - BUG_ON(pgd_bad(pgd)); + BUG_ON(p4d_bad(p4d)); - pudp = pud_set_fixmap_offset(pgdp, addr); + pudp = pud_set_fixmap_offset(p4dp, addr); do { pud_t old_pud = READ_ONCE(*pudp); @@ -672,6 +673,7 @@ static void __init map_kernel(pgd_t *pgdp) READ_ONCE(*pgd_offset_k(FIXADDR_START))); } else if (CONFIG_PGTABLE_LEVELS > 3) { pgd_t *bm_pgdp; + p4d_t *bm_p4dp; pud_t *bm_pudp; /* * The fixmap shares its top level pgd entry with the kernel @@ -681,7 +683,8 @@ static void __init map_kernel(pgd_t *pgdp) */ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); bm_pgdp = pgd_offset_raw(pgdp, FIXADDR_START); - bm_pudp = pud_set_fixmap_offset(bm_pgdp, FIXADDR_START); + bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_START); + bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_START); pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd)); pud_clear_fixmap(); } else { @@ -715,6 +718,7 @@ void __init paging_init(void) int kern_addr_valid(unsigned long addr) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; @@ -726,7 +730,11 @@ int kern_addr_valid(unsigned long addr) if (pgd_none(READ_ONCE(*pgdp))) return 0; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (p4d_none(READ_ONCE(*p4dp))) + return 0; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (pud_none(pud)) return 0; @@ -1069,6 +1077,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, unsigned long addr = start; unsigned long next; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; @@ -1079,7 +1088,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, if (!pgdp) return -ENOMEM; - pudp = vmemmap_pud_populate(pgdp, addr, node); + p4dp = vmemmap_p4d_populate(pgdp, addr, node); + if (!p4dp) + 
return -ENOMEM; + + pudp = vmemmap_pud_populate(p4dp, addr, node); if (!pudp) return -ENOMEM; @@ -1114,11 +1127,12 @@ void vmemmap_free(unsigned long start, unsigned long end, static inline pud_t * fixmap_pud(unsigned long addr) { pgd_t *pgdp = pgd_offset_k(addr); - pgd_t pgd = READ_ONCE(*pgdp); + p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t p4d = READ_ONCE(*p4dp); - BUG_ON(pgd_none(pgd) || pgd_bad(pgd)); + BUG_ON(p4d_none(p4d) || p4d_bad(p4d)); - return pud_offset_kimg(pgdp, addr); + return pud_offset_kimg(p4dp, addr); } static inline pmd_t * fixmap_pmd(unsigned long addr) @@ -1144,25 +1158,27 @@ static inline pte_t * fixmap_pte(unsigned long addr) */ void __init early_fixmap_init(void) { - pgd_t *pgdp, pgd; + pgd_t *pgdp; + p4d_t *p4dp, p4d; pud_t *pudp; pmd_t *pmdp; unsigned long addr = FIXADDR_START; pgdp = pgd_offset_k(addr); - pgd = READ_ONCE(*pgdp); + p4dp = p4d_offset(pgdp, addr); + p4d = READ_ONCE(*p4dp); if (CONFIG_PGTABLE_LEVELS > 3 && - !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) { + !(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) { /* * We only end up here if the kernel mapping and the fixmap * share the top level pgd entry, which should only happen on * 16k/4 levels configurations. 
*/ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); - pudp = pud_offset_kimg(pgdp, addr); + pudp = pud_offset_kimg(p4dp, addr); } else { - if (pgd_none(pgd)) - __pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE); + if (p4d_none(p4d)) + __p4d_populate(p4dp, __pa_symbol(bm_pud), PUD_TYPE_TABLE); pudp = fixmap_pud(addr); } if (pud_none(READ_ONCE(*pudp))) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index bde08090b838..4175bcb8ccb3 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -198,6 +198,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) bool kernel_page_present(struct page *page) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep; @@ -210,7 +211,11 @@ bool kernel_page_present(struct page *page) if (pgd_none(READ_ONCE(*pgdp))) return false; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (p4d_none(READ_ONCE(*p4dp))) + return false; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (pud_none(pud)) return false; -- cgit v1.2.3 From 00b13def5c03525c25084441f2b5f974a8751dad Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:46:27 -0700 Subject: hexagon: remove __ARCH_USE_5LEVEL_HACK The hexagon architecture has 2 level page tables and as such most of the page table folding is already implemented in asm-generic/pgtable-nopmd.h. Fixup the only place in arch/hexagon to unfold the p4d level and remove __ARCH_USE_5LEVEL_HACK. 
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-5-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/hexagon/include/asm/fixmap.h | 4 ++-- arch/hexagon/include/asm/pgtable.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/hexagon/include/asm/fixmap.h b/arch/hexagon/include/asm/fixmap.h index 933dac167504..97b1b062e750 100644 --- a/arch/hexagon/include/asm/fixmap.h +++ b/arch/hexagon/include/asm/fixmap.h @@ -16,7 +16,7 @@ #include #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), \ - (vaddr)), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr), \ + (vaddr)), (vaddr)), (vaddr)), (vaddr)) #endif diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index d383e8bea5b2..2a17d4eb2fa4 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -12,7 +12,6 @@ * Page table definitions for Qualcomm Hexagon processor. */ #include -#define __ARCH_USE_5LEVEL_HACK #include /* A handy thing to have if one has the RAM. 
Declared in head.S */ -- cgit v1.2.3 From c03ab9e32a2c4fadbb65a07e88fbb4a25f9ab9d6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:46:31 -0700 Subject: ia64: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate, remove usage of __ARCH_USE_5LEVEL_HACK and replace 5level-fixup.h with pgtable-nop4d.h Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-6-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/pgalloc.h | 4 ++-- arch/ia64/include/asm/pgtable.h | 17 ++++++++--------- arch/ia64/mm/fault.c | 7 ++++++- arch/ia64/mm/hugetlbpage.c | 18 ++++++++++++------ arch/ia64/mm/init.c | 28 ++++++++++++++++++++++++---- 5 files changed, 52 insertions(+), 22 deletions(-) diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index f4c491044882..2a3050345099 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -36,9 +36,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #if CONFIG_PGTABLE_LEVELS == 4 static inline void -pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud) +p4d_populate(struct mm_struct *mm, p4d_t * p4d_entry, pud_t * pud) { - pgd_val(*pgd_entry) = __pa(pud); + p4d_val(*p4d_entry) = __pa(pud); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 
0e7b645b76c6..787b0a91d255 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -283,12 +283,12 @@ extern unsigned long VMALLOC_END; #define pud_page(pud) virt_to_page((pud_val(pud) + PAGE_OFFSET)) #if CONFIG_PGTABLE_LEVELS == 4 -#define pgd_none(pgd) (!pgd_val(pgd)) -#define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd))) -#define pgd_present(pgd) (pgd_val(pgd) != 0UL) -#define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & _PFN_MASK)) -#define pgd_page(pgd) virt_to_page((pgd_val(pgd) + PAGE_OFFSET)) +#define p4d_none(p4d) (!p4d_val(p4d)) +#define p4d_bad(p4d) (!ia64_phys_addr_valid(p4d_val(p4d))) +#define p4d_present(p4d) (p4d_val(p4d) != 0UL) +#define p4d_clear(p4dp) (p4d_val(*(p4dp)) = 0UL) +#define p4d_page_vaddr(p4d) ((unsigned long) __va(p4d_val(p4d) & _PFN_MASK)) +#define p4d_page(p4d) virt_to_page((p4d_val(p4d) + PAGE_OFFSET)) #endif /* @@ -386,7 +386,7 @@ pgd_offset (const struct mm_struct *mm, unsigned long address) #if CONFIG_PGTABLE_LEVELS == 4 /* Find an entry in the second-level page table.. */ #define pud_offset(dir,addr) \ - ((pud_t *) pgd_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) + ((pud_t *) p4d_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) #endif /* Find an entry in the third-level page table.. 
*/ @@ -580,10 +580,9 @@ extern struct page *zero_page_memmap_ptr; #if CONFIG_PGTABLE_LEVELS == 3 -#define __ARCH_USE_5LEVEL_HACK #include #endif -#include <asm-generic/5level-fixup.h> +#include <asm-generic/pgtable-nop4d.h> #include #endif /* _ASM_IA64_PGTABLE_H */ diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 30d0c1fca99e..12242aa0dad1 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -29,6 +29,7 @@ static int mapped_kernel_page_is_present (unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; @@ -37,7 +38,11 @@ mapped_kernel_page_is_present (unsigned long address) if (pgd_none(*pgd) || pgd_bad(*pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || p4d_bad(*p4d)) + return 0; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_bad(*pud)) return 0; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index d16e419fd712..32352a73df0c 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -30,12 +30,14 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, taddr); - pud = pud_alloc(mm, pgd, taddr); + p4d = p4d_offset(pgd, taddr); + pud = pud_alloc(mm, p4d, taddr); if (pud) { pmd = pmd_alloc(mm, pud, taddr); if (pmd) @@ -49,17 +51,21 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, taddr); if (pgd_present(*pgd)) { - pud = pud_offset(pgd, taddr); - if (pud_present(*pud)) { - pmd = pmd_offset(pud, taddr); - if (pmd_present(*pmd)) - pte = pte_offset_map(pmd, taddr); + p4d = p4d_offset(pgd, addr); + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, taddr); + if (pud_present(*pud)) { + pmd = pmd_offset(pud, taddr); + if (pmd_present(*pmd)) +
pte = pte_offset_map(pmd, taddr); + } } } diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d637b4ea3147..ca760f6cb18f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -208,6 +208,7 @@ static struct page * __init put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -215,7 +216,10 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! */ { - pud = pud_alloc(&init_mm, pgd, address); + p4d = p4d_alloc(&init_mm, pgd, address); + if (!p4d) + goto out; + pud = pud_alloc(&init_mm, p4d, address); if (!pud) goto out; pmd = pmd_alloc(&init_mm, pud, address); @@ -382,6 +386,7 @@ int vmemmap_find_next_valid_pfn(int node, int i) do { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -392,7 +397,13 @@ int vmemmap_find_next_valid_pfn(int node, int i) continue; } - pud = pud_offset(pgd, end_address); + p4d = p4d_offset(pgd, end_address); + if (p4d_none(*p4d)) { + end_address += P4D_SIZE; + continue; + } + + pud = pud_offset(p4d, end_address); if (pud_none(*pud)) { end_address += PUD_SIZE; continue; @@ -430,6 +441,7 @@ int __init create_mem_map_page_table(u64 start, u64 end, void *arg) struct page *map_start, *map_end; int node; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -444,12 +456,20 @@ int __init create_mem_map_page_table(u64 start, u64 end, void *arg) for (address = start_page; address < end_page; address += PAGE_SIZE) { pgd = pgd_offset_k(address); if (pgd_none(*pgd)) { + p4d = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); + if (!p4d) + goto err_alloc; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, address); + + if (p4d_none(*p4d)) { pud = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); if (!pud) goto err_alloc; - pgd_populate(&init_mm, pgd, pud); + p4d_populate(&init_mm, p4d, pud); } - pud = pud_offset(pgd, address); + 
pud = pud_offset(p4d, address); if (pud_none(*pud)) { pmd = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); -- cgit v1.2.3 From 9f4e70379f609d3bcc9d67eafecda75a495ffb4b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:46:35 -0700 Subject: nios2: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate and remove usage of __ARCH_USE_5LEVEL_HACK. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-7-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/nios2/include/asm/pgtable.h | 3 +-- arch/nios2/mm/fault.c | 9 +++++++-- arch/nios2/mm/ioremap.c | 6 +++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index f98b7f4519ba..47a1a3ea5734 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -22,7 +22,6 @@ #include #include -#define __ARCH_USE_5LEVEL_HACK #include #define FIRST_USER_ADDRESS 0UL @@ -100,7 +99,7 @@ extern pte_t invalid_pte_table[PAGE_SIZE/sizeof(pte_t)]; */ static inline void set_pmd(pmd_t *pmdptr, pmd_t pmdval) { - pmdptr->pud.pgd.pgd = pmdval.pud.pgd.pgd; + *pmdptr = pmdval; } /* to find an entry in a page-table-directory */ diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index ec9d8a9c426f..964eac1a21d0 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -242,6 +242,7 @@ vmalloc_fault: */ int offset = pgd_index(address); pgd_t *pgd, 
*pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; @@ -253,8 +254,12 @@ vmalloc_fault: goto no_context; set_pgd(pgd, *pgd_k); - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + goto no_context; + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) goto no_context; pmd = pmd_offset(pud, address); diff --git a/arch/nios2/mm/ioremap.c b/arch/nios2/mm/ioremap.c index 819bdfcc2e71..fe821efb9a99 100644 --- a/arch/nios2/mm/ioremap.c +++ b/arch/nios2/mm/ioremap.c @@ -86,11 +86,15 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, if (address >= end) BUG(); do { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; error = -ENOMEM; - pud = pud_alloc(&init_mm, dir, address); + p4d = p4d_alloc(&init_mm, dir, address); + if (!p4d) + break; + pud = pud_alloc(&init_mm, p4d, address); if (!pud) break; pmd = pmd_alloc(&init_mm, pud, address); -- cgit v1.2.3 From b187fb7fca9cc420d111b2e058b69fae5eea18bc Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:46:39 -0700 Subject: openrisc: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate and remove usage of __ARCH_USE_5LEVEL_HACK. 
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-8-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/openrisc/include/asm/pgtable.h | 1 - arch/openrisc/mm/fault.c | 10 ++++++++-- arch/openrisc/mm/init.c | 4 +++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 7f3fb9ceb083..219979e57790 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -21,7 +21,6 @@ #ifndef __ASM_OPENRISC_PGTABLE_H #define __ASM_OPENRISC_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include #ifndef __ASSEMBLY__ diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 8af1cc78c4fb..6e0a11ac4c00 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -295,6 +295,7 @@ vmalloc_fault: int offset = pgd_index(address); pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; @@ -321,8 +322,13 @@ vmalloc_fault: * it exists. 
*/ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + goto no_context; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) goto no_context; diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index f94fe6d3f499..3bcdc1c26b23 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -68,6 +68,7 @@ static void __init map_ram(void) unsigned long v, p, e; pgprot_t prot; pgd_t *pge; + p4d_t *p4e; pud_t *pue; pmd_t *pme; pte_t *pte; @@ -87,7 +88,8 @@ static void __init map_ram(void) while (p < e) { int j; - pue = pud_offset(pge, v); + p4e = p4d_offset(pge, v); + pue = pud_offset(p4e, v); pme = pmd_offset(pue, v); if ((u32) pue != (u32) pge || (u32) pme != (u32) pge) { -- cgit v1.2.3 From 2fb4706057bcf8261b3b0521ec7a62b54b82ce48 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:46:44 -0700 Subject: powerpc: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h. 
[rppt@linux.ibm.com: powerpc/xmon: drop unused pgdir variable in show_pte() function] Link: http://lkml.kernel.org/r/20200519181454.GI1059226@linux.ibm.com [rppt@linux.ibm.com; build fix] Link: http://lkml.kernel.org/r/20200423141845.GI13521@linux.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Christophe Leroy # 8xx and 83xx Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-9-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/32/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/hash.h | 4 +- arch/powerpc/include/asm/book3s/64/pgalloc.h | 4 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 60 ++++++++++++++----------- arch/powerpc/include/asm/book3s/64/radix.h | 6 +-- arch/powerpc/include/asm/nohash/32/pgtable.h | 1 - arch/powerpc/include/asm/nohash/64/pgalloc.h | 2 +- arch/powerpc/include/asm/nohash/64/pgtable-4k.h | 32 ++++++------- arch/powerpc/include/asm/nohash/64/pgtable.h | 6 +-- arch/powerpc/include/asm/pgtable.h | 10 ++--- arch/powerpc/kvm/book3s_64_mmu_radix.c | 32 +++++++------ arch/powerpc/lib/code-patching.c | 7 ++- arch/powerpc/mm/book3s64/hash_pgtable.c | 4 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 26 +++++++---- arch/powerpc/mm/book3s64/subpage_prot.c | 6 ++- arch/powerpc/mm/hugetlbpage.c | 28 +++++++----- arch/powerpc/mm/kasan/kasan_init_32.c | 2 +- arch/powerpc/mm/nohash/book3e_pgtable.c | 15 ++++--- arch/powerpc/mm/pgtable.c | 30 ++++++++----- arch/powerpc/mm/pgtable_64.c | 10 ++--- arch/powerpc/mm/ptdump/hashpagetable.c | 20 +++++++-- arch/powerpc/mm/ptdump/ptdump.c | 12 ++--- 
arch/powerpc/xmon/xmon.c | 27 ++++++----- 23 files changed, 200 insertions(+), 145 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 7549393c4c43..6052b72216a6 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -2,7 +2,6 @@ #ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H #define _ASM_POWERPC_BOOK3S_32_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include #include diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 6fc4520092c7..73ad038ed10b 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -134,9 +134,9 @@ static inline int get_region_id(unsigned long ea) #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) -static inline int hash__pgd_bad(pgd_t pgd) +static inline int hash__p4d_bad(p4d_t p4d) { - return (pgd_val(pgd) == 0); + return (p4d_val(p4d) == 0); } #ifdef CONFIG_STRICT_KERNEL_RWX extern void hash__mark_rodata_ro(void); diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index a41e91bd0580..69c5b051734f 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -85,9 +85,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud) { - *pgd = __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS); + *pgd = __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 
d6438659926c..87168eb9490c 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ -#include +#include #ifndef __ASSEMBLY__ #include @@ -251,7 +251,7 @@ extern unsigned long __pmd_frag_size_shift; /* Bits to mask out from a PUD to get to the PMD page */ #define PUD_MASKED_BITS 0xc0000000000000ffUL /* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0xc0000000000000ffUL +#define P4D_MASKED_BITS 0xc0000000000000ffUL /* * Used as an indicator for rcu callback functions @@ -949,54 +949,60 @@ static inline bool pud_access_permitted(pud_t pud, bool write) return pte_access_permitted(pud_pte(pud), write); } -#define pgd_write(pgd) pte_write(pgd_pte(pgd)) +#define __p4d_raw(x) ((p4d_t) { __pgd_raw(x) }) +static inline __be64 p4d_raw(p4d_t x) +{ + return pgd_raw(x.pgd); +} + +#define p4d_write(p4d) pte_write(p4d_pte(p4d)) -static inline void pgd_clear(pgd_t *pgdp) +static inline void p4d_clear(p4d_t *p4dp) { - *pgdp = __pgd(0); + *p4dp = __p4d(0); } -static inline int pgd_none(pgd_t pgd) +static inline int p4d_none(p4d_t p4d) { - return !pgd_raw(pgd); + return !p4d_raw(p4d); } -static inline int pgd_present(pgd_t pgd) +static inline int p4d_present(p4d_t p4d) { - return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT)); + return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PRESENT)); } -static inline pte_t pgd_pte(pgd_t pgd) +static inline pte_t p4d_pte(p4d_t p4d) { - return __pte_raw(pgd_raw(pgd)); + return __pte_raw(p4d_raw(p4d)); } -static inline pgd_t pte_pgd(pte_t pte) +static inline p4d_t pte_p4d(pte_t pte) { - return __pgd_raw(pte_raw(pte)); + return __p4d_raw(pte_raw(pte)); } -static inline int pgd_bad(pgd_t pgd) +static inline int p4d_bad(p4d_t p4d) { if (radix_enabled()) - return radix__pgd_bad(pgd); - return hash__pgd_bad(pgd); + return radix__p4d_bad(p4d); + return hash__p4d_bad(p4d); } -#define 
pgd_access_permitted pgd_access_permitted -static inline bool pgd_access_permitted(pgd_t pgd, bool write) +#define p4d_access_permitted p4d_access_permitted +static inline bool p4d_access_permitted(p4d_t p4d, bool write) { - return pte_access_permitted(pgd_pte(pgd), write); + return pte_access_permitted(p4d_pte(p4d), write); } -extern struct page *pgd_page(pgd_t pgd); +extern struct page *p4d_page(p4d_t p4d); /* Pointers in the page table tree are physical addresses */ #define __pgtable_ptr_val(ptr) __pa(ptr) #define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) #define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) -#define pgd_page_vaddr(pgd) __va(pgd_val(pgd) & ~PGD_MASKED_BITS) +#define p4d_page_vaddr(p4d) __va(p4d_val(p4d) & ~P4D_MASKED_BITS) #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1)) #define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) @@ -1010,8 +1016,8 @@ extern struct page *pgd_page(pgd_t pgd); #define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -#define pud_offset(pgdp, addr) \ - (((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr)) +#define pud_offset(p4dp, addr) \ + (((pud_t *) p4d_page_vaddr(*(p4dp))) + pud_index(addr)) #define pmd_offset(pudp,addr) \ (((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr)) #define pte_offset_kernel(dir,addr) \ @@ -1366,11 +1372,11 @@ static inline bool pud_is_leaf(pud_t pud) return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); } -#define pgd_is_leaf pgd_is_leaf -#define pgd_leaf pgd_is_leaf -static inline bool pgd_is_leaf(pgd_t pgd) +#define p4d_is_leaf p4d_is_leaf +#define p4d_leaf p4d_is_leaf +static inline bool p4d_is_leaf(p4d_t p4d) { - return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE)); + return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PTE)); } #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 08c222d5b764..0cba794c4fb8 100644 --- 
a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -30,7 +30,7 @@ /* Don't have anything in the reserved bits and leaf bits */ #define RADIX_PMD_BAD_BITS 0x60000000000000e0UL #define RADIX_PUD_BAD_BITS 0x60000000000000e0UL -#define RADIX_PGD_BAD_BITS 0x60000000000000e0UL +#define RADIX_P4D_BAD_BITS 0x60000000000000e0UL #define RADIX_PMD_SHIFT (PAGE_SHIFT + RADIX_PTE_INDEX_SIZE) #define RADIX_PUD_SHIFT (RADIX_PMD_SHIFT + RADIX_PMD_INDEX_SIZE) @@ -227,9 +227,9 @@ static inline int radix__pud_bad(pud_t pud) } -static inline int radix__pgd_bad(pgd_t pgd) +static inline int radix__p4d_bad(p4d_t p4d) { - return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS); + return !!(p4d_val(p4d) & RADIX_P4D_BAD_BITS); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index b04ba257fddb..3d0bc99dd520 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -2,7 +2,6 @@ #ifndef _ASM_POWERPC_NOHASH_32_PGTABLE_H #define _ASM_POWERPC_NOHASH_32_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index b9534a793293..668aee6017e7 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -15,7 +15,7 @@ struct vmemmap_backing { }; extern struct vmemmap_backing *vmemmap_list; -#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, (unsigned long)PUD) +#define p4d_populate(MM, P4D, PUD) p4d_set(P4D, (unsigned long)PUD) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h index c40ec32b8194..81b1c54e3cf1 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h +++ 
b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H #define _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H -#include +#include /* * Entries per page directory level. The PTE level must use a 64b record @@ -45,41 +45,41 @@ #define PMD_MASKED_BITS 0 /* Bits to mask out from a PUD to get to the PMD page */ #define PUD_MASKED_BITS 0 -/* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0 +/* Bits to mask out from a P4D to get to the PUD page */ +#define P4D_MASKED_BITS 0 /* * 4-level page tables related bits */ -#define pgd_none(pgd) (!pgd_val(pgd)) -#define pgd_bad(pgd) (pgd_val(pgd) == 0) -#define pgd_present(pgd) (pgd_val(pgd) != 0) -#define pgd_page_vaddr(pgd) (pgd_val(pgd) & ~PGD_MASKED_BITS) +#define p4d_none(p4d) (!p4d_val(p4d)) +#define p4d_bad(p4d) (p4d_val(p4d) == 0) +#define p4d_present(p4d) (p4d_val(p4d) != 0) +#define p4d_page_vaddr(p4d) (p4d_val(p4d) & ~P4D_MASKED_BITS) #ifndef __ASSEMBLY__ -static inline void pgd_clear(pgd_t *pgdp) +static inline void p4d_clear(p4d_t *p4dp) { - *pgdp = __pgd(0); + *p4dp = __p4d(0); } -static inline pte_t pgd_pte(pgd_t pgd) +static inline pte_t p4d_pte(p4d_t p4d) { - return __pte(pgd_val(pgd)); + return __pte(p4d_val(p4d)); } -static inline pgd_t pte_pgd(pte_t pte) +static inline p4d_t pte_p4d(pte_t pte) { - return __pgd(pte_val(pte)); + return __p4d(pte_val(pte)); } -extern struct page *pgd_page(pgd_t pgd); +extern struct page *p4d_page(p4d_t p4d); #endif /* !__ASSEMBLY__ */ -#define pud_offset(pgdp, addr) \ - (((pud_t *) pgd_page_vaddr(*(pgdp))) + \ +#define pud_offset(p4dp, addr) \ + (((pud_t *) p4d_page_vaddr(*(p4dp))) + \ (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) #define pud_ERROR(e) \ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 9a33b8bd842d..b360f262b9c6 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -175,11 
+175,11 @@ static inline pud_t pte_pud(pte_t pte) return __pud(pte_val(pte)); } #define pud_write(pud) pte_write(pud_pte(pud)) -#define pgd_write(pgd) pte_write(pgd_pte(pgd)) +#define p4d_write(pgd) pte_write(p4d_pte(p4d)) -static inline void pgd_set(pgd_t *pgdp, unsigned long val) +static inline void p4d_set(p4d_t *p4dp, unsigned long val) { - *pgdp = __pgd(val); + *p4dp = __p4d(val); } /* diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index b1f1d5339735..bad9b324559d 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -44,12 +44,12 @@ struct mm_struct; #ifdef CONFIG_PPC32 static inline pmd_t *pmd_ptr(struct mm_struct *mm, unsigned long va) { - return pmd_offset(pud_offset(pgd_offset(mm, va), va), va); + return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_ptr_k(unsigned long va) { - return pmd_offset(pud_offset(pgd_offset_k(va), va), va); + return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) @@ -158,9 +158,9 @@ static inline bool pud_is_leaf(pud_t pud) } #endif -#ifndef pgd_is_leaf -#define pgd_is_leaf pgd_is_leaf -static inline bool pgd_is_leaf(pgd_t pgd) +#ifndef p4d_is_leaf +#define p4d_is_leaf p4d_is_leaf +static inline bool p4d_is_leaf(p4d_t p4d) { return false; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index aa12cd4078b3..d605ed0bb2e7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -499,13 +499,14 @@ void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid) unsigned long ig; for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { + p4d_t *p4d = p4d_offset(pgd, 0); pud_t *pud; - if (!pgd_present(*pgd)) + if (!p4d_present(*p4d)) continue; - pud = pud_offset(pgd, 0); + pud = pud_offset(p4d, 0); kvmppc_unmap_free_pud(kvm, pud, lpid); - 
pgd_clear(pgd); + p4d_clear(p4d); } } @@ -566,6 +567,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, unsigned long *rmapp, struct rmap_nested **n_rmap) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud, *new_pud = NULL; pmd_t *pmd, *new_pmd = NULL; pte_t *ptep, *new_ptep = NULL; @@ -573,9 +575,11 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, /* Traverse the guest's 2nd-level tree, allocate new levels needed */ pgd = pgtable + pgd_index(gpa); + p4d = p4d_offset(pgd, gpa); + pud = NULL; - if (pgd_present(*pgd)) - pud = pud_offset(pgd, gpa); + if (p4d_present(*p4d)) + pud = pud_offset(p4d, gpa); else new_pud = pud_alloc_one(kvm->mm, gpa); @@ -596,13 +600,13 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, /* Now traverse again under the lock and change the tree */ ret = -ENOMEM; - if (pgd_none(*pgd)) { + if (p4d_none(*p4d)) { if (!new_pud) goto out_unlock; - pgd_populate(kvm->mm, pgd, new_pud); + p4d_populate(kvm->mm, p4d, new_pud); new_pud = NULL; } - pud = pud_offset(pgd, gpa); + pud = pud_offset(p4d, gpa); if (pud_is_leaf(*pud)) { unsigned long hgpa = gpa & PUD_MASK; @@ -1220,7 +1224,8 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf, unsigned long gpa; pgd_t *pgt; struct kvm_nested_guest *nested; - pgd_t pgd, *pgdp; + pgd_t *pgdp; + p4d_t p4d, *p4dp; pud_t pud, *pudp; pmd_t pmd, *pmdp; pte_t *ptep; @@ -1293,13 +1298,14 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf, } pgdp = pgt + pgd_index(gpa); - pgd = READ_ONCE(*pgdp); - if (!(pgd_val(pgd) & _PAGE_PRESENT)) { - gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE; + p4dp = p4d_offset(pgdp, gpa); + p4d = READ_ONCE(*p4dp); + if (!(p4d_val(p4d) & _PAGE_PRESENT)) { + gpa = (gpa & P4D_MASK) + P4D_SIZE; continue; } - pudp = pud_offset(&pgd, gpa); + pudp = pud_offset(&p4d, gpa); pud = READ_ONCE(*pudp); if (!(pud_val(pud) & _PAGE_PRESENT)) { gpa = (gpa & PUD_MASK) + PUD_SIZE; diff --git a/arch/powerpc/lib/code-patching.c 
b/arch/powerpc/lib/code-patching.c index 3345f039a876..7a59f6863cec 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -107,13 +107,18 @@ static inline int unmap_patch_area(unsigned long addr) pte_t *ptep; pmd_t *pmdp; pud_t *pudp; + p4d_t *p4dp; pgd_t *pgdp; pgdp = pgd_offset_k(addr); if (unlikely(!pgdp)) return -EINVAL; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (unlikely(!p4dp)) + return -EINVAL; + + pudp = pud_offset(p4dp, addr); if (unlikely(!pudp)) return -EINVAL; diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 64733b9cb20a..9cd15937e88a 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -148,6 +148,7 @@ void hash__vmemmap_remove_mapping(unsigned long start, int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -155,7 +156,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); if (slab_is_available()) { pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; pmdp = pmd_alloc(&init_mm, pudp, ea); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 8f9edf07063a..97891ca0d428 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -65,17 +65,19 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa, { unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; pgdp = pgd_offset_k(ea); - if (pgd_none(*pgdp)) { + p4dp = p4d_offset(pgdp, ea); + if (p4d_none(*p4dp)) { pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, region_start, region_end); - pgd_populate(&init_mm, 
pgdp, pudp); + p4d_populate(&init_mm, p4dp, pudp); } - pudp = pud_offset(pgdp, ea); + pudp = pud_offset(p4dp, ea); if (map_page_size == PUD_SIZE) { ptep = (pte_t *)pudp; goto set_the_pte; @@ -115,6 +117,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, { unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -137,7 +140,8 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, * boot. */ pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; if (map_page_size == PUD_SIZE) { @@ -174,6 +178,7 @@ void radix__change_memory_range(unsigned long start, unsigned long end, { unsigned long idx; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -186,7 +191,8 @@ void radix__change_memory_range(unsigned long start, unsigned long end, for (idx = start; idx < end; idx += PAGE_SIZE) { pgdp = pgd_offset_k(idx); - pudp = pud_alloc(&init_mm, pgdp, idx); + p4dp = p4d_offset(pgdp, idx); + pudp = pud_alloc(&init_mm, p4dp, idx); if (!pudp) continue; if (pud_is_leaf(*pudp)) { @@ -850,6 +856,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) unsigned long addr, next; pud_t *pud_base; pgd_t *pgd; + p4d_t *p4d; spin_lock(&init_mm.page_table_lock); @@ -857,15 +864,16 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); - if (!pgd_present(*pgd)) + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) continue; - if (pgd_is_leaf(*pgd)) { - split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); + if (p4d_is_leaf(*p4d)) { + split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d); continue; } - pud_base = (pud_t *)pgd_page_vaddr(*pgd); + pud_base = (pud_t *)p4d_page_vaddr(*p4d); remove_pud_table(pud_base, addr, next); } diff --git 
a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 2ef24a53f4c9..25a0c044bd93 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -54,15 +54,17 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, int npages) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) return; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (pud_none(*pud)) return; pmd = pmd_offset(pud, addr); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 4d5ed1093615..f122d0f2c295 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -119,6 +119,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pg; + p4d_t *p4; pud_t *pu; pmd_t *pm; hugepd_t *hpdp = NULL; @@ -128,20 +129,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz addr &= ~(sz-1); pg = pgd_offset(mm, addr); + p4 = p4d_offset(pg, addr); #ifdef CONFIG_PPC_BOOK3S_64 if (pshift == PGDIR_SHIFT) /* 16GB huge page */ - return (pte_t *) pg; + return (pte_t *) p4; else if (pshift > PUD_SHIFT) { /* * We need to use hugepd table */ ptl = &mm->page_table_lock; - hpdp = (hugepd_t *)pg; + hpdp = (hugepd_t *)p4; } else { pdshift = PUD_SHIFT; - pu = pud_alloc(mm, pg, addr); + pu = pud_alloc(mm, p4, addr); if (!pu) return NULL; if (pshift == PUD_SHIFT) @@ -166,10 +168,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz #else if (pshift >= PGDIR_SHIFT) { ptl = &mm->page_table_lock; - hpdp = (hugepd_t *)pg; + hpdp = (hugepd_t *)p4; } else { pdshift = PUD_SHIFT; - pu = pud_alloc(mm, pg, addr); + pu = pud_alloc(mm, p4, addr); if (!pu) return NULL; if (pshift >= PUD_SHIFT) { 
@@ -390,7 +392,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, mm_dec_nr_pmds(tlb->mm); } -static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, +static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -400,7 +402,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, start = addr; do { - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); next = pud_addr_end(addr, end); if (!is_hugepd(__hugepd(pud_val(*pud)))) { if (pud_none_or_clear_bad(pud)) @@ -435,8 +437,8 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, if (end - 1 > ceiling - 1) return; - pud = pud_offset(pgd, start); - pgd_clear(pgd); + pud = pud_offset(p4d, start); + p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } @@ -449,6 +451,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; + p4d_t *p4d; unsigned long next; /* @@ -471,10 +474,11 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, do { next = pgd_addr_end(addr, end); pgd = pgd_offset(tlb->mm, addr); + p4d = p4d_offset(pgd, addr); if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { - if (pgd_none_or_clear_bad(pgd)) + if (p4d_none_or_clear_bad(p4d)) continue; - hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); + hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling); } else { unsigned long more; /* @@ -487,7 +491,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, if (more > next) next = more; - free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, + free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT, addr, next, floor, ceiling); } } while (addr = next, addr != end); diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index cbcad369fcb2..c99aa8cbaac5 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ 
b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -121,7 +121,7 @@ static void __init kasan_unmap_early_shadow_vmalloc(void) phys_addr_t pa = __pa(kasan_early_shadow_page); for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { - pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + pmd_t *pmd = pmd_ptr_k(k_cur); pte_t *ptep = pte_offset_kernel(pmd, k_cur); if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index 4637fdd469cf..77884e24281d 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -73,6 +73,7 @@ static void __init *early_alloc_pgtable(unsigned long size) int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -80,7 +81,8 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); if (slab_is_available()) { pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; pmdp = pmd_alloc(&init_mm, pudp, ea); @@ -91,13 +93,12 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) return -ENOMEM; } else { pgdp = pgd_offset_k(ea); -#ifndef __PAGETABLE_PUD_FOLDED - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - pgd_populate(&init_mm, pgdp, pudp); + p4dp = p4d_offset(pgdp, ea); + if (p4d_none(*p4dp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pmdp); } -#endif /* !__PAGETABLE_PUD_FOLDED */ - pudp = pud_offset(pgdp, ea); + pudp = pud_offset(p4dp, ea); if (pud_none(*pudp)) { pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); pud_populate(&init_mm, pudp, pmdp); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index e3759b69f81b..c2499271f6c1 100644 --- 
a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -265,6 +265,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, void assert_pte_locked(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -272,7 +273,9 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) return; pgd = mm->pgd + pgd_index(addr); BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, addr); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, addr); /* @@ -312,12 +315,13 @@ EXPORT_SYMBOL_GPL(vmalloc_to_phys); pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, bool *is_thp, unsigned *hpage_shift) { - pgd_t pgd, *pgdp; + pgd_t *pgdp; + p4d_t p4d, *p4dp; pud_t pud, *pudp; pmd_t pmd, *pmdp; pte_t *ret_pte; hugepd_t *hpdp = NULL; - unsigned pdshift = PGDIR_SHIFT; + unsigned pdshift; if (hpage_shift) *hpage_shift = 0; @@ -325,24 +329,28 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (is_thp) *is_thp = false; - pgdp = pgdir + pgd_index(ea); - pgd = READ_ONCE(*pgdp); /* * Always operate on the local stack value. This make sure the * value don't get updated by a parallel THP split/collapse, * page fault or a page unmap. The return pte_t * is still not * stable. So should be checked there for above conditions. + * Top level is an exception because it is folded into p4d. 
*/ - if (pgd_none(pgd)) + pgdp = pgdir + pgd_index(ea); + p4dp = p4d_offset(pgdp, ea); + p4d = READ_ONCE(*p4dp); + pdshift = P4D_SHIFT; + + if (p4d_none(p4d)) return NULL; - if (pgd_is_leaf(pgd)) { - ret_pte = (pte_t *)pgdp; + if (p4d_is_leaf(p4d)) { + ret_pte = (pte_t *)p4dp; goto out; } - if (is_hugepd(__hugepd(pgd_val(pgd)))) { - hpdp = (hugepd_t *)&pgd; + if (is_hugepd(__hugepd(p4d_val(p4d)))) { + hpdp = (hugepd_t *)&p4d; goto out_huge; } @@ -352,7 +360,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, * irq disabled */ pdshift = PUD_SHIFT; - pudp = pud_offset(&pgd, ea); + pudp = pud_offset(&p4d, ea); pud = READ_ONCE(*pudp); if (pud_none(pud)) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index e78832dce7bb..1f86a88fd4bb 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -101,13 +101,13 @@ EXPORT_SYMBOL(__pte_frag_size_shift); #ifndef __PAGETABLE_PUD_FOLDED /* 4 level page table */ -struct page *pgd_page(pgd_t pgd) +struct page *p4d_page(p4d_t p4d) { - if (pgd_is_leaf(pgd)) { - VM_WARN_ON(!pgd_huge(pgd)); - return pte_page(pgd_pte(pgd)); + if (p4d_is_leaf(p4d)) { + VM_WARN_ON(!p4d_huge(p4d)); + return pte_page(p4d_pte(p4d)); } - return virt_to_page(pgd_page_vaddr(pgd)); + return virt_to_page(p4d_page_vaddr(p4d)); } #endif diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index b6ed9578382f..6aaeb1eb3b9c 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -417,9 +417,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) } } -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0); + pud_t *pud = pud_offset(p4d, 0); unsigned long addr; unsigned int i; @@ -431,6 +431,20 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) } } 
+static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start) +{ + p4d_t *p4d = p4d_offset(pgd, 0); + unsigned long addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + addr = start + i * P4D_SIZE; + if (!p4d_none(*p4d)) + /* p4d exists */ + walk_pud(st, p4d, addr); + } +} + static void walk_pagetables(struct pg_state *st) { pgd_t *pgd = pgd_offset_k(0UL); @@ -445,7 +459,7 @@ static void walk_pagetables(struct pg_state *st) addr = KERN_VIRT_START + i * PGDIR_SIZE; if (!pgd_none(*pgd)) /* pgd exists */ - walk_pud(st, pgd, addr); + walk_p4d(st, pgd, addr); } } diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index d92bb8ea229c..b3fead0230c1 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -277,9 +277,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) } } -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0); + pud_t *pud = pud_offset(p4d, 0); unsigned long addr; unsigned int i; @@ -304,11 +304,13 @@ static void walk_pagetables(struct pg_state *st) * the hash pagetable. 
*/ for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { - if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd)) + p4d_t *p4d = p4d_offset(pgd, 0); + + if (!p4d_none(*p4d) && !p4d_is_leaf(*p4d)) /* pgd exists */ - walk_pud(st, pgd, addr); + walk_pud(st, p4d, addr); else - note_page(st, addr, 1, pgd_val(*pgd)); + note_page(st, addr, 1, p4d_val(*p4d)); } } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 7af840c0fc93..89415b84c597 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3135,7 +3135,8 @@ static void show_pte(unsigned long addr) unsigned long tskv = 0; struct task_struct *tsk = NULL; struct mm_struct *mm; - pgd_t *pgdp, *pgdir; + pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -3159,28 +3160,26 @@ static void show_pte(unsigned long addr) catch_memory_errors = 1; sync(); - if (mm == &init_mm) { + if (mm == &init_mm) pgdp = pgd_offset_k(addr); - pgdir = pgd_offset_k(0); - } else { + else pgdp = pgd_offset(mm, addr); - pgdir = pgd_offset(mm, 0); - } - if (pgd_none(*pgdp)) { - printf("no linux page table for address\n"); + p4dp = p4d_offset(pgdp, addr); + + if (p4d_none(*p4dp)) { + printf("No valid P4D\n"); return; } - printf("pgd @ 0x%px\n", pgdir); - - if (pgd_is_leaf(*pgdp)) { - format_pte(pgdp, pgd_val(*pgdp)); + if (p4d_is_leaf(*p4dp)) { + format_pte(p4dp, p4d_val(*p4dp)); return; } - printf("pgdp @ 0x%px = 0x%016lx\n", pgdp, pgd_val(*pgdp)); - pudp = pud_offset(pgdp, addr); + printf("p4dp @ 0x%px = 0x%016lx\n", p4dp, p4d_val(*p4dp)); + + pudp = pud_offset(p4dp, addr); if (pud_none(*pudp)) { printf("No valid PUD\n"); -- cgit v1.2.3 From eaabf98b0932a540f3c772e4243e140ec239302c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 4 Jun 2020 16:46:48 -0700 Subject: sh: fault: modernize printing of kernel messages - Convert from printk() to pr_*(), - Add missing continuations, - Use "%llx" to format u64, - Join multiple prints in show_fault_oops() into a single print. 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-10-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/sh/mm/fault.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 5f23d7907597..ffcdf8fbc303 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -47,10 +47,10 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) pgd = swapper_pg_dir; } - printk(KERN_ALERT "pgd = %p\n", pgd); + pr_alert("pgd = %p\n", pgd); pgd += pgd_index(addr); - printk(KERN_ALERT "[%08lx] *pgd=%0*Lx", addr, - (u32)(sizeof(*pgd) * 2), (u64)pgd_val(*pgd)); + pr_alert("[%08lx] *pgd=%0*llx", addr, (u32)(sizeof(*pgd) * 2), + (u64)pgd_val(*pgd)); do { pud_t *pud; @@ -61,33 +61,33 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) break; if (pgd_bad(*pgd)) { - printk("(bad)"); + pr_cont("(bad)"); break; } pud = pud_offset(pgd, addr); if (PTRS_PER_PUD != 1) - printk(", *pud=%0*Lx", (u32)(sizeof(*pud) * 2), - (u64)pud_val(*pud)); + pr_cont(", *pud=%0*llx", (u32)(sizeof(*pud) * 2), + (u64)pud_val(*pud)); if (pud_none(*pud)) break; if (pud_bad(*pud)) { - printk("(bad)"); + pr_cont("(bad)"); break; } pmd = pmd_offset(pud, addr); if (PTRS_PER_PMD != 1) - printk(", *pmd=%0*Lx", (u32)(sizeof(*pmd) * 2), - (u64)pmd_val(*pmd)); + pr_cont(", *pmd=%0*llx", (u32)(sizeof(*pmd) * 2), + (u64)pmd_val(*pmd)); if (pmd_none(*pmd)) break; if (pmd_bad(*pmd)) { - printk("(bad)"); + pr_cont("(bad)"); 
break; } @@ -96,11 +96,11 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) break; pte = pte_offset_kernel(pmd, addr); - printk(", *pte=%0*Lx", (u32)(sizeof(*pte) * 2), - (u64)pte_val(*pte)); + pr_cont(", *pte=%0*llx", (u32)(sizeof(*pte) * 2), + (u64)pte_val(*pte)); } while (0); - printk("\n"); + pr_cont("\n"); } static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) @@ -188,14 +188,12 @@ show_fault_oops(struct pt_regs *regs, unsigned long address) if (!oops_may_print()) return; - printk(KERN_ALERT "BUG: unable to handle kernel "); - if (address < PAGE_SIZE) - printk(KERN_CONT "NULL pointer dereference"); - else - printk(KERN_CONT "paging request"); - - printk(KERN_CONT " at %08lx\n", address); printk(KERN_ALERT "PC:"); + pr_alert("BUG: unable to handle kernel %s at %08lx\n", + address < PAGE_SIZE ? "NULL pointer dereference" + : "paging request", + address); + pr_alert("PC:"); printk_address(regs->pc, 1); show_pte(NULL, address); -- cgit v1.2.3 From a194a62650de5f74795a8edf0db5d8fdf1218aeb Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:46:52 -0700 Subject: sh: drop __pXd_offset() macros that duplicate pXd_index() ones The __pXd_offset() macros are identical to the pXd_index() macros and there is no point to keep both of them. All architectures define and use pXd_index() so let's keep only those to make sh consistent with the rest of the kernel.
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-11-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/sh/include/asm/pgtable_32.h | 5 ++--- arch/sh/include/asm/pgtable_64.h | 5 ++--- arch/sh/mm/init.c | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index 29274f0e428e..4acce5f2cbf9 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -407,13 +407,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) /* to find an entry in a page-table-directory. */ #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -#define __pgd_offset(address) pgd_index(address) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) -#define __pud_offset(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#define __pmd_offset(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) /* Find an entry in the third-level page table.. 
*/ #define pte_index(address) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff --git a/arch/sh/include/asm/pgtable_64.h b/arch/sh/include/asm/pgtable_64.h index 1778bc5971e7..27cc282ec6c0 100644 --- a/arch/sh/include/asm/pgtable_64.h +++ b/arch/sh/include/asm/pgtable_64.h @@ -46,14 +46,13 @@ static __inline__ void set_pte(pte_t *pteptr, pte_t pteval) /* To find an entry in a generic PGD. */ #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -#define __pgd_offset(address) pgd_index(address) #define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) /* To find an entry in a kernel PGD. */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) -#define __pud_offset(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#define __pmd_offset(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +/* #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) */ /* * PMD level access routines. Same notes as above. diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 628f461b8993..ddeeaa567600 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -172,9 +172,9 @@ void __init page_table_range_init(unsigned long start, unsigned long end, unsigned long vaddr; vaddr = start; - i = __pgd_offset(vaddr); - j = __pud_offset(vaddr); - k = __pmd_offset(vaddr); + i = pgd_index(vaddr); + j = pud_index(vaddr); + k = pmd_index(vaddr); pgd = pgd_base + i; for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) { -- cgit v1.2.3 From 874e2cc18972d30ecd4d572d1286fe9b594d309c Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:46:56 -0700 Subject: sh: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate and remove usage of __ARCH_USE_5LEVEL_HACK. 
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-12-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/sh/include/asm/pgtable-2level.h | 1 - arch/sh/include/asm/pgtable-3level.h | 1 - arch/sh/kernel/io_trapped.c | 7 ++++++- arch/sh/mm/cache-sh4.c | 4 +++- arch/sh/mm/cache-sh5.c | 7 ++++++- arch/sh/mm/fault.c | 26 +++++++++++++++++++++++--- arch/sh/mm/hugetlbpage.c | 28 ++++++++++++++++++---------- arch/sh/mm/init.c | 9 ++++++++- arch/sh/mm/kmap.c | 2 +- arch/sh/mm/tlbex_32.c | 6 +++++- arch/sh/mm/tlbex_64.c | 7 ++++++- 11 files changed, 76 insertions(+), 22 deletions(-) diff --git a/arch/sh/include/asm/pgtable-2level.h b/arch/sh/include/asm/pgtable-2level.h index bf1eb51c3ee5..08bff93927ff 100644 --- a/arch/sh/include/asm/pgtable-2level.h +++ b/arch/sh/include/asm/pgtable-2level.h @@ -2,7 +2,6 @@ #ifndef __ASM_SH_PGTABLE_2LEVEL_H #define __ASM_SH_PGTABLE_2LEVEL_H -#define __ARCH_USE_5LEVEL_HACK #include /* diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h index 779260b721ca..0f80097e5c9c 100644 --- a/arch/sh/include/asm/pgtable-3level.h +++ b/arch/sh/include/asm/pgtable-3level.h @@ -2,7 +2,6 @@ #ifndef __ASM_SH_PGTABLE_3LEVEL_H #define __ASM_SH_PGTABLE_3LEVEL_H -#define __ARCH_USE_5LEVEL_HACK #include /* diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c index 60c828a2b8a2..037aab2708b7 100644 --- a/arch/sh/kernel/io_trapped.c +++ b/arch/sh/kernel/io_trapped.c @@ -136,6 +136,7 @@ EXPORT_SYMBOL_GPL(match_trapped_io_handler); 
static struct trapped_io *lookup_tiop(unsigned long address) { pgd_t *pgd_k; + p4d_t *p4d_k; pud_t *pud_k; pmd_t *pmd_k; pte_t *pte_k; @@ -145,7 +146,11 @@ static struct trapped_io *lookup_tiop(unsigned long address) if (!pgd_present(*pgd_k)) return NULL; - pud_k = pud_offset(pgd_k, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index eee911422cf9..45943bcb7042 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -209,6 +209,7 @@ static void sh4_flush_cache_page(void *args) unsigned long address, pfn, phys; int map_coherent = 0; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -224,7 +225,8 @@ static void sh4_flush_cache_page(void *args) return; pgd = pgd_offset(vma->vm_mm, address); - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); pte = pte_offset_kernel(pmd, address); diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c index 445b5e69b73c..442a77cc2957 100644 --- a/arch/sh/mm/cache-sh5.c +++ b/arch/sh/mm/cache-sh5.c @@ -383,6 +383,7 @@ static void sh64_dcache_purge_user_pages(struct mm_struct *mm, unsigned long addr, unsigned long end) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -397,7 +398,11 @@ static void sh64_dcache_purge_user_pages(struct mm_struct *mm, if (pgd_bad(*pgd)) return; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d) || p4d_bad(*p4d)) + return; + + pud = pud_offset(p4d, addr); if (pud_none(*pud) || pud_bad(*pud)) return; diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index ffcdf8fbc303..7260a1a7fdca 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -53,6 +53,7 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) (u64)pgd_val(*pgd)); do { + p4d_t *p4d; pud_t 
*pud; pmd_t *pmd; pte_t *pte; @@ -65,7 +66,20 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) break; } - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (PTRS_PER_P4D != 1) + pr_cont(", *p4d=%0*Lx", (u32)(sizeof(*p4d) * 2), + (u64)p4d_val(*p4d)); + + if (p4d_none(*p4d)) + break; + + if (p4d_bad(*p4d)) { + pr_cont("(bad)"); + break; + } + + pud = pud_offset(p4d, addr); if (PTRS_PER_PUD != 1) pr_cont(", *pud=%0*llx", (u32)(sizeof(*pud) * 2), (u64)pud_val(*pud)); @@ -107,6 +121,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -116,8 +131,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) if (!pgd_present(*pgd_k)) return NULL; - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 960deb1f24a1..acd5652a0de3 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -26,17 +26,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); if (pgd) { - pud = pud_alloc(mm, pgd, addr); - if (pud) { - pmd = pmd_alloc(mm, pud, addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (p4d) { + pud = pud_alloc(mm, p4d, addr); + if (pud) { + pmd = pmd_alloc(mm, pud, addr); + if (pmd) + pte = pte_alloc_map(mm, pmd, addr); + } } } @@ -47,17 +51,21 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t 
*pte = NULL; pgd = pgd_offset(mm, addr); if (pgd) { - pud = pud_offset(pgd, addr); - if (pud) { - pmd = pmd_offset(pud, addr); - if (pmd) - pte = pte_offset_map(pmd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d) { + pud = pud_offset(p4d, addr); + if (pud) { + pmd = pmd_offset(pud, addr); + if (pmd) + pte = pte_offset_map(pmd, addr); + } } } diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index ddeeaa567600..a70ba0fdd0b3 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -45,6 +45,7 @@ void __init __weak plat_mem_setup(void) static pte_t *__get_pte_phys(unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -54,7 +55,13 @@ static pte_t *__get_pte_phys(unsigned long addr) return NULL; } - pud = pud_alloc(NULL, pgd, addr); + p4d = p4d_alloc(NULL, pgd, addr); + if (unlikely(!p4d)) { + p4d_ERROR(*p4d); + return NULL; + } + + pud = pud_alloc(NULL, p4d, addr); if (unlikely(!pud)) { pud_ERROR(*pud); return NULL; diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c index 9e6b38b03cf7..0e7039137f5a 100644 --- a/arch/sh/mm/kmap.c +++ b/arch/sh/mm/kmap.c @@ -15,7 +15,7 @@ #include #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)), (vaddr)), vaddr) static pte_t *kmap_coherent_pte; diff --git a/arch/sh/mm/tlbex_32.c b/arch/sh/mm/tlbex_32.c index 382262dc0c4b..1c53868632ee 100644 --- a/arch/sh/mm/tlbex_32.c +++ b/arch/sh/mm/tlbex_32.c @@ -23,6 +23,7 @@ handle_tlbmiss(struct pt_regs *regs, unsigned long error_code, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -42,7 +43,10 @@ handle_tlbmiss(struct pt_regs *regs, unsigned long error_code, pgd = pgd_offset(current->mm, address); } - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none_or_clear_bad(p4d)) + return 1; + pud = pud_offset(p4d, address); if 
(pud_none_or_clear_bad(pud)) return 1; pmd = pmd_offset(pud, address); diff --git a/arch/sh/mm/tlbex_64.c b/arch/sh/mm/tlbex_64.c index 8ff966dd0c74..0d015f7556fa 100644 --- a/arch/sh/mm/tlbex_64.c +++ b/arch/sh/mm/tlbex_64.c @@ -44,6 +44,7 @@ static int handle_tlbmiss(unsigned long long protection_flags, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -58,7 +59,11 @@ static int handle_tlbmiss(unsigned long long protection_flags, pgd = pgd_offset(current->mm, address); } - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || !p4d_present(*p4d)) + return 1; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || !pud_present(*pud)) return 1; -- cgit v1.2.3 From 453668afbf9906cf8d2b927eb9306c8bd108c6c4 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:47:00 -0700 Subject: unicore32: remove __ARCH_USE_5LEVEL_HACK The unicore32 architecture has 2 level page tables and uses asm-generic/pgtable-nopmd.h and explicit casts from pud_t to pgd_t for page table folding. Add p4d walk in the only place that actually unfolds the pud level and remove __ARCH_USE_5LEVEL_HACK.
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-13-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/unicore32/include/asm/pgtable.h | 1 - arch/unicore32/kernel/hibernate.c | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h index 3b8731b3a937..826f49edd94e 100644 --- a/arch/unicore32/include/asm/pgtable.h +++ b/arch/unicore32/include/asm/pgtable.h @@ -9,7 +9,6 @@ #ifndef __UNICORE_PGTABLE_H__ #define __UNICORE_PGTABLE_H__ -#define __ARCH_USE_5LEVEL_HACK #include #include diff --git a/arch/unicore32/kernel/hibernate.c b/arch/unicore32/kernel/hibernate.c index f3812245cc00..ccad051a79b6 100644 --- a/arch/unicore32/kernel/hibernate.c +++ b/arch/unicore32/kernel/hibernate.c @@ -33,9 +33,11 @@ struct swsusp_arch_regs swsusp_arch_regs_cpu0; static pmd_t *resume_one_md_table_init(pgd_t *pgd) { pud_t *pud; + p4d_t *p4d; pmd_t *pmd_table; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; -- cgit v1.2.3 From ee7767430ec27ac10b3d476e8a7735ca850fbd7d Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:47:04 -0700 Subject: asm-generic: remove pgtable-nop4d-hack.h No architecture defines __ARCH_USE_5LEVEL_HACK and therefore pgtable-nop4d-hack.h will never actually be included. Remove it.
Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-14-rppt@kernel.org Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable-nop4d-hack.h | 64 -------------------------------- include/asm-generic/pgtable-nopud.h | 4 -- 2 files changed, 68 deletions(-) delete mode 100644 include/asm-generic/pgtable-nop4d-hack.h diff --git a/include/asm-generic/pgtable-nop4d-hack.h b/include/asm-generic/pgtable-nop4d-hack.h deleted file mode 100644 index 829bdb0d6327..000000000000 --- a/include/asm-generic/pgtable-nop4d-hack.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PGTABLE_NOP4D_HACK_H -#define _PGTABLE_NOP4D_HACK_H - -#ifndef __ASSEMBLY__ -#include - -#define __PAGETABLE_PUD_FOLDED 1 - -/* - * Having the pud type consist of a pgd gets the size right, and allows - * us to conceptually access the pgd entry that this pud is folded into - * without casting. 
- */ -typedef struct { pgd_t pgd; } pud_t; - -#define PUD_SHIFT PGDIR_SHIFT -#define PTRS_PER_PUD 1 -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) - -/* - * The "pgd_xxx()" functions here are trivial for a folded two-level - * setup: the pud is never bad, and a pud always exists (as it's folded - * into the pgd entry) - */ -static inline int pgd_none(pgd_t pgd) { return 0; } -static inline int pgd_bad(pgd_t pgd) { return 0; } -static inline int pgd_present(pgd_t pgd) { return 1; } -static inline void pgd_clear(pgd_t *pgd) { } -#define pud_ERROR(pud) (pgd_ERROR((pud).pgd)) - -#define pgd_populate(mm, pgd, pud) do { } while (0) -#define pgd_populate_safe(mm, pgd, pud) do { } while (0) -/* - * (puds are folded into pgds so this doesn't get actually called, - * but the define is needed for a generic inline function.) - */ -#define set_pgd(pgdptr, pgdval) set_pud((pud_t *)(pgdptr), (pud_t) { pgdval }) - -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) -{ - return (pud_t *)pgd; -} - -#define pud_val(x) (pgd_val((x).pgd)) -#define __pud(x) ((pud_t) { __pgd(x) }) - -#define pgd_page(pgd) (pud_page((pud_t){ pgd })) -#define pgd_page_vaddr(pgd) (pud_page_vaddr((pud_t){ pgd })) - -/* - * allocating and freeing a pud is trivial: the 1-entry pud is - * inside the pgd, so has no extra memory associated with it. 
- */ -#define pud_alloc_one(mm, address) NULL -#define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x, a) do { } while (0) - -#undef pud_addr_end -#define pud_addr_end(addr, end) (end) - -#endif /* __ASSEMBLY__ */ -#endif /* _PGTABLE_NOP4D_HACK_H */ diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index d3776cb494c0..ad05c1684bfc 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -4,9 +4,6 @@ #ifndef __ASSEMBLY__ -#ifdef __ARCH_USE_5LEVEL_HACK -#include -#else #include #define __PAGETABLE_PUD_FOLDED 1 @@ -65,5 +62,4 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) #define pud_addr_end(addr, end) (end) #endif /* __ASSEMBLY__ */ -#endif /* !__ARCH_USE_5LEVEL_HACK */ #endif /* _PGTABLE_NOPUD_H */ -- cgit v1.2.3 From f089dcc74226b874a4d4b122854e0dea91ff72d8 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:47:08 -0700 Subject: mm: remove __ARCH_HAS_5LEVEL_HACK and include/asm-generic/5level-fixup.h There are no architectures that use include/asm-generic/5level-fixup.h therefore it can be removed along with __ARCH_HAS_5LEVEL_HACK define and the code it surrounds Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-15-rppt@kernel.org Signed-off-by: Linus Torvalds --- include/asm-generic/5level-fixup.h | 59 -------------------------------------- include/linux/mm.h | 7 ----- mm/kasan/init.c | 11 ------- mm/memory.c | 8 ------ 4 files changed, 85 deletions(-) 
delete mode 100644 include/asm-generic/5level-fixup.h diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h deleted file mode 100644 index 58046ddc08d0..000000000000 --- a/include/asm-generic/5level-fixup.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _5LEVEL_FIXUP_H -#define _5LEVEL_FIXUP_H - -#define __ARCH_HAS_5LEVEL_HACK -#define __PAGETABLE_P4D_FOLDED 1 - -#define P4D_SHIFT PGDIR_SHIFT -#define P4D_SIZE PGDIR_SIZE -#define P4D_MASK PGDIR_MASK -#define MAX_PTRS_PER_P4D 1 -#define PTRS_PER_P4D 1 - -#define p4d_t pgd_t - -#define pud_alloc(mm, p4d, address) \ - ((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? \ - NULL : pud_offset(p4d, address)) - -#define p4d_alloc(mm, pgd, address) (pgd) -#define p4d_alloc_track(mm, pgd, address, mask) (pgd) -#define p4d_offset(pgd, start) (pgd) - -#ifndef __ASSEMBLY__ -static inline int p4d_none(p4d_t p4d) -{ - return 0; -} - -static inline int p4d_bad(p4d_t p4d) -{ - return 0; -} - -static inline int p4d_present(p4d_t p4d) -{ - return 1; -} -#endif - -#define p4d_ERROR(p4d) do { } while (0) -#define p4d_clear(p4d) pgd_clear(p4d) -#define p4d_val(p4d) pgd_val(p4d) -#define p4d_populate(mm, p4d, pud) pgd_populate(mm, p4d, pud) -#define p4d_populate_safe(mm, p4d, pud) pgd_populate(mm, p4d, pud) -#define p4d_page(p4d) pgd_page(p4d) -#define p4d_page_vaddr(p4d) pgd_page_vaddr(p4d) - -#define __p4d(x) __pgd(x) -#define set_p4d(p4dp, p4d) set_pgd(p4dp, p4d) - -#undef p4d_free_tlb -#define p4d_free_tlb(tlb, x, addr) do { } while (0) -#define p4d_free(mm, x) do { } while (0) - -#undef p4d_addr_end -#define p4d_addr_end(addr, end) (end) - -#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 66e0977f970a..e220ce5185ad 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2069,11 +2069,6 @@ int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) -/* - * The following ifdef needed to get the 5level-fixup.h header to work. 
- * Remove it when 5level-fixup.h has been removed. - */ -#ifndef __ARCH_HAS_5LEVEL_HACK static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { @@ -2102,8 +2097,6 @@ static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, return p4d_offset(pgd, address); } -#endif /* !__ARCH_HAS_5LEVEL_HACK */ - static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, unsigned long address, pgtbl_mod_mask *mod_mask) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index ce45c491ebcd..fe6be0be1f76 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -250,20 +250,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, * 3,2 - level page tables where we don't have * puds,pmds, so pgd_populate(), pud_populate() * is noops. - * - * The ifndef is required to avoid build breakage. - * - * With 5level-fixup.h, pgd_populate() is not nop and - * we reference kasan_early_shadow_p4d. It's not defined - * unless 5-level paging enabled. - * - * The ifndef can be dropped once all KASAN-enabled - * architectures will switch to pgtable-nop4d.h. 
*/ -#ifndef __ARCH_HAS_5LEVEL_HACK pgd_populate(&init_mm, pgd, lm_alias(kasan_early_shadow_p4d)); -#endif p4d = p4d_offset(pgd, addr); p4d_populate(&init_mm, p4d, lm_alias(kasan_early_shadow_pud)); diff --git a/mm/memory.c b/mm/memory.c index 7b70398f76a0..60c279295fce 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4436,19 +4436,11 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&mm->page_table_lock); -#ifndef __ARCH_HAS_5LEVEL_HACK if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); -#else - if (!pgd_present(*p4d)) { - mm_inc_nr_puds(mm); - pgd_populate(mm, p4d, new); - } else /* Another has populated it */ - pud_free(mm, new); -#endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; } -- cgit v1.2.3 From 8898ad58a0195e2d3f5ffb770fbe9c9dc978a171 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 4 Jun 2020 16:47:12 -0700 Subject: x86/mm: define mm_p4d_folded() Patch series "mm/debug: Add tests validating architecture page table helpers", v18. This adds a test validation for architecture exported page table helpers. Patch adds basic transformation tests at various levels of the page table. This test was originally suggested by Catalin during arm64 THP migration RFC discussion earlier. Going forward it can include more specific tests with respect to various generic MM functions like THP, HugeTLB etc and platform specific tests. https://lore.kernel.org/linux-mm/20190628102003.GA56463@arrakis.emea.arm.com/ This patch (of 2): This just defines mm_p4d_folded() to check whether P4D page table level is folded at runtime. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Cc: Thomas Gleixner Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1587436495-22033-2-git-send-email-anshuman.khandual@arm.com Link: http://lkml.kernel.org/r/1588564865-31160-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index df1373415f11..8d03ffd43794 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -53,6 +53,12 @@ static inline void sync_initial_page_table(void) { } struct mm_struct; +#define mm_p4d_folded mm_p4d_folded +static inline bool mm_p4d_folded(struct mm_struct *mm) +{ + return !pgtable_l5_enabled(); +} + void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); -- cgit v1.2.3 From 399145f9eb6c670daa605f0a823f836761e560ae Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 4 Jun 2020 16:47:15 -0700 Subject: mm/debug: add tests validating architecture page table helpers This adds tests which will validate architecture page table helpers and other accessors in their compliance with expected generic MM semantics. This will help various architectures in validating changes to existing page table helpers or addition of new ones. This test covers basic page table entry transformations including but not limited to old, young, dirty, clean, write, write protect etc at various level along with populating intermediate entries with next page table page and validating them. Test page table pages are allocated from system memory with required size and alignments. The mapped pfns at page table levels are derived from a real pfn representing a valid kernel text symbol. This test gets called via late_initcall(). 
This test gets built and run when CONFIG_DEBUG_VM_PGTABLE is selected. Any architecture that is willing to subscribe to this test will need to select ARCH_HAS_DEBUG_VM_PGTABLE. For now this is limited to arc, arm64, x86, s390 and powerpc platforms where the test is known to build and run successfully. Going forward, other architectures too can subscribe to the test after fixing any build or runtime problems with their page table helpers. Folks interested in making sure that a given platform's page table helpers conform to expected generic MM semantics should enable the above config which will just trigger this test during boot. Any non-conformity here will be reported as a warning which would need to be fixed. This test will help catch any changes to the agreed-upon semantics expected from generic MM and enable platforms to accommodate it thereafter. [anshuman.khandual@arm.com: v17] Link: http://lkml.kernel.org/r/1587436495-22033-3-git-send-email-anshuman.khandual@arm.com [anshuman.khandual@arm.com: v18] Link: http://lkml.kernel.org/r/1588564865-31160-3-git-send-email-anshuman.khandual@arm.com Suggested-by: Catalin Marinas Signed-off-by: Anshuman Khandual Signed-off-by: Christophe Leroy Signed-off-by: Qian Cai Signed-off-by: Andrew Morton Tested-by: Gerald Schaefer [s390] Tested-by: Christophe Leroy [ppc32] Reviewed-by: Ingo Molnar Cc: Mike Rapoport Cc: Vineet Gupta Cc: Catalin Marinas Cc: Will Deacon Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Kirill A.
Shutemov Cc: Paul Walmsley Cc: Palmer Dabbelt Link: http://lkml.kernel.org/r/1583919272-24178-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- .../debug/debug-vm-pgtable/arch-support.txt | 34 ++ arch/arc/Kconfig | 1 + arch/arm64/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/x86/Kconfig | 1 + lib/Kconfig.debug | 22 ++ mm/Makefile | 1 + mm/debug_vm_pgtable.c | 382 +++++++++++++++++++++ 9 files changed, 444 insertions(+) create mode 100644 Documentation/features/debug/debug-vm-pgtable/arch-support.txt create mode 100644 mm/debug_vm_pgtable.c diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt new file mode 100644 index 000000000000..c527d05c0459 --- /dev/null +++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt @@ -0,0 +1,34 @@ +# +# Feature name: debug-vm-pgtable +# Kconfig: ARCH_HAS_DEBUG_VM_PGTABLE +# description: arch supports pgtable tests for semantics compliance +# + ----------------------- + | arch |status| + ----------------------- + | alpha: | TODO | + | arc: | ok | + | arm: | TODO | + | arm64: | ok | + | c6x: | TODO | + | csky: | TODO | + | h8300: | TODO | + | hexagon: | TODO | + | ia64: | TODO | + | m68k: | TODO | + | microblaze: | TODO | + | mips: | TODO | + | nds32: | TODO | + | nios2: | TODO | + | openrisc: | TODO | + | parisc: | TODO | + | powerpc: | ok | + | riscv: | TODO | + | s390: | ok | + | sh: | TODO | + | sparc: | TODO | + | um: | TODO | + | unicore32: | TODO | + | x86: | ok | + | xtensa: | TODO | + ----------------------- diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ff306246d0f8..471ef22216c4 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,6 +6,7 @@ config ARC def_bool y select ARC_TIMERS + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig 
index d0bc8bae7c8d..7f9d38444d6d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -12,6 +12,7 @@ config ARM64 select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a8eee7a64add..c4f36a0b6b6e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -116,6 +116,7 @@ config PPC # select ARCH_32BIT_OFF_T if PPC32 select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d6dc6933adc2..f854faff38c3 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -59,6 +59,7 @@ config KASAN_SHADOW_OFFSET config S390 def_bool y select ARCH_BINFMT_ELF_STATE + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 57d1c4e36738..1a54aeb40626 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -59,6 +59,7 @@ config X86 select ARCH_CLOCKSOURCE_INIT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d07ab3e056cd..30302c9b0f83 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -658,6 +658,12 @@ config SCHED_STACK_END_CHECK data corruption or a sporadic crash at a later stage once the region is examined. The runtime overhead introduced is minimal. +config ARCH_HAS_DEBUG_VM_PGTABLE + bool + help + An architecture should select this when it can successfully + build and run DEBUG_VM_PGTABLE. 
+ config DEBUG_VM bool "Debug VM" depends on DEBUG_KERNEL @@ -693,6 +699,22 @@ config DEBUG_VM_PGFLAGS If unsure, say N. +config DEBUG_VM_PGTABLE + bool "Debug arch page table for semantics compliance" + depends on MMU + depends on ARCH_HAS_DEBUG_VM_PGTABLE + default y if DEBUG_VM + help + This option provides a debug method which can be used to test + architecture page table helper functions on various platforms in + verifying if they comply with expected generic MM semantics. This + will help architecture code in making sure that any changes or + new additions of these helpers still conform to expected + semantics of the generic MM. Platforms will have to opt in for + this through ARCH_HAS_DEBUG_VM_PGTABLE. + + If unsure, say N. + config ARCH_HAS_DEBUG_VIRTUAL bool diff --git a/mm/Makefile b/mm/Makefile index fccd3756b25f..662fd1504646 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -88,6 +88,7 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o +obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c new file mode 100644 index 000000000000..188c18908964 --- /dev/null +++ b/mm/debug_vm_pgtable.c @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This kernel test validates architecture page table helpers and + * accessors and helps in verifying their continued compliance with + * expected generic MM semantics. + * + * Copyright (C) 2019 ARM Ltd. 
+ * + * Author: Anshuman Khandual + */ +#define pr_fmt(fmt) "debug_vm_pgtable: %s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC) + +/* + * On s390 platform, the lower 4 bits are used to identify given page table + * entry type. But these bits might affect the ability to clear entries with + * pxx_clear() because of how dynamic page table folding works on s390. So + * while loading up the entries do not change the lower 4 bits. It does not + * have affect any other platform. + */ +#define S390_MASK_BITS 4 +#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS) +#define RANDOM_NZVALUE GENMASK(7, 0) + +static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pte_t pte = pfn_pte(pfn, prot); + + WARN_ON(!pte_same(pte, pte)); + WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte)))); + WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte)))); + WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte)))); + WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte)))); + WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte)))); + WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte)))); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pmd_t pmd = pfn_pmd(pfn, prot); + + WARN_ON(!pmd_same(pmd, pmd)); + WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd)))); + WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd)))); + WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd)))); + WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd)))); + WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd)))); + WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd)))); + /* + * A huge page does not point to next level page table + * entry. Hence this must qualify as pmd_bad(). 
+ */ + WARN_ON(!pmd_bad(pmd_mkhuge(pmd))); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pud_t pud = pfn_pud(pfn, prot); + + WARN_ON(!pud_same(pud, pud)); + WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud)))); + WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud)))); + WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud)))); + WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud)))); + + if (mm_pmd_folded(mm)) + return; + + /* + * A huge page does not point to next level page table + * entry. Hence this must qualify as pud_bad(). + */ + WARN_ON(!pud_bad(pud_mkhuge(pud))); +} +#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) +{ + p4d_t p4d; + + memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t)); + WARN_ON(!p4d_same(p4d, p4d)); +} + +static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pgd_t pgd; + + memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t)); + WARN_ON(!pgd_same(pgd, pgd)); +} + +#ifndef __PAGETABLE_PUD_FOLDED +static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) +{ + pud_t pud = READ_ONCE(*pudp); + + if (mm_pmd_folded(mm)) + return; + + pud = __pud(pud_val(pud) | RANDOM_ORVALUE); + WRITE_ONCE(*pudp, pud); + pud_clear(pudp); + pud = READ_ONCE(*pudp); + WARN_ON(!pud_none(pud)); +} + +static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, + pmd_t *pmdp) +{ + pud_t pud; + + if (mm_pmd_folded(mm)) + return; + /* + * This entry points to next level page table page. + * Hence this must not qualify as pud_bad(). 
+ */ + pmd_clear(pmdp); + pud_clear(pudp); + pud_populate(mm, pudp, pmdp); + pud = READ_ONCE(*pudp); + WARN_ON(pud_bad(pud)); +} +#else /* !__PAGETABLE_PUD_FOLDED */ +static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { } +static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, + pmd_t *pmdp) +{ +} +#endif /* PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_P4D_FOLDED +static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) +{ + p4d_t p4d = READ_ONCE(*p4dp); + + if (mm_pud_folded(mm)) + return; + + p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); + WRITE_ONCE(*p4dp, p4d); + p4d_clear(p4dp); + p4d = READ_ONCE(*p4dp); + WARN_ON(!p4d_none(p4d)); +} + +static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, + pud_t *pudp) +{ + p4d_t p4d; + + if (mm_pud_folded(mm)) + return; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as p4d_bad(). + */ + pud_clear(pudp); + p4d_clear(p4dp); + p4d_populate(mm, p4dp, pudp); + p4d = READ_ONCE(*p4dp); + WARN_ON(p4d_bad(p4d)); +} + +static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) +{ + pgd_t pgd = READ_ONCE(*pgdp); + + if (mm_p4d_folded(mm)) + return; + + pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); + WRITE_ONCE(*pgdp, pgd); + pgd_clear(pgdp); + pgd = READ_ONCE(*pgdp); + WARN_ON(!pgd_none(pgd)); +} + +static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, + p4d_t *p4dp) +{ + pgd_t pgd; + + if (mm_p4d_folded(mm)) + return; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as pgd_bad(). 
+ */ + p4d_clear(p4dp); + pgd_clear(pgdp); + pgd_populate(mm, pgdp, p4dp); + pgd = READ_ONCE(*pgdp); + WARN_ON(pgd_bad(pgd)); +} +#else /* !__PAGETABLE_P4D_FOLDED */ +static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { } +static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { } +static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, + pud_t *pudp) +{ +} +static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, + p4d_t *p4dp) +{ +} +#endif /* PAGETABLE_P4D_FOLDED */ + +static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, + unsigned long vaddr) +{ + pte_t pte = READ_ONCE(*ptep); + + pte = __pte(pte_val(pte) | RANDOM_ORVALUE); + set_pte_at(mm, vaddr, ptep, pte); + barrier(); + pte_clear(mm, vaddr, ptep); + pte = READ_ONCE(*ptep); + WARN_ON(!pte_none(pte)); +} + +static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp) +{ + pmd_t pmd = READ_ONCE(*pmdp); + + pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); + WRITE_ONCE(*pmdp, pmd); + pmd_clear(pmdp); + pmd = READ_ONCE(*pmdp); + WARN_ON(!pmd_none(pmd)); +} + +static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pmd_t pmd; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as pmd_bad(). 
+ */ + pmd_clear(pmdp); + pmd_populate(mm, pmdp, pgtable); + pmd = READ_ONCE(*pmdp); + WARN_ON(pmd_bad(pmd)); +} + +static unsigned long __init get_random_vaddr(void) +{ + unsigned long random_vaddr, random_pages, total_user_pages; + + total_user_pages = (TASK_SIZE - FIRST_USER_ADDRESS) / PAGE_SIZE; + + random_pages = get_random_long() % total_user_pages; + random_vaddr = FIRST_USER_ADDRESS + random_pages * PAGE_SIZE; + + return random_vaddr; +} + +static int __init debug_vm_pgtable(void) +{ + struct mm_struct *mm; + pgd_t *pgdp; + p4d_t *p4dp, *saved_p4dp; + pud_t *pudp, *saved_pudp; + pmd_t *pmdp, *saved_pmdp, pmd; + pte_t *ptep; + pgtable_t saved_ptep; + pgprot_t prot; + phys_addr_t paddr; + unsigned long vaddr, pte_aligned, pmd_aligned; + unsigned long pud_aligned, p4d_aligned, pgd_aligned; + spinlock_t *uninitialized_var(ptl); + + pr_info("Validating architecture page table helpers\n"); + prot = vm_get_page_prot(VMFLAGS); + vaddr = get_random_vaddr(); + mm = mm_alloc(); + if (!mm) { + pr_err("mm_struct allocation failed\n"); + return 1; + } + + /* + * PFN for mapping at PTE level is determined from a standard kernel + * text symbol. But pfns for higher page table levels are derived by + * masking lower bits of this real pfn. These derived pfns might not + * exist on the platform but that does not really matter as pfn_pxx() + * helpers will still create appropriate entries for the test. This + * helps avoid large memory block allocations to be used for mapping + * at higher page table levels. 
+ */ + paddr = __pa_symbol(&start_kernel); + + pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT; + pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; + pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; + p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT; + pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT; + WARN_ON(!pfn_valid(pte_aligned)); + + pgdp = pgd_offset(mm, vaddr); + p4dp = p4d_alloc(mm, pgdp, vaddr); + pudp = pud_alloc(mm, p4dp, vaddr); + pmdp = pmd_alloc(mm, pudp, vaddr); + ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl); + + /* + * Save all the page table page addresses as the page table + * entries will be used for testing with random or garbage + * values. These saved addresses will be used for freeing + * page table pages. + */ + pmd = READ_ONCE(*pmdp); + saved_p4dp = p4d_offset(pgdp, 0UL); + saved_pudp = pud_offset(p4dp, 0UL); + saved_pmdp = pmd_offset(pudp, 0UL); + saved_ptep = pmd_pgtable(pmd); + + pte_basic_tests(pte_aligned, prot); + pmd_basic_tests(pmd_aligned, prot); + pud_basic_tests(pud_aligned, prot); + p4d_basic_tests(p4d_aligned, prot); + pgd_basic_tests(pgd_aligned, prot); + + pte_clear_tests(mm, ptep, vaddr); + pmd_clear_tests(mm, pmdp); + pud_clear_tests(mm, pudp); + p4d_clear_tests(mm, p4dp); + pgd_clear_tests(mm, pgdp); + + pte_unmap_unlock(ptep, ptl); + + pmd_populate_tests(mm, pmdp, saved_ptep); + pud_populate_tests(mm, pudp, saved_pmdp); + p4d_populate_tests(mm, p4dp, saved_pudp); + pgd_populate_tests(mm, pgdp, saved_p4dp); + + p4d_free(mm, saved_p4dp); + pud_free(mm, saved_pudp); + pmd_free(mm, saved_pmdp); + pte_free(mm, saved_ptep); + + mm_dec_nr_puds(mm); + mm_dec_nr_pmds(mm); + mm_dec_nr_ptes(mm); + mmdrop(mm); + return 0; +} +late_initcall(debug_vm_pgtable); -- cgit v1.2.3 From 73221d8887241103b4e6193b9c41114256ed4a18 Mon Sep 17 00:00:00 2001 From: Jeongtae Park Date: Thu, 4 Jun 2020 16:47:19 -0700 Subject: mm/vmalloc: fix a typo in comment There is a typo in comment, fix it. 
"nother" -> "another" Signed-off-by: Jeongtae Park Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Cc: Andrey Ryabinin Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200604185239.20765-1-jtp.park@samsung.com Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1e94497b7388..3091c2ca60df 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2317,7 +2317,7 @@ static inline void __vfree_deferred(const void *addr) * Use raw_cpu_ptr() because this can be called from preemptible * context. Preemption is absolutely fine here, because the llist_add() * implementation is lockless, so it works even if we are adding to - * nother cpu's list. schedule_work() should be fine with this too. + * another cpu's list. schedule_work() should be fine with this too. */ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); -- cgit v1.2.3 From 01c4b788e01bfdb715de9a6763a7515da7a30a8f Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:22 -0700 Subject: arch/kmap: remove BUG_ON() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Remove duplicated kmap code", v3. The kmap infrastructure has been copied almost verbatim to every architecture. This series consolidates obvious duplicated code by defining core functions which call into the architectures only when needed. Some of the k[un]map_atomic() implementations have some similarities but the similarities were not sufficient to warrant further changes. In addition we remove a duplicate implementation of kmap() in DRM. This patch (of 15): Replace the use of BUG_ON(in_interrupt()) in the kmap() and kunmap() in favor of might_sleep(). Besides the benefits of might_sleep(), this normalizes the implementations such that they can be made generic in subsequent patches. 
Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Dan Williams Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Christian König Cc: Daniel Vetter Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Chris Zankel Cc: Max Filippov Link: http://lkml.kernel.org/r/20200507150004.1423069-1-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20200507150004.1423069-2-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 2 +- arch/arc/mm/highmem.c | 2 +- arch/arm/mm/highmem.c | 2 +- arch/csky/mm/highmem.c | 2 +- arch/microblaze/include/asm/highmem.h | 2 +- arch/mips/mm/highmem.c | 2 +- arch/nds32/mm/highmem.c | 2 +- arch/powerpc/include/asm/highmem.h | 2 +- arch/sparc/include/asm/highmem.h | 4 ++-- arch/x86/mm/highmem_32.c | 3 +-- arch/xtensa/include/asm/highmem.h | 4 ++-- 11 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 1af00accb37f..042e92921c4c 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -45,7 +45,7 @@ static inline void flush_cache_kmaps(void) static inline void kunmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index fc8849e4f72e..39ef7b9a3aa9 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -51,7 +51,7 @@ static pte_t * fixmap_page_table; void *kmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return page_address(page); diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index a76f8ace9ce6..cc6eb79ef20c 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -42,7 +42,7 @@ 
EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 813129145f3d..690d678649d1 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -29,7 +29,7 @@ EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 332c78e15198..99ced7278b5c 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -66,7 +66,7 @@ static inline void *kmap(struct page *page) static inline void kunmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index d08e6d7d533b..edd889f6cede 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -28,7 +28,7 @@ EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index 022779af6148..4c7c28e994ea 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -24,7 +24,7 @@ EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index a4b65b186ec6..529512f6d65a 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -74,7 +74,7 @@ static inline void *kmap(struct page *page) static inline void kunmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); diff --git 
a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 18d776925c45..7dd2d4b3f980 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -55,7 +55,7 @@ void kunmap_high(struct page *page); static inline void *kmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return page_address(page); return kmap_high(page); @@ -63,7 +63,7 @@ static inline void *kmap(struct page *page) static inline void kunmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 0a1898b8552e..8af66382672b 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -15,8 +15,7 @@ EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { - if (in_interrupt()) - BUG(); + might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 04e9340eac4b..413848cc1e56 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -73,7 +73,7 @@ static inline void *kmap(struct page *page) */ BUILD_BUG_ON(PKMAP_BASE < TLBTEMP_BASE_1 + TLBTEMP_SIZE); - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return page_address(page); return kmap_high(page); @@ -81,7 +81,7 @@ static inline void *kmap(struct page *page) static inline void kunmap(struct page *page) { - BUG_ON(in_interrupt()); + might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); -- cgit v1.2.3 From 2159687248dda396ae5861b7551aeb0b5527da24 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:26 -0700 Subject: arch/xtensa: move kmap build bug out of the way MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the kmap() build bug to kmap_init() to facilitate patches to lift kmap() to the core. 
Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-3-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/xtensa/include/asm/highmem.h | 5 ----- arch/xtensa/mm/highmem.c | 4 ++++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 413848cc1e56..a9587c85be85 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -68,11 +68,6 @@ void kunmap_high(struct page *page); static inline void *kmap(struct page *page) { - /* Check if this memory layout is broken because PKMAP overlaps - * page table. - */ - BUILD_BUG_ON(PKMAP_BASE < - TLBTEMP_BASE_1 + TLBTEMP_SIZE); might_sleep(); if (!PageHighMem(page)) return page_address(page); diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 184ceadccc1a..da734a2ed641 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -88,6 +88,10 @@ void __init kmap_init(void) { unsigned long kmap_vstart; + /* Check if this memory layout is broken because PKMAP overlaps + * page table. 
+ */ + BUILD_BUG_ON(PKMAP_BASE < TLBTEMP_BASE_1 + TLBTEMP_SIZE); /* cache the first kmap pte */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); -- cgit v1.2.3 From 525aaf9bad00e7454b9f9b3873e92795afb59f8e Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:30 -0700 Subject: arch/kmap: remove redundant arch specific kmaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kmap code for all the architectures is almost 100% identical. Lift the common code to the core. Use ARCH_HAS_KMAP_FLUSH_TLB to indicate if an arch defines kmap_flush_tlb() and call it if needed. This also has the benefit of changing kmap() on a number of architectures to be an inline call rather than an actual function. Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-4-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 2 -- arch/arc/mm/highmem.c | 10 ---------- arch/arm/include/asm/highmem.h | 2 -- arch/arm/mm/highmem.c | 9 --------- arch/csky/include/asm/highmem.h | 4 ++-- arch/csky/mm/highmem.c | 14 ++++---------- arch/microblaze/include/asm/highmem.h | 9 --------- arch/mips/include/asm/highmem.h | 4 ++-- arch/mips/mm/highmem.c | 14 +++----------- arch/nds32/include/asm/highmem.h | 2 -- arch/nds32/mm/highmem.c | 12 ------------ arch/powerpc/include/asm/highmem.h | 9 --------- arch/sparc/include/asm/highmem.h | 9 --------- arch/x86/include/asm/highmem.h | 2 -- arch/x86/mm/highmem_32.c | 9 --------- arch/xtensa/include/asm/highmem.h | 9 --------- include/linux/highmem.h | 18 ++++++++++++++++++ 17 files changed, 29 insertions(+), 109 deletions(-) diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 042e92921c4c..96eb67c86961 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -30,8 +30,6 @@ #include -extern void *kmap(struct page *page); -extern void *kmap_high(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void kunmap_high(struct page *page); diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 39ef7b9a3aa9..4db13a6b9f3b 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -49,16 +49,6 @@ extern pte_t * pkmap_page_table; static pte_t * fixmap_page_table; -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - void *kmap_atomic(struct page *page) { int idx, cpu_idx; diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index eb4e4207cd3c..c917522541de 
100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -20,7 +20,6 @@ extern pte_t *pkmap_page_table; -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); /* @@ -63,7 +62,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void *kmap(struct page *page); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index cc6eb79ef20c..e8ba37c36590 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,15 +31,6 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - void kunmap(struct page *page) { might_sleep(); diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index a345a2f2c22e..9d0516e38110 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -30,10 +30,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); -extern void *kmap(struct page *page); +#define ARCH_HAS_KMAP_FLUSH_TLB +extern void kmap_flush_tlb(unsigned long addr); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 690d678649d1..4a3c273bc8b9 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -13,18 +13,12 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *kmap(struct page *page) +void kmap_flush_tlb(unsigned long addr) { - void 
*addr; - - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - addr = kmap_high(page); - flush_tlb_one((unsigned long)addr); - - return addr; + flush_tlb_one(addr); } +EXPORT_SYMBOL(kmap_flush_tlb); + EXPORT_SYMBOL(kmap); void kunmap(struct page *page) diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 99ced7278b5c..8c5bfd228bd8 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,19 +51,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 9d84aafc33d0..1f741e3ecabf 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -46,10 +46,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void * kmap_high(struct page *page); extern void kunmap_high(struct page *page); -extern void *kmap(struct page *page); +#define ARCH_HAS_KMAP_FLUSH_TLB +extern void kmap_flush_tlb(unsigned long addr); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index edd889f6cede..c72058bfead6 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -12,19 +12,11 @@ static pte_t *kmap_pte; unsigned long 
highstart_pfn, highend_pfn; -void *kmap(struct page *page) +void kmap_flush_tlb(unsigned long addr) { - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - addr = kmap_high(page); - flush_tlb_one((unsigned long)addr); - - return addr; + flush_tlb_one(addr); } -EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kmap_flush_tlb); void kunmap(struct page *page) { diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index b3a82c97ded3..b13654a79069 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -44,7 +44,6 @@ extern unsigned long highstart_pfn, highend_pfn; extern pte_t *pkmap_page_table; -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); extern void kmap_init(void); @@ -54,7 +53,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void *kmap(struct page *page); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index 4c7c28e994ea..d0cde53b84ae 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,18 +10,6 @@ #include #include -void *kmap(struct page *page) -{ - unsigned long vaddr; - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - vaddr = (unsigned long)kmap_high(page); - return (void *)vaddr; -} - -EXPORT_SYMBOL(kmap); - void kunmap(struct page *page) { might_sleep(); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 529512f6d65a..f14e4feef6d5 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,19 +59,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page 
*page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 7dd2d4b3f980..2ff1192047f7 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,17 +50,8 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void *kmap_high(struct page *page); void kunmap_high(struct page *page); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a8059930056d..c916a28a9738 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,10 +58,8 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); -void *kmap(struct page *page); void kunmap(struct page *page); void *kmap_atomic_prot(struct page *page, pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 8af66382672b..12591a81b85c 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,15 +4,6 @@ #include /* for totalram_pages */ #include -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - void kunmap(struct page *page) { might_sleep(); diff --git a/arch/xtensa/include/asm/highmem.h 
b/arch/xtensa/include/asm/highmem.h index a9587c85be85..2546b88ddecf 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -63,17 +63,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) extern pte_t *pkmap_page_table; -void *kmap_high(struct page *page); void kunmap_high(struct page *page); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index ea5cdbd8c2c3..fc3adc51254a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -34,6 +34,24 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifdef CONFIG_HIGHMEM #include +#ifndef ARCH_HAS_KMAP_FLUSH_TLB +static inline void kmap_flush_tlb(unsigned long addr) { } +#endif + +void *kmap_high(struct page *page); +static inline void *kmap(struct page *page) +{ + void *addr; + + might_sleep(); + if (!PageHighMem(page)) + addr = page_address(page); + else + addr = kmap_high(page); + kmap_flush_tlb((unsigned long)addr); + return addr; +} + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; -- cgit v1.2.3 From e23c45976f82ac789469c37e4d5a72ea2ce30bba Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:34 -0700 Subject: arch/kunmap: remove duplicate kunmap implementations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All architectures do exactly the same thing for kunmap(); remove all the duplicate definitions and lift the call to the core. This also has the benefit of changing kunmap() on a number of architectures to be an inline call rather than an actual function. 
[akpm@linux-foundation.org: fix CONFIG_HIGHMEM=n build on various architectures] Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-5-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 10 ---------- arch/arm/include/asm/highmem.h | 3 --- arch/arm/mm/highmem.c | 9 --------- arch/csky/include/asm/highmem.h | 3 --- arch/csky/mm/highmem.c | 9 --------- arch/microblaze/include/asm/highmem.h | 9 --------- arch/mips/include/asm/highmem.h | 3 --- arch/mips/mm/highmem.c | 9 --------- arch/nds32/include/asm/highmem.h | 3 --- arch/nds32/mm/highmem.c | 10 ---------- arch/powerpc/include/asm/highmem.h | 9 --------- arch/sparc/include/asm/highmem.h | 10 ---------- arch/x86/include/asm/highmem.h | 4 ---- arch/x86/mm/highmem_32.c | 9 --------- arch/xtensa/include/asm/highmem.h | 10 ---------- include/linux/highmem.h | 14 ++++++++++++++ 16 files changed, 14 insertions(+), 110 deletions(-) diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 96eb67c86961..8387a5596a91 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -32,7 +32,6 @@ extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); -extern void kunmap_high(struct page *page); extern void kmap_init(void); @@ -41,15 +40,6 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - - #endif #endif diff --git 
a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index c917522541de..736f65283e7b 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -20,8 +20,6 @@ extern pte_t *pkmap_page_table; -extern void kunmap_high(struct page *page); - /* * The reason for kmap_high_get() is to ensure that the currently kmap'd * page usage count does not decrease to zero while we're using its @@ -62,7 +60,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index e8ba37c36590..c700b32350ee 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,15 +31,6 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - void *kmap_atomic(struct page *page) { unsigned int idx; diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index 9d0516e38110..be11c5b67122 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -30,11 +30,8 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); - #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 4a3c273bc8b9..e9952211264b 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ 
-21,15 +21,6 @@ EXPORT_SYMBOL(kmap_flush_tlb); EXPORT_SYMBOL(kmap); -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - void *kmap_atomic(struct page *page) { unsigned long vaddr; diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 8c5bfd228bd8..0c94046f2d58 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,18 +51,9 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 1f741e3ecabf..24e7e7e5cc7b 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -46,11 +46,8 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); - #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index c72058bfead6..eb8ec8493f2f 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -18,15 +18,6 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -void kunmap(struct page *page) -{ - might_sleep(); 
- if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index b13654a79069..c93c7368bb3f 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -44,8 +44,6 @@ extern unsigned long highstart_pfn, highend_pfn; extern pte_t *pkmap_page_table; -extern void kunmap_high(struct page *page); - extern void kmap_init(void); /* @@ -53,7 +51,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index d0cde53b84ae..f9348bec0ecb 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,16 +10,6 @@ #include #include -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -EXPORT_SYMBOL(kunmap); - void *kmap_atomic(struct page *page) { unsigned int idx; diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index f14e4feef6d5..ba3371977d49 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,18 +59,9 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void *kmap_atomic(struct 
page *page) { return kmap_atomic_prot(page, kmap_prot); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 2ff1192047f7..4bdb79fed02c 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,16 +50,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void kunmap_high(struct page *page); - -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index c916a28a9738..90b96594d6c5 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,10 +58,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); - -void kunmap(struct page *page); - void *kmap_atomic_prot(struct page *page, pgprot_t prot); void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 12591a81b85c..c4ebfd0ae401 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,15 +4,6 @@ #include /* for totalram_pages */ #include -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 2546b88ddecf..5a481f7def0b 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -63,16 +63,6 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int 
color) extern pte_t *pkmap_page_table; -void kunmap_high(struct page *page); - -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void flush_cache_kmaps(void) { flush_cache_all(); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fc3adc51254a..216a647ed7db 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -52,6 +52,16 @@ static inline void *kmap(struct page *page) return addr; } +void kunmap_high(struct page *page); + +static inline void kunmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; @@ -102,6 +112,10 @@ static inline void *kmap(struct page *page) return page_address(page); } +static inline void kunmap_high(struct page *page) +{ +} + static inline void kunmap(struct page *page) { } -- cgit v1.2.3 From ee9bc5fdf5b6d24875fc55d43d5a0728bc2add21 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:38 -0700 Subject: {x86,powerpc,microblaze}/kmap: move preempt disable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During this kmap() conversion series we must maintain bisect-ability. To do this, kmap_atomic_prot() in x86, powerpc, and microblaze need to remain functional. Create a temporary inline version of kmap_atomic_prot within these architectures so we can rework their kmap_atomic() calls and then lift kmap_atomic_prot() to the core. Suggested-by: Al Viro Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-6-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/highmem.h | 11 ++++++++++- arch/microblaze/mm/highmem.c | 10 ++-------- arch/powerpc/include/asm/highmem.h | 11 ++++++++++- arch/powerpc/mm/highmem.c | 9 ++------- arch/x86/include/asm/highmem.h | 11 ++++++++++- arch/x86/mm/highmem_32.c | 10 ++-------- 6 files changed, 36 insertions(+), 26 deletions(-) diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 0c94046f2d58..c38d920a1171 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,7 +51,16 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + + return kmap_atomic_high_prot(page, prot); +} extern void __kunmap_atomic(void *kvaddr); static inline void *kmap_atomic(struct page *page) diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c index d7569f77fa15..0e3efaa8a004 100644 --- a/arch/microblaze/mm/highmem.c +++ b/arch/microblaze/mm/highmem.c @@ -32,18 +32,12 @@ */ #include -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -55,7 
+49,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void *) vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_high_prot); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index ba3371977d49..d049806a8354 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,7 +59,16 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + + return kmap_atomic_high_prot(page, prot); +} extern void __kunmap_atomic(void *kvaddr); static inline void *kmap_atomic(struct page *page) diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 320c1672b2ae..f075cef6d663 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -30,16 +30,11 @@ * be used in IRQ contexts, so in some (very limited) cases we need * it. 
*/ -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -49,7 +44,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_high_prot); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 90b96594d6c5..61f47fef40e5 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,7 +58,16 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -void *kmap_atomic_prot(struct page *page, pgprot_t prot); +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + + return kmap_atomic_high_prot(page, prot); +} void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index c4ebfd0ae401..48b56b1af902 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -12,17 +12,11 @@ * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. 
*/ -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -32,7 +26,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_high_prot); void *kmap_atomic(struct page *page) { -- cgit v1.2.3 From 78b6d91ec7bbfc5bcc2dd05bb2cf13c9de1dc7cd Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:42 -0700 Subject: arch/kmap_atomic: consolidate duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every arch has the same code to ensure atomic operations and a check for !HIGHMEM page. Remove the duplicate code by defining a core kmap_atomic() which only calls the arch specific kmap_atomic_high() when the page is high memory. [akpm@linux-foundation.org: coding style fixes] Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-7-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 1 - arch/arc/mm/highmem.c | 9 ++------- arch/arm/include/asm/highmem.h | 1 - arch/arm/mm/highmem.c | 9 ++------- arch/csky/include/asm/highmem.h | 1 - arch/csky/mm/highmem.c | 9 ++------- arch/microblaze/include/asm/highmem.h | 4 ++-- arch/mips/include/asm/highmem.h | 1 - arch/mips/mm/cache.c | 2 +- arch/mips/mm/highmem.c | 18 ++---------------- arch/nds32/include/asm/highmem.h | 1 - arch/nds32/mm/highmem.c | 11 ++--------- arch/powerpc/include/asm/highmem.h | 4 ++-- arch/powerpc/mm/highmem.c | 6 ------ arch/sparc/include/asm/highmem.h | 1 - arch/sparc/mm/highmem.c | 9 ++------- arch/x86/include/asm/highmem.h | 5 ++++- arch/x86/mm/highmem_32.c | 14 -------------- arch/xtensa/include/asm/highmem.h | 1 - arch/xtensa/mm/highmem.c | 9 ++------- include/linux/highmem.h | 23 +++++++++++++++++++++++ 21 files changed, 46 insertions(+), 93 deletions(-) diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 8387a5596a91..db425cd38545 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -30,7 +30,6 @@ #include -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void kmap_init(void); diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 4db13a6b9f3b..0964b011c29f 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -49,16 +49,11 @@ extern pte_t * pkmap_page_table; static pte_t * fixmap_page_table; -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { int idx, cpu_idx; unsigned long vaddr; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - cpu_idx = kmap_atomic_idx_push(); idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); vaddr 
= FIXMAP_ADDR(idx); @@ -68,7 +63,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kv) { diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 736f65283e7b..8c80bfe18a34 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -60,7 +60,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #endif diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index c700b32350ee..075fdc235091 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,18 +31,13 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned int idx; unsigned long vaddr; void *kmap; int type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - #ifdef CONFIG_DEBUG_HIGHMEM /* * There is no cache coherency issue when non VIVT, so force the @@ -76,7 +71,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index be11c5b67122..8ceee12f9bc1 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -32,7 +32,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/csky/mm/highmem.c 
b/arch/csky/mm/highmem.c index e9952211264b..63d74b47eee6 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -21,16 +21,11 @@ EXPORT_SYMBOL(kmap_flush_tlb); EXPORT_SYMBOL(kmap); -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -42,7 +37,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index c38d920a1171..f7c5467df5ad 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -63,9 +63,9 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) } extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic_high(struct page *page) { - return kmap_atomic_prot(page, kmap_prot); + return kmap_atomic_high_prot(page, kmap_prot); } #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 24e7e7e5cc7b..8bdbbfc322ad 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -48,7 +48,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index ad6df1cea866..295bfda7da97 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -14,9 +14,9 @@ #include #include #include 
+#include #include -#include #include #include #include diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index eb8ec8493f2f..2bda56372995 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -18,25 +18,11 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap is is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ - -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -48,7 +34,7 @@ void *kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index c93c7368bb3f..a3970e566ede 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -51,7 +51,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index f9348bec0ecb..d4387d835870 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,18 +10,13 @@ #include #include -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned int idx; unsigned long vaddr, pte; int type; pte_t *ptep; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); @@ -36,8 +31,7 @@ void *kmap_atomic(struct page *page) __nds32__isb(); return (void *)vaddr; } - -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { @@ -53,5 +47,4 @@ void __kunmap_atomic(void *kvaddr) pagefault_enable(); preempt_enable(); } - EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index d049806a8354..74fa2c726fde 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -71,9 +71,9 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) } extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic_high(struct page *page) { - return kmap_atomic_prot(page, kmap_prot); + return kmap_atomic_high_prot(page, kmap_prot); } diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index f075cef6d663..67aaa5217f7f 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -24,12 +24,6 @@ #include #include -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. 
But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. - */ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 4bdb79fed02c..458210c5bc38 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,7 +50,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index d4a80adea7e5..b53070ab6a31 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -53,16 +53,11 @@ void __init kmap_init(void) kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned long vaddr; long idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -87,7 +82,7 @@ void *kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 61f47fef40e5..9393d55a2adb 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -68,7 +68,10 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return kmap_atomic_high_prot(page, prot); } -void *kmap_atomic(struct page *page); +static inline void *kmap_atomic_high(struct page *page) +{ + return kmap_atomic_high_prot(page, kmap_prot); +} void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, 
pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 48b56b1af902..c3e272a759e0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,14 +4,6 @@ #include /* for totalram_pages */ #include -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap it is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; @@ -28,12 +20,6 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_high_prot); -void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} -EXPORT_SYMBOL(kmap_atomic); - /* * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. 
diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 5a481f7def0b..1e6aa15c4bdf 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -68,7 +68,6 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void kmap_init(void); diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index da734a2ed641..90b85a897cb0 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -37,16 +37,11 @@ static inline enum fixed_addresses kmap_idx(int type, unsigned long color) color; } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { enum fixed_addresses idx; unsigned long vaddr; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - idx = kmap_idx(kmap_atomic_idx_push(), DCACHE_ALIAS(page_to_phys(page))); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -57,7 +52,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 216a647ed7db..d2209ae8be99 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -32,6 +32,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include #ifdef CONFIG_HIGHMEM +extern void *kmap_atomic_high(struct page *page); #include #ifndef ARCH_HAS_KMAP_FLUSH_TLB @@ -62,6 +63,28 @@ static inline void kunmap(struct page *page) kunmap_high(page); } +/* + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap it is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only.
+ * + * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap + * gives a more generic (and caching) interface. But kmap_atomic can + * be used in IRQ contexts, so in some (very limited) cases we need + * it. + */ +static inline void *kmap_atomic(struct page *page) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + return kmap_atomic_high(page); +} + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; -- cgit v1.2.3 From abca2500c0c1b20c3e552f259da4c4a99db3b4d1 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:46 -0700 Subject: arch/kunmap_atomic: consolidate duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every single architecture (including !CONFIG_HIGHMEM) calls... pagefault_enable(); preempt_enable(); ... before returning from __kunmap_atomic(). Lift this code into the kunmap_atomic() macro. While we are at it rename __kunmap_atomic() to kunmap_atomic_high() to be consistent. [ira.weiny@intel.com: don't enable pagefault/preempt twice] Link: http://lkml.kernel.org/r/20200518184843.3029640-1-ira.weiny@intel.com [akpm@linux-foundation.org: coding style fixes] Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Guenter Roeck Link: http://lkml.kernel.org/r/20200507150004.1423069-8-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 2 -- arch/arc/mm/highmem.c | 7 ++----- arch/arm/include/asm/highmem.h | 1 - arch/arm/mm/highmem.c | 6 ++---- arch/csky/include/asm/highmem.h | 1 - arch/csky/mm/highmem.c | 9 +++------ arch/microblaze/include/asm/highmem.h | 1 - arch/microblaze/mm/highmem.c | 11 +++-------- arch/mips/include/asm/highmem.h | 1 - arch/mips/mm/cache.c | 4 ++-- arch/mips/mm/highmem.c | 11 +++-------- arch/nds32/include/asm/highmem.h | 1 - arch/nds32/mm/highmem.c | 6 ++---- arch/parisc/include/asm/cacheflush.h | 4 +--- arch/powerpc/include/asm/highmem.h | 1 - arch/powerpc/mm/highmem.c | 11 +++-------- arch/sparc/include/asm/highmem.h | 2 -- arch/sparc/mm/highmem.c | 11 +++-------- arch/x86/include/asm/highmem.h | 1 - arch/x86/mm/highmem_32.c | 7 ++----- arch/xtensa/include/asm/highmem.h | 2 -- arch/xtensa/mm/highmem.c | 7 ++----- include/linux/highmem.h | 13 +++++++++---- 23 files changed, 37 insertions(+), 83 deletions(-) diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index db425cd38545..70900a73bfc8 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -30,8 +30,6 @@ #include -extern void __kunmap_atomic(void *kvaddr); - extern void kmap_init(void); static inline void flush_cache_kmaps(void) diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 0964b011c29f..5d3eab4ac0b0 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -65,7 +65,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kv) +void kunmap_atomic_high(void *kv) { unsigned long kvaddr = (unsigned long)kv; @@ -87,11 +87,8 @@ void __kunmap_atomic(void *kv) kmap_atomic_idx_pop(); } - - pagefault_enable(); - 
preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 8c80bfe18a34..b0d4bd8dc3c1 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -60,7 +60,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #endif diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 075fdc235091..ac8394655a6e 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -73,7 +73,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int idx, type; @@ -95,10 +95,8 @@ void __kunmap_atomic(void *kvaddr) /* this address was obtained through kmap_high_get() */ kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); } - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void *kmap_atomic_pfn(unsigned long pfn) { diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index 8ceee12f9bc1..263fbddcd0a3 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -32,7 +32,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 63d74b47eee6..0aafbbbe651c 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -39,13 +39,13 @@ void *kmap_atomic_high(struct page 
*page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int idx; if (vaddr < FIXADDR_START) - goto out; + return; #ifdef CONFIG_DEBUG_HIGHMEM idx = KM_TYPE_NR*smp_processor_id() + kmap_atomic_idx(); @@ -58,11 +58,8 @@ void __kunmap_atomic(void *kvaddr) (void) idx; /* to kill a warning */ #endif kmap_atomic_idx_pop(); -out: - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index f7c5467df5ad..c3cbda90391d 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -61,7 +61,6 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return kmap_atomic_high_prot(page, prot); } -extern void __kunmap_atomic(void *kvaddr); static inline void *kmap_atomic_high(struct page *page) { diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c index 0e3efaa8a004..92e0890416c9 100644 --- a/arch/microblaze/mm/highmem.c +++ b/arch/microblaze/mm/highmem.c @@ -51,17 +51,14 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type; unsigned int idx; - if (vaddr < __fix_to_virt(FIX_KMAP_END)) { - pagefault_enable(); - preempt_enable(); + if (vaddr < __fix_to_virt(FIX_KMAP_END)) return; - } type = kmap_atomic_idx(); @@ -77,7 +74,5 @@ void __kunmap_atomic(void *kvaddr) local_flush_tlb_page(NULL, vaddr); kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/mips/include/asm/highmem.h 
b/arch/mips/include/asm/highmem.h index 8bdbbfc322ad..76dec0bd4f59 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -48,7 +48,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 295bfda7da97..3e81ba000096 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -103,7 +103,7 @@ void __flush_dcache_page(struct page *page) flush_data_cache_page(addr); if (PageHighMem(page)) - __kunmap_atomic((void *)addr); + kunmap_atomic((void *)addr); } EXPORT_SYMBOL(__flush_dcache_page); @@ -146,7 +146,7 @@ void __update_cache(unsigned long address, pte_t pte) flush_data_cache_page(addr); if (PageHighMem(page)) - __kunmap_atomic((void *)addr); + kunmap_atomic((void *)addr); ClearPageDcacheDirty(page); } diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 2bda56372995..73ef4004fe5f 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -36,16 +36,13 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type __maybe_unused; - if (vaddr < FIXADDR_START) { // FIXME - pagefault_enable(); - preempt_enable(); + if (vaddr < FIXADDR_START) return; - } type = kmap_atomic_idx(); #ifdef CONFIG_DEBUG_HIGHMEM @@ -63,10 +60,8 @@ void __kunmap_atomic(void *kvaddr) } #endif kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index a3970e566ede..4d21308549c9 
100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -51,7 +51,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); #endif diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index d4387d835870..d25c815fda21 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -33,7 +33,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { if (kvaddr >= (void *)FIXADDR_START) { unsigned long vaddr = (unsigned long)kvaddr; @@ -44,7 +44,5 @@ void __kunmap_atomic(void *kvaddr) ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, 0); } - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 0c83644bfa5c..119c9a7681bc 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -122,11 +122,9 @@ static inline void *kmap_atomic(struct page *page) return page_address(page); } -static inline void __kunmap_atomic(void *addr) +static inline void kunmap_atomic_high(void *addr) { flush_kernel_dcache_page_addr(addr); - pagefault_enable(); - preempt_enable(); } #define kmap_atomic_prot(page, prot) kmap_atomic(page) diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 74fa2c726fde..373a470df205 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -69,7 +69,6 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return kmap_atomic_high_prot(page, prot); } -extern void __kunmap_atomic(void *kvaddr); static inline void *kmap_atomic_high(struct page *page) { diff 
--git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 67aaa5217f7f..624b4438aff9 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -40,15 +40,12 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - if (vaddr < __fix_to_virt(FIX_KMAP_END)) { - pagefault_enable(); - preempt_enable(); + if (vaddr < __fix_to_virt(FIX_KMAP_END)) return; - } if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { int type = kmap_atomic_idx(); @@ -66,7 +63,5 @@ void __kunmap_atomic(void *kvaddr) } kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 458210c5bc38..f4babe67cb5d 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,8 +50,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void __kunmap_atomic(void *kvaddr); - #define flush_cache_kmaps() flush_cache_all() #endif /* __KERNEL__ */ diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index b53070ab6a31..06798ae813b9 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -84,16 +84,13 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type; - if (vaddr < FIXADDR_START) { // FIXME - pagefault_enable(); - preempt_enable(); + if (vaddr < FIXADDR_START) return; - } type = kmap_atomic_idx(); @@ -126,7 +123,5 @@ void __kunmap_atomic(void *kvaddr) #endif kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git 
a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 9393d55a2adb..be66b77885a0 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -72,7 +72,6 @@ static inline void *kmap_atomic_high(struct page *page) { return kmap_atomic_high_prot(page, kmap_prot); } -void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index c3e272a759e0..075fe51317b0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -30,7 +30,7 @@ void *kmap_atomic_pfn(unsigned long pfn) } EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -60,11 +60,8 @@ void __kunmap_atomic(void *kvaddr) BUG_ON(vaddr >= (unsigned long)high_memory); } #endif - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init set_highmem_pages_init(void) { diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 1e6aa15c4bdf..d6a10704307a 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -68,8 +68,6 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -void __kunmap_atomic(void *kvaddr); - void kmap_init(void); #endif diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 90b85a897cb0..4de323e43682 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -54,7 +54,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { if (kvaddr >= (void *)FIXADDR_START && kvaddr < (void *)FIXADDR_TOP) { @@ -73,11 +73,8 @@ void __kunmap_atomic(void *kvaddr) kmap_atomic_idx_pop(); } - - pagefault_enable(); - preempt_enable(); } 
-EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init kmap_init(void) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d2209ae8be99..945b58d8a57b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -33,6 +33,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifdef CONFIG_HIGHMEM extern void *kmap_atomic_high(struct page *page); +extern void kunmap_atomic_high(void *kvaddr); #include #ifndef ARCH_HAS_KMAP_FLUSH_TLB @@ -151,10 +152,12 @@ static inline void *kmap_atomic(struct page *page) } #define kmap_atomic_prot(page, prot) kmap_atomic(page) -static inline void __kunmap_atomic(void *addr) +static inline void kunmap_atomic_high(void *addr) { - pagefault_enable(); - preempt_enable(); + /* + * Nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() + * handles re-enabling faults + preemption + */ } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) @@ -204,7 +207,9 @@ static inline void kmap_atomic_idx_pop(void) #define kunmap_atomic(addr) \ do { \ BUILD_BUG_ON(__same_type((addr), struct page *)); \ - __kunmap_atomic(addr); \ + kunmap_atomic_high(addr); \ + pagefault_enable(); \ + preempt_enable(); \ } while (0) -- cgit v1.2.3 From db458d73fa35e256bba90b59a6776810800c8bb6 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:50 -0700 Subject: arch/kmap: ensure kmap_prot visibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to support kmap_atomic_prot() on all architectures and it makes sense to define kmap_atomic() to use the default kmap_prot. So we ensure all arch's have a globally available kmap_prot either as a define or exported symbol. 
Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-9-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/highmem.h | 2 +- arch/microblaze/mm/init.c | 3 --- arch/powerpc/include/asm/highmem.h | 2 +- arch/powerpc/mm/mem.c | 3 --- arch/sparc/mm/highmem.c | 1 + 5 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index c3cbda90391d..90d96239152f 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -25,8 +25,8 @@ #include #include +#define kmap_prot PAGE_KERNEL extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; /* diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index dcaa53d11339..d943f69784b1 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -49,8 +49,6 @@ unsigned long lowmem_size; #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); static inline pte_t *virt_to_kpte(unsigned long vaddr) { @@ -68,7 +66,6 @@ static void __init highmem_init(void) pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); - kmap_prot = PAGE_KERNEL; } static void highmem_setup(void) diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 373a470df205..ee5de974c5ef 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -29,8 +29,8 @@ 
#include #include +#define kmap_prot PAGE_KERNEL extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; /* diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 0fcea21f26b4..7cebb9c818d3 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -64,8 +64,6 @@ bool init_mem_is_free; #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); #endif pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -245,7 +243,6 @@ void __init paging_init(void) pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); - kmap_prot = PAGE_KERNEL; #endif /* CONFIG_HIGHMEM */ printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n", diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 06798ae813b9..02e23a677c58 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -33,6 +33,7 @@ #include pgprot_t kmap_prot; +EXPORT_SYMBOL(kmap_prot); static pte_t *kmap_pte; -- cgit v1.2.3 From d8c25836fa16496392bfa6a43908d231dd41bfc6 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:54 -0700 Subject: arch/kmap: don't hard code kmap_prot values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To support kmap_atomic_prot() on all architectures each arch must support protections passed in to them. Change csky, mips, nds32 and xtensa to use their global constant kmap_prot rather than a hard coded value which was equal. Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-10-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/csky/mm/highmem.c | 2 +- arch/mips/mm/highmem.c | 2 +- arch/nds32/mm/highmem.c | 2 +- arch/xtensa/mm/highmem.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 0aafbbbe651c..f4311669b5bb 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -32,7 +32,7 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL)); + set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); flush_tlb_one((unsigned long)vaddr); return (void *)vaddr; diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 73ef4004fe5f..96a486777a18 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -29,7 +29,7 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL)); + set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); local_flush_tlb_one((unsigned long)vaddr); return (void*) vaddr; diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index d25c815fda21..b11b88956353 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -21,7 +21,7 @@ void *kmap_atomic_high(struct page *page) idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - pte = (page_to_pfn(page) << PAGE_SHIFT) | (PAGE_KERNEL); + pte = (page_to_pfn(page) << PAGE_SHIFT) | (kmap_prot); ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, pte); diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 4de323e43682..50168b09510a 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -48,7 +48,7 @@ void 
*kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte + idx))); #endif - set_pte(kmap_pte + idx, mk_pte(page, PAGE_KERNEL_EXEC)); + set_pte(kmap_pte + idx, mk_pte(page, kmap_prot)); return (void *)vaddr; } -- cgit v1.2.3 From 20b271dfe9d932b02b067a1f7ba9805c5b8d79bd Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:58 -0700 Subject: arch/kmap: define kmap_atomic_prot() for all arch's MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To support kmap_atomic_prot(), all architectures need to support protections passed to their kmap_atomic_high() function. Pass protections into kmap_atomic_high() and change the name to kmap_atomic_high_prot() to match. Then define kmap_atomic_prot() as a core function which calls kmap_atomic_high_prot() when needed. Finally, redefine kmap_atomic() as a wrapper of kmap_atomic_prot() with the default kmap_prot exported by the architectures. Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-11-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/mm/highmem.c | 6 +++--- arch/arm/mm/highmem.c | 6 +++--- arch/csky/mm/highmem.c | 6 +++--- arch/microblaze/include/asm/highmem.h | 16 ---------------- arch/mips/mm/highmem.c | 6 +++--- arch/nds32/mm/highmem.c | 6 +++--- arch/powerpc/include/asm/highmem.h | 17 ----------------- arch/sparc/mm/highmem.c | 6 +++--- arch/x86/include/asm/highmem.h | 14 -------------- arch/xtensa/mm/highmem.c | 6 +++--- include/linux/highmem.h | 7 ++++--- 11 files changed, 25 insertions(+), 71 deletions(-) diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 5d3eab4ac0b0..479b0d72d3cf 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -49,7 +49,7 @@ extern pte_t * pkmap_page_table; static pte_t * fixmap_page_table; -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { int idx, cpu_idx; unsigned long vaddr; @@ -59,11 +59,11 @@ void *kmap_atomic_high(struct page *page) vaddr = FIXMAP_ADDR(idx); set_pte_at(&init_mm, vaddr, fixmap_page_table + idx, - mk_pte(page, kmap_prot)); + mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kv) { diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index ac8394655a6e..e013f6b81328 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,7 +31,7 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned int idx; unsigned long vaddr; @@ -67,11 +67,11 @@ void *kmap_atomic_high(struct page *page) * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. 
*/ - set_fixmap_pte(idx, mk_pte(page, kmap_prot)); + set_fixmap_pte(idx, mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index f4311669b5bb..3ae5c8cd7619 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -21,7 +21,7 @@ EXPORT_SYMBOL(kmap_flush_tlb); EXPORT_SYMBOL(kmap); -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; @@ -32,12 +32,12 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); flush_tlb_one((unsigned long)vaddr); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 90d96239152f..d7c55cfd27bd 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,22 +51,6 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_atomic_high_prot(page, prot); -} - -static inline void *kmap_atomic_high(struct page *page) -{ - return kmap_atomic_high_prot(page, kmap_prot); -} - #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } #endif /* __KERNEL__ */ diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 96a486777a18..8e8726992720 100644 --- 
a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -18,7 +18,7 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; @@ -29,12 +29,12 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); local_flush_tlb_one((unsigned long)vaddr); return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index b11b88956353..4284cd59e21a 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,7 +10,7 @@ #include #include -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned int idx; unsigned long vaddr, pte; @@ -21,7 +21,7 @@ void *kmap_atomic_high(struct page *page) idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - pte = (page_to_pfn(page) << PAGE_SHIFT) | (kmap_prot); + pte = (page_to_pfn(page) << PAGE_SHIFT) | prot; ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, pte); @@ -31,7 +31,7 @@ void *kmap_atomic_high(struct page *page) __nds32__isb(); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index ee5de974c5ef..8d8ee3fcd800 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,23 +59,6 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void 
*kmap_atomic_high_prot(struct page *page, pgprot_t prot); -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_atomic_high_prot(page, prot); -} - -static inline void *kmap_atomic_high(struct page *page) -{ - return kmap_atomic_high_prot(page, kmap_prot); -} - - #define flush_cache_kmaps() flush_cache_all() #endif /* __KERNEL__ */ diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 02e23a677c58..9309bcab4ae6 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -54,7 +54,7 @@ void __init kmap_init(void) kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; long idx, type; @@ -73,7 +73,7 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte-idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); /* XXX Fix - Anton */ #if 0 __flush_tlb_one(vaddr); @@ -83,7 +83,7 @@ void *kmap_atomic_high(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index be66b77885a0..0f420b24e0fc 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,20 +58,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - return 
kmap_atomic_high_prot(page, prot); -} -static inline void *kmap_atomic_high(struct page *page) -{ - return kmap_atomic_high_prot(page, kmap_prot); -} void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 50168b09510a..99b5ad137ab5 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -37,7 +37,7 @@ static inline enum fixed_addresses kmap_idx(int type, unsigned long color) color; } -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; @@ -48,11 +48,11 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte + idx))); #endif - set_pte(kmap_pte + idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte + idx, mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 945b58d8a57b..9c559c670299 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -32,7 +32,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic_high(struct page *page); +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); extern void kunmap_atomic_high(void *kvaddr); #include @@ -77,14 +77,15 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. 
*/ -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - return kmap_atomic_high(page); + return kmap_atomic_high_prot(page, prot); } +#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); -- cgit v1.2.3 From 915ecc22d5b20fc936b91d3678b267a96b352c12 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:48:02 -0700 Subject: drm: remove drm specific kmap_atomic code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kmap_atomic_prot() is now exported by all architectures. Use this function rather than open coding a driver specific kmap_atomic. [arnd@arndb.de: include linux/highmem.h] Link: http://lkml.kernel.org/r/20200508220150.649044-1-arnd@arndb.de Signed-off-by: Ira Weiny Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Reviewed-by: Christian König Reviewed-by: Christoph Hellwig Acked-by: Daniel Vetter Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Chris Zankel Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-12-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- drivers/gpu/drm/ttm/ttm_bo_util.c | 56 +++--------------------------------- drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 17 +++++------ include/drm/ttm/ttm_bo_api.h | 4 --- 3 files changed, 13 insertions(+), 64 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index 52d2b71f1588..f09b096ba4fd 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -257,54 +257,6 @@ static int ttm_copy_io_page(void *dst, void *src, unsigned long page) return 0; } -#ifdef CONFIG_X86 -#define __ttm_kmap_atomic_prot(__page, __prot) kmap_atomic_prot(__page, __prot) -#define __ttm_kunmap_atomic(__addr) kunmap_atomic(__addr) -#else -#define __ttm_kmap_atomic_prot(__page, __prot) vmap(&__page, 1, 0, __prot) -#define __ttm_kunmap_atomic(__addr) vunmap(__addr) -#endif - - -/** - * ttm_kmap_atomic_prot - Efficient kernel map of a single page with - * specified page protection. - * - * @page: The page to map. - * @prot: The page protection. - * - * This function maps a TTM page using the kmap_atomic api if available, - * otherwise falls back to vmap. The user must make sure that the - * specified page does not have an aliased mapping with a different caching - * policy unless the architecture explicitly allows it. Also mapping and - * unmapping using this api must be correctly nested. Unmapping should - * occur in the reverse order of mapping. - */ -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) - return kmap_atomic(page); - else - return __ttm_kmap_atomic_prot(page, prot); -} -EXPORT_SYMBOL(ttm_kmap_atomic_prot); - -/** - * ttm_kunmap_atomic_prot - Unmap a page that was mapped using - * ttm_kmap_atomic_prot. 
- * - * @addr: The virtual address from the map. - * @prot: The page protection. - */ -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot) -{ - if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) - kunmap_atomic(addr); - else - __ttm_kunmap_atomic(addr); -} -EXPORT_SYMBOL(ttm_kunmap_atomic_prot); - static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, unsigned long page, pgprot_t prot) @@ -316,13 +268,13 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, return -ENOMEM; src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); - dst = ttm_kmap_atomic_prot(d, prot); + dst = kmap_atomic_prot(d, prot); if (!dst) return -ENOMEM; memcpy_fromio(dst, src, PAGE_SIZE); - ttm_kunmap_atomic_prot(dst, prot); + kunmap_atomic(dst); return 0; } @@ -338,13 +290,13 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, return -ENOMEM; dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); - src = ttm_kmap_atomic_prot(s, prot); + src = kmap_atomic_prot(s, prot); if (!src) return -ENOMEM; memcpy_toio(dst, src, PAGE_SIZE); - ttm_kunmap_atomic_prot(src, prot); + kunmap_atomic(src); return 0; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c index bb46ca0c458f..1629427d5734 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c @@ -27,6 +27,7 @@ **************************************************************************/ #include "vmwgfx_drv.h" +#include <linux/highmem.h> /* * Template that implements find_first_diff() for a generic @@ -374,12 +375,12 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset); if (unmap_src) { - ttm_kunmap_atomic_prot(d->src_addr, d->src_prot); + kunmap_atomic(d->src_addr); d->src_addr = NULL; } if (unmap_dst) { - ttm_kunmap_atomic_prot(d->dst_addr, d->dst_prot); + kunmap_atomic(d->dst_addr); d->dst_addr = NULL; } @@ -388,8 +389,8 @@ static int vmw_bo_cpu_blit_line(struct
vmw_bo_blit_line_data *d, return -EINVAL; d->dst_addr = - ttm_kmap_atomic_prot(d->dst_pages[dst_page], - d->dst_prot); + kmap_atomic_prot(d->dst_pages[dst_page], + d->dst_prot); if (!d->dst_addr) return -ENOMEM; @@ -401,8 +402,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, return -EINVAL; d->src_addr = - ttm_kmap_atomic_prot(d->src_pages[src_page], - d->src_prot); + kmap_atomic_prot(d->src_pages[src_page], + d->src_prot); if (!d->src_addr) return -ENOMEM; @@ -499,9 +500,9 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, } out: if (d.src_addr) - ttm_kunmap_atomic_prot(d.src_addr, d.src_prot); + kunmap_atomic(d.src_addr); if (d.dst_addr) - ttm_kunmap_atomic_prot(d.dst_addr, d.dst_prot); + kunmap_atomic(d.dst_addr); return ret; } diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index 0a9d042e075a..de1ccdcd5703 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -668,10 +668,6 @@ int ttm_bo_mmap_obj(struct vm_area_struct *vma, struct ttm_buffer_object *bo); int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma, struct ttm_bo_device *bdev); -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot); - -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot); - /** * ttm_bo_io * -- cgit v1.2.3 From 8bfb1a10f2bb621a2c4c8bf7671b163f7e20d332 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:48:06 -0700 Subject: kmap: remove kmap_atomic_to_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kmap_atomic_to_page() has no callers and is only defined on 1 arch and declared on another. Remove it. Suggested-by: Al Viro Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Christoph Hellwig Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-13-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/csky/include/asm/highmem.h | 1 - arch/csky/mm/highmem.c | 13 ------------- arch/nds32/include/asm/highmem.h | 1 - 3 files changed, 15 deletions(-) diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index 263fbddcd0a3..ea2f3f39174d 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -33,7 +33,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); extern void *kmap_atomic_pfn(unsigned long pfn); -extern struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do {} while (0) diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 3ae5c8cd7619..3b3f622f5ae9 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -81,19 +81,6 @@ void *kmap_atomic_pfn(unsigned long pfn) return (void *) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - static void __init kmap_pages_init(void) { unsigned long vaddr; diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index 4d21308549c9..a48a6536d41a 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -52,7 +52,6 @@ extern void kmap_init(void); */ #ifdef CONFIG_HIGHMEM extern void *kmap_atomic_pfn(unsigned long pfn); -extern struct page *kmap_atomic_to_page(void *ptr); #endif #endif -- cgit v1.2.3 From 7438f36310ddd9fe536fc7403187f63427cecaba Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:48:10 -0700 
Subject: parisc/kmap: remove duplicate kmap code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parisc reimplements the kmap calls except to flush its dcache. This is arguably an abuse of kmap but regardless it is messy and confusing. Remove the duplicate code and have parisc define ARCH_HAS_FLUSH_ON_KUNMAP for a kunmap_flush_on_unmap() architecture specific call to flush the cache. Suggested-by: Al Viro Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Christoph Hellwig Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-14-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/parisc/include/asm/cacheflush.h | 28 ++-------------------------- include/linux/highmem.h | 10 +++++++--- 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 119c9a7681bc..99663fc1f997 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -100,35 +100,11 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma } } -#include - -#define ARCH_HAS_KMAP - -static inline void *kmap(struct page *page) -{ - might_sleep(); - return page_address(page); -} - -static inline void kunmap(struct page *page) -{ - flush_kernel_dcache_page_addr(page_address(page)); -} - -static inline void *kmap_atomic(struct page *page) -{ - preempt_disable(); - pagefault_disable(); - return page_address(page); -} - -static inline void kunmap_atomic_high(void *addr) +#define ARCH_HAS_FLUSH_ON_KUNMAP +static inline void kunmap_flush_on_unmap(void *addr) { 
flush_kernel_dcache_page_addr(addr); } -#define kmap_atomic_prot(page, prot) kmap_atomic(page) -#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) - #endif /* _PARISC_CACHEFLUSH_H */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 9c559c670299..091b32dff2d1 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -130,7 +130,6 @@ static inline struct page *kmap_to_page(void *addr) static inline unsigned long totalhigh_pages(void) { return 0UL; } -#ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) { might_sleep(); @@ -143,6 +142,9 @@ static inline void kunmap_high(struct page *page) static inline void kunmap(struct page *page) { +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(page_address(page)); +#endif } static inline void *kmap_atomic(struct page *page) @@ -156,15 +158,17 @@ static inline void *kmap_atomic(struct page *page) static inline void kunmap_atomic_high(void *addr) { /* - * Nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() + * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() * handles re-enabling faults + preemption */ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); +#endif } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) #define kmap_flush_unused() do {} while(0) -#endif #endif /* CONFIG_HIGHMEM */ -- cgit v1.2.3 From db6f1785f1c2462c388c516b0dacb980cf65012c Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:48:14 -0700 Subject: sparc: remove unnecessary includes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit linux/highmem.h has not been needed for the pte_offset_map => kmap_atomic use in sparc for some time (~2002) Remove this include. 
Suggested-by: Al Viro Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Christoph Hellwig Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-15-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/sparc/mm/io-unit.c | 1 - arch/sparc/mm/iommu.c | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c index 289276b99b01..08238d989cfd 100644 --- a/arch/sparc/mm/io-unit.c +++ b/arch/sparc/mm/io-unit.c @@ -10,7 +10,6 @@ #include #include #include -#include <linux/highmem.h> /* pte_offset_map => kmap_atomic */ #include #include #include diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index b00dde13681b..f1e08e30b64e 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -12,7 +12,6 @@ #include #include #include -#include <linux/highmem.h> /* pte_offset_map => kmap_atomic */ #include #include #include -- cgit v1.2.3 From 090e77e166334b83f555de408df64b9ab394ea08 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:48:18 -0700 Subject: kmap: consolidate kmap_prot definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most architectures define kmap_prot to be PAGE_KERNEL. Let sparc and xtensa define their own and define PAGE_KERNEL as the default if not overridden. [akpm@linux-foundation.org: coding style fixes] Suggested-by: Christoph Hellwig Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H.
Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-16-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 3 --- arch/arm/include/asm/highmem.h | 2 -- arch/csky/include/asm/highmem.h | 2 -- arch/microblaze/include/asm/highmem.h | 1 - arch/mips/include/asm/highmem.h | 2 -- arch/nds32/include/asm/highmem.h | 1 - arch/powerpc/include/asm/highmem.h | 1 - arch/sparc/include/asm/highmem.h | 3 ++- arch/sparc/mm/highmem.c | 4 ---- arch/x86/include/asm/fixmap.h | 1 - include/linux/highmem.h | 4 ++++ 11 files changed, 6 insertions(+), 18 deletions(-) diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 70900a73bfc8..6e5eafb3afdd 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -25,9 +25,6 @@ #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) #define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) -#define kmap_prot PAGE_KERNEL - - #include extern void kmap_init(void); diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index b0d4bd8dc3c1..31811be38d78 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -10,8 +10,6 @@ #define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -#define kmap_prot PAGE_KERNEL - #define flush_cache_kmaps() \ do { \ if (cache_is_vivt()) \ diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index ea2f3f39174d..14645e3d5cd5 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -38,8 +38,6 @@ extern void *kmap_atomic_pfn(unsigned long pfn); extern void kmap_init(void); -#define kmap_prot PAGE_KERNEL - #endif /* __KERNEL__ */ #endif /* __ASM_CSKY_HIGHMEM_H */ diff --git a/arch/microblaze/include/asm/highmem.h 
b/arch/microblaze/include/asm/highmem.h index d7c55cfd27bd..284ca8fb54c1 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -25,7 +25,6 @@ #include #include -#define kmap_prot PAGE_KERNEL extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 76dec0bd4f59..f1f788b57166 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -54,8 +54,6 @@ extern void *kmap_atomic_pfn(unsigned long pfn); extern void kmap_init(void); -#define kmap_prot PAGE_KERNEL - #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index a48a6536d41a..5717647d14d1 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -32,7 +32,6 @@ #define LAST_PKMAP_MASK (LAST_PKMAP - 1) #define PKMAP_NR(virt) (((virt) - (PKMAP_BASE)) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -#define kmap_prot PAGE_KERNEL static inline void flush_cache_kmaps(void) { diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 8d8ee3fcd800..104026f7d6bc 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -29,7 +29,6 @@ #include #include -#define kmap_prot PAGE_KERNEL extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index f4babe67cb5d..ddb03c04f1f3 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -25,11 +25,12 @@ #include #include #include +#include /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; -extern pgprot_t kmap_prot; +#define kmap_prot __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE) extern pte_t *pkmap_page_table; void kmap_init(void) __init; diff --git a/arch/sparc/mm/highmem.c 
b/arch/sparc/mm/highmem.c index 9309bcab4ae6..6ff6e2a9f9b3 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -32,9 +32,6 @@ #include #include -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); - static pte_t *kmap_pte; void __init kmap_init(void) @@ -51,7 +48,6 @@ void __init kmap_init(void) /* cache the first kmap pte */ kmap_pte = pte_offset_kernel(dir, address); - kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 28183ee3cc42..b9527a54db99 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -152,7 +152,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 091b32dff2d1..d6e82e3de027 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -40,6 +40,10 @@ extern void kunmap_atomic_high(void *kvaddr); static inline void kmap_flush_tlb(unsigned long addr) { } #endif +#ifndef kmap_prot +#define kmap_prot PAGE_KERNEL +#endif + void *kmap_high(struct page *page); static inline void *kmap(struct page *page) { -- cgit v1.2.3 From d4eaa2837851db2bfed572898bfc17f9a9f9151e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Jun 2020 16:48:21 -0700 Subject: mm: add kvfree_sensitive() for freeing sensitive data objects For kvmalloc'ed data object that contains sensitive information like cryptographic keys, we need to make sure that the buffer is always cleared before freeing it. Using memset() alone for buffer clearing may not provide certainty as the compiler may compile it away. To be sure, the special memzero_explicit() has to be used. 
This patch introduces a new kvfree_sensitive() for freeing those sensitive data objects allocated by kvmalloc(). The relevant places where kvfree_sensitive() can be used are modified to use it. Fixes: 4f0882491a14 ("KEYS: Avoid false positive ENOMEM error on key read") Suggested-by: Linus Torvalds Signed-off-by: Waiman Long Signed-off-by: Andrew Morton Reviewed-by: Eric Biggers Acked-by: David Howells Cc: Jarkko Sakkinen Cc: James Morris Cc: "Serge E. Hallyn" Cc: Joe Perches Cc: Matthew Wilcox Cc: David Rientjes Cc: Uladzislau Rezki Link: http://lkml.kernel.org/r/20200407200318.11711-1-longman@redhat.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/util.c | 18 ++++++++++++++++++ security/keys/internal.h | 11 ----------- security/keys/keyctl.c | 16 +++++----------- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e220ce5185ad..5bfc36320e3c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -776,6 +776,7 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) } extern void kvfree(const void *addr); +extern void kvfree_sensitive(const void *addr, size_t len); /* * Mapcount of compound page as a whole, does not include mapped sub-pages. diff --git a/mm/util.c b/mm/util.c index fd9efe6bd463..cd62e6fb5318 100644 --- a/mm/util.c +++ b/mm/util.c @@ -604,6 +604,24 @@ void kvfree(const void *addr) } EXPORT_SYMBOL(kvfree); +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. 
+ */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + static inline void *__page_rmapping(struct page *page) { unsigned long mapping; diff --git a/security/keys/internal.h b/security/keys/internal.h index 6d0ca48ae9a5..153d35c20d3d 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -350,15 +350,4 @@ static inline void key_check(const struct key *key) #define key_check(key) do {} while(0) #endif - -/* - * Helper function to clear and free a kvmalloc'ed memory object. - */ -static inline void __kvzfree(const void *addr, size_t len) -{ - if (addr) { - memset((void *)addr, 0, len); - kvfree(addr); - } -} #endif /* _INTERNAL_H */ diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 5e01192e222a..edde63a63007 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -142,10 +142,7 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type, key_ref_put(keyring_ref); error3: - if (payload) { - memzero_explicit(payload, plen); - kvfree(payload); - } + kvfree_sensitive(payload, plen); error2: kfree(description); error: @@ -360,7 +357,7 @@ long keyctl_update_key(key_serial_t id, key_ref_put(key_ref); error2: - __kvzfree(payload, plen); + kvfree_sensitive(payload, plen); error: return ret; } @@ -914,7 +911,7 @@ can_read_key: */ if (ret > key_data_len) { if (unlikely(key_data)) - __kvzfree(key_data, key_data_len); + kvfree_sensitive(key_data, key_data_len); key_data_len = ret; continue; /* Allocate buffer */ } @@ -923,7 +920,7 @@ can_read_key: ret = -EFAULT; break; } - __kvzfree(key_data, key_data_len); + kvfree_sensitive(key_data, key_data_len); key_put_out: key_put(key); @@ -1225,10 +1222,7 @@ long keyctl_instantiate_key_common(key_serial_t id, keyctl_change_reqkey_auth(NULL); error2: - if (payload) { - memzero_explicit(payload, plen); - kvfree(payload); - } + kvfree_sensitive(payload, plen); 
error: return ret; } -- cgit v1.2.3 From fa6d9ec790550b758215b6c6fa9f940878c3e2a2 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 4 Jun 2020 16:48:25 -0700 Subject: mm/memory_hotplug: refrain from adding memory into an impossible node A misbehaving qemu created a situation where the ACPI SRAT table advertised one fewer proximity domains than intended. The NFIT table did describe all the expected proximity domains. This caused the device dax driver to assign an impossible target_node to the device, and when hotplugged as system memory, this would fail with the following signature: BUG: kernel NULL pointer dereference, address: 0000000000000088 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 80000001767d4067 P4D 80000001767d4067 PUD 10e0c4067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 4 PID: 22737 Comm: kswapd3 Tainted: G O 5.6.0-rc5 #9 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:prepare_kswapd_sleep+0x7c/0xc0 Code: 89 df e8 87 fd ff ff 89 c2 31 c0 84 d2 74 e6 0f 1f 44 00 00 48 8b 05 fb af 7a 01 48 63 93 88 1d 01 00 48 8b 84 d0 20 0f 00 00 <48> 3b 98 88 00 00 00 75 28 f0 80 a0 80 00 00 00 fe f0 80 a3 38 20 RSP: 0018:ffffc900017a3e78 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff8881209e0000 RCX: 0000000000000000 RDX: 0000000000000003 RSI: 0000000000000000 RDI: ffff8881209e0e80 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000008000 R10: 0000000000000000 R11: 0000000000000003 R12: 0000000000000003 R13: 0000000000000003 R14: 0000000000000000 R15: ffffc900017a3ec8 FS: 0000000000000000(0000) GS:ffff888318c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000088 CR3: 0000000120b50002 CR4: 00000000001606e0 Call Trace: kswapd+0x103/0x520 kthread+0x120/0x140 ret_from_fork+0x3a/0x50 Add a check in the add_memory path to fail if the node to which we are adding memory is not in the node_possible_map
Signed-off-by: Vishal Verma Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Dan Williams Cc: Dave Hansen Link: http://lkml.kernel.org/r/20200416225438.15208-1-vishal.l.verma@intel.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 926ec704e835..8907426e44d9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1017,6 +1017,11 @@ int __ref add_memory_resource(int nid, struct resource *res) if (ret) return ret; + if (!node_possible(nid)) { + WARN(1, "node %d was absent from the node_possible_map\n", nid); + return -EINVAL; + } + mem_hotplug_begin(); /* -- cgit v1.2.3 From ef1b51f7735e0988ea6bf99fd6eec4698e965b91 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:28 -0700 Subject: powerpc/pseries/hotplug-memory: stop checking is_mem_section_removable() In commit 53cdc1cb29e8 ("drivers/base/memory.c: indicate all memory blocks as removable"), the user space interface to compute whether a memory block can be offlined (exposed via /sys/devices/system/memory/memoryX/removable) has effectively been deprecated. We want to remove the leftovers of the kernel implementation. When offlining a memory block (mm/memory_hotplug.c:__offline_pages()), we'll start by: 1. Testing if it contains any holes, and reject if so 2. Testing if pages belong to different zones, and reject if so 3. Isolating the page range, checking if it contains any unmovable pages Using is_mem_section_removable() before trying to offline is not only racy, it can easily result in false positives/negatives. Let's stop manually checking is_mem_section_removable(), and let device_offline() handle it completely instead. We can remove the racy is_mem_section_removable() implementation next. 
We now take more locks (e.g., memory hotplug lock when offlining and the zone lock when isolating), but maybe we should optimize that implementation instead if this ever becomes a real problem (after all, memory unplug is already an expensive operation). We started using is_mem_section_removable() in commit 51925fb3c5c9 ("powerpc/pseries: Implement memory hotplug remove in the kernel"), with the initial hotremove support of lmbs. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Nathan Fontenot Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michal Hocko Cc: Oscar Salvador Cc: Baoquan He Cc: Wei Yang Link: http://lkml.kernel.org/r/20200407135416.24093-2-david@redhat.com Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/pseries/hotplug-memory.c | 26 +++---------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index b2cde1732301..5ace2f9a277e 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -337,39 +337,19 @@ static int pseries_remove_mem_node(struct device_node *np) static bool lmb_is_removable(struct drmem_lmb *lmb) { - int i, scns_per_block; - bool rc = true; - unsigned long pfn, block_sz; - u64 phys_addr; - if (!(lmb->flags & DRCONF_MEM_ASSIGNED)) return false; - block_sz = memory_block_size_bytes(); - scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - phys_addr = lmb->base_addr; - #ifdef CONFIG_FA_DUMP /* * Don't hot-remove memory that falls in fadump boot memory area * and memory that is reserved for capturing old kernel memory. 
*/ - if (is_fadump_memory_area(phys_addr, block_sz)) + if (is_fadump_memory_area(lmb->base_addr, memory_block_size_bytes())) return false; #endif - - for (i = 0; i < scns_per_block; i++) { - pfn = PFN_DOWN(phys_addr); - if (!pfn_in_present_section(pfn)) { - phys_addr += MIN_MEMORY_BLOCK_SIZE; - continue; - } - - rc = rc && is_mem_section_removable(pfn, PAGES_PER_SECTION); - phys_addr += MIN_MEMORY_BLOCK_SIZE; - } - - return rc; + /* device_offline() will determine if we can actually remove this lmb */ + return true; } static int dlpar_add_lmb(struct drmem_lmb *); -- cgit v1.2.3 From 04f3465c98665b7c5a3484d7194f1858954069f5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:31 -0700 Subject: mm/memory_hotplug: remove is_mem_section_removable() Fortunately, all users of is_mem_section_removable() are gone. Get rid of it, including some now unnecessary functions. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Oscar Salvador Link: http://lkml.kernel.org/r/20200407135416.24093-3-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 7 ---- mm/memory_hotplug.c | 75 ------------------------------------------ 2 files changed, 82 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 93d9ada74ddd..7dca9cd6076b 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -314,19 +314,12 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE -extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern int remove_memory(int nid, u64 start, u64 size); extern void __remove_memory(int nid, u64 start, u64 size); #else -static inline bool 
is_mem_section_removable(unsigned long pfn, - unsigned long nr_pages) -{ - return false; -} - static inline void try_offline_node(int nid) {} static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8907426e44d9..bfe8cd2a685f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1113,81 +1113,6 @@ int add_memory(int nid, u64 start, u64 size) EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE -/* - * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy - * set and the size of the free page is given by page_order(). Using this, - * the function determines if the pageblock contains only free pages. - * Due to buddy contraints, a free page at least the size of a pageblock will - * be located at the start of the pageblock - */ -static inline int pageblock_free(struct page *page) -{ - return PageBuddy(page) && page_order(page) >= pageblock_order; -} - -/* Return the pfn of the start of the next active pageblock after a given pfn */ -static unsigned long next_active_pageblock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - - /* Ensure the starting page is pageblock-aligned */ - BUG_ON(pfn & (pageblock_nr_pages - 1)); - - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) { - int order; - /* be careful. we don't have locks, page_order can be changed.*/ - order = page_order(page); - if ((order < MAX_ORDER) && (order >= pageblock_order)) - return pfn + (1 << order); - } - - return pfn + pageblock_nr_pages; -} - -static bool is_pageblock_removable_nolock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - struct zone *zone; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. 
If the node is offline - its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, - MEMORY_OFFLINE); -} - -/* Checks if this range of memory is likely to be hot-removable. */ -bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long end_pfn, pfn; - - end_pfn = min(start_pfn + nr_pages, - zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); - - /* Check the starting page of each pageblock within the range */ - for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { - if (!is_pageblock_removable_nolock(pfn)) - return false; - cond_resched(); - } - - /* All pageblocks in the memory block are likely to be hot-removable */ - return true; -} - /* * Confirm all pages in a range [start, end) belong to the same zone (skipping * memory holes). When true, return the zone. -- cgit v1.2.3 From c68ab18c6aee0397574afb418f6775f23379198e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:35 -0700 Subject: mm/memory_hotplug: set node_start_pfn of hotadded pgdat to 0 Patch series "mm/memory_hotplug: handle memblocks only with CONFIG_ARCH_KEEP_MEMBLOCK", v1. A hotadded node/pgdat will span no pages at all, until memory is moved to the zone/node via move_pfn_range_to_zone() -> resize_pgdat_range - e.g., when onlining memory blocks. We don't have to initialize the node_start_pfn to the memory we are adding. This patch (of 2): Especially, there is an inconsistency: - Hotplugging memory to a memory-less node with cpus: node_start_pfn == 0 - Offlining and removing last memory from a node: node_start_pfn == 0 - Hotplugging memory to a memory-less node without cpus: node_start_pfn != 0 As soon as memory is onlined, node_start_pfn is overwritten with the actual start.
E.g., when adding two DIMMs but only onlining one of both, only that DIMM (with online memory blocks) is spanned by the node. Currently, the validity of node_start_pfn really is linked to node_spanned_pages != 0. With node_spanned_pages == 0 (e.g., before onlining memory), it has no meaning. So let's stop setting node_start_pfn, just to be overwritten via move_pfn_range_to_zone(). This avoids confusion when looking at the code, wondering which magic will be performed with the node_start_pfn in this function, when hotadding a pgdat. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Acked-by: Pankaj Gupta Cc: Michal Hocko Cc: Baoquan He Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Anshuman Khandual Cc: Mike Rapoport Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200422155353.25381-1-david@redhat.com Link: http://lkml.kernel.org/r/20200422155353.25381-2-david@redhat.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bfe8cd2a685f..ee3dcb5ed945 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -862,10 +862,9 @@ static void reset_node_present_pages(pg_data_t *pgdat) } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) +static pg_data_t __ref *hotadd_new_pgdat(int nid) { struct pglist_data *pgdat; - unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); if (!pgdat) { @@ -895,9 +894,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) } /* we can use NODE_DATA(nid) from here */ - pgdat->node_id = nid; - pgdat->node_start_pfn = start_pfn; + pgdat->node_start_pfn = 0; /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); @@ -932,7 +930,6 @@ static void rollback_node_hotadd(int nid) /** * try_online_node - online a node if offlined * @nid: the node ID - * 
@start: start addr of the node * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. * @@ -941,7 +938,7 @@ static void rollback_node_hotadd(int nid) * 0 -> the node is already online * -ENOMEM -> the node could not be allocated */ -static int __try_online_node(int nid, u64 start, bool set_node_online) +static int __try_online_node(int nid, bool set_node_online) { pg_data_t *pgdat; int ret = 1; @@ -949,7 +946,7 @@ static int __try_online_node(int nid, u64 start, bool set_node_online) if (node_online(nid)) return 0; - pgdat = hotadd_new_pgdat(nid, start); + pgdat = hotadd_new_pgdat(nid); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; @@ -973,7 +970,7 @@ int try_online_node(int nid) int ret; mem_hotplug_begin(); - ret = __try_online_node(nid, 0, true); + ret = __try_online_node(nid, true); mem_hotplug_done(); return ret; } @@ -1032,7 +1029,7 @@ int __ref add_memory_resource(int nid, struct resource *res) */ memblock_add_node(start, size, nid); - ret = __try_online_node(nid, start, false); + ret = __try_online_node(nid, false); if (ret < 0) goto error; new_node = ret; -- cgit v1.2.3 From 52219aeaf2dc6f7607704af2c40e3866fb04aed2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:38 -0700 Subject: mm/memory_hotplug: handle memblocks only with CONFIG_ARCH_KEEP_MEMBLOCK The comment in add_memory_resource() is stale: hotadd_new_pgdat() will no longer call get_pfn_range_for_nid(), as a hotadded pgdat will simply span no pages at all, until memory is moved to the zone/node via move_pfn_range_to_zone() - e.g., when onlining memory blocks. The only archs that care about memblocks for hotplugged memory (either for iterating over all system RAM or testing for memory validity) are arm64, s390x, and powerpc - due to CONFIG_ARCH_KEEP_MEMBLOCK. Without CONFIG_ARCH_KEEP_MEMBLOCK, we can simply stop messing with memblocks. 
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Acked-by: Mike Rapoport Acked-by: Michal Hocko Cc: Michal Hocko Cc: Baoquan He Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Mike Rapoport Cc: Anshuman Khandual Link: http://lkml.kernel.org/r/20200422155353.25381-3-david@redhat.com Signed-off-by: Linus Torvalds --- mm/Kconfig | 3 +++ mm/memory_hotplug.c | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index e3490ecac839..5b28240d2af8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -133,6 +133,9 @@ config HAVE_FAST_GUP depends on MMU bool +# Don't discard allocated memory used to track "memory" and "reserved" memblocks +# after early boot, so it can still be used to test for validity of memory. +# Also, memblocks are updated with memory hot(un)plug. config ARCH_KEEP_MEMBLOCK bool diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ee3dcb5ed945..21bc3363a829 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1021,13 +1021,8 @@ int __ref add_memory_resource(int nid, struct resource *res) mem_hotplug_begin(); - /* - * Add new range to memblock so that when hotadd_new_pgdat() is called - * to allocate new pgdat, get_pfn_range_for_nid() will be able to find - * this new range and calculate total pages correctly. The range will - * be removed at hot-remove time. 
- */ - memblock_add_node(start, size, nid); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_add_node(start, size, nid); ret = __try_online_node(nid, false); if (ret < 0) @@ -1076,7 +1071,8 @@ error: /* rollback pgdat allocation and others */ if (new_node) rollback_node_hotadd(nid); - memblock_remove(start, size); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_remove(start, size); mem_hotplug_done(); return ret; } @@ -1673,8 +1669,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); arch_remove_memory(nid, start, size, NULL); - memblock_free(start, size); - memblock_remove(start, size); + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + memblock_free(start, size); + memblock_remove(start, size); + } + __release_memory_resource(start, size); try_offline_node(nid); -- cgit v1.2.3 From 7b7b27214bba1966772f9213cd2d8e5d67f8487f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:41 -0700 Subject: mm/memory_hotplug: introduce add_memory_driver_managed() Patch series "mm/memory_hotplug: Interface to add driver-managed system ram", v4. kexec (via kexec_load()) can currently not properly handle memory added via dax/kmem, and will have similar issues with virtio-mem. kexec-tools will currently add all memory to the fixed-up initial firmware memmap. In case of dax/kmem, this means that - in contrast to a proper reboot - how that persistent memory will be used can no longer be configured by the kexec'd kernel. In case of virtio-mem it will be harmful, because that memory might contain inaccessible pieces that require coordination with hypervisor first. In both cases, we want to let the driver in the kexec'd kernel handle detecting and adding the memory, like during an ordinary reboot. Introduce add_memory_driver_managed(). More on the semantics are in patch #1.
In the future, we might want to make this behavior configurable for dax/kmem- either by configuring it in the kernel (which would then also allow to configure kexec_file_load()) or in kexec-tools by also adding "System RAM (kmem)" memory from /proc/iomem to the fixed-up initial firmware memmap. More on the motivation can be found in [1] and [2]. [1] https://lkml.kernel.org/r/20200429160803.109056-1-david@redhat.com [2] https://lkml.kernel.org/r/20200430102908.10107-1-david@redhat.com This patch (of 3): Some device drivers rely on memory they managed to not get added to the initial (firmware) memmap as system RAM - so it's not used as initial system RAM by the kernel and the driver is under control. While this is the case during cold boot and after a reboot, kexec is not aware of that and might add such memory to the initial (firmware) memmap of the kexec kernel. We need ways to teach kernel and userspace that this system ram is different. For example, dax/kmem allows to decide at runtime if persistent memory is to be used as system ram. Another future user is virtio-mem, which has to coordinate with its hypervisor to deal with inaccessible parts within memory resources. We want to let users in the kernel (esp. kexec) but also user space (esp. kexec-tools) know that this memory has different semantics and needs to be handled differently: 1. Don't create entries in /sys/firmware/memmap/ 2. Name the memory resource "System RAM ($DRIVER)" (exposed via /proc/iomem) ($DRIVER might be "kmem", "virtio_mem"). 3. Flag the memory resource IORESOURCE_MEM_DRIVER_MANAGED /sys/firmware/memmap/ [1] represents the "raw firmware-provided memory map" because "on most architectures that firmware-provided memory map is modified afterwards by the kernel itself". The primary user is kexec on x86-64. 
Since commit d96ae5309165 ("memory-hotplug: create /sys/firmware/memmap entry for new memory"), we add all hotplugged memory to that firmware memmap - which makes perfect sense for traditional memory hotplug on x86-64, where real HW will also add hotplugged DIMMs to the firmware memmap. We replicate what the "raw firmware-provided memory map" looks like after hot(un)plug. To keep things simple, let the user provide the full resource name instead of only the driver name - this way, we don't have to manually allocate/craft strings for memory resources. Also use the resource name to make decisions, to avoid passing additional flags. In case the name isn't "System RAM", it's special. We don't have to worry about firmware_map_remove() on the removal path. If there is no entry, it will simply return with -EINVAL. We'll adapt dax/kmem in a follow-up patch. [1] https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-firmware-memmap Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Acked-by: Pankaj Gupta Cc: Michal Hocko Cc: Pankaj Gupta Cc: Wei Yang Cc: Baoquan He Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Tatashin Cc: Dan Williams Link: http://lkml.kernel.org/r/20200508084217.9160-1-david@redhat.com Link: http://lkml.kernel.org/r/20200508084217.9160-3-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 1 + include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c | 62 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index a9b9170b5dd2..cc9a5b4593ca 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -103,6 +103,7 @@ struct resource { #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) +#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7dca9cd6076b..fee7fab5d706 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -342,6 +342,8 @@ extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); +extern int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern void remove_pfn_range_from_zone(struct zone *zone, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 21bc3363a829..c82722c3fe32 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -98,11 +98,14 @@ void mem_hotplug_done(void) u64 max_mem_size = U64_MAX; /* add this memory to iomem resource */ -static struct resource *register_memory_resource(u64 start, u64 size) +static struct resource *register_memory_resource(u64 start, u64 size, + const char *resource_name) { struct resource *res; unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - char *resource_name = "System RAM"; + + if (strcmp(resource_name, "System RAM")) + flags |= IORESOURCE_MEM_DRIVER_MANAGED; /* * Make sure value parsed from 'mem=' only restricts memory adding @@ -1057,7 +1060,8 @@ int __ref add_memory_resource(int nid, struct resource *res) BUG_ON(ret); /* create new memmap entry */ - firmware_map_add_hotplug(start, start + size, "System RAM"); + if (!strcmp(res->name, "System RAM")) + firmware_map_add_hotplug(start, start + size, "System RAM"); /* device_online() will take the lock when calling online_pages() */ mem_hotplug_done(); @@ -1083,7 +1087,7 @@ int __ref __add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; - res = register_memory_resource(start, size); + res = 
register_memory_resource(start, size, "System RAM"); if (IS_ERR(res)) return PTR_ERR(res); @@ -1105,6 +1109,56 @@ int add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(add_memory); +/* + * Add special, driver-managed memory to the system as system RAM. Such + * memory is not exposed via the raw firmware-provided memmap as system + * RAM, instead, it is detected and added by a driver - during cold boot, + * after a reboot, and after kexec. + * + * Reasons why this memory should not be used for the initial memmap of a + * kexec kernel or for placing kexec images: + * - The booting kernel is in charge of determining how this memory will be + * used (e.g., use persistent memory as system RAM) + * - Coordination with a hypervisor is required before this memory + * can be used (e.g., inaccessible parts). + * + * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided + * memory map") are created. Also, the created memory resource is flagged + * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case + * this memory as well (esp., not place kexec images onto it). + * + * The resource_name (visible via /proc/iomem) has to have the format + * "System RAM ($DRIVER)". 
+ */ +int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name) +{ + struct resource *res; + int rc; + + if (!resource_name || + strstr(resource_name, "System RAM (") != resource_name || + resource_name[strlen(resource_name) - 1] != ')') + return -EINVAL; + + lock_device_hotplug(); + + res = register_memory_resource(start, size, resource_name); + if (IS_ERR(res)) { + rc = PTR_ERR(res); + goto out_unlock; + } + + rc = add_memory_resource(nid, res); + if (rc < 0) + release_memory_resource(res); + +out_unlock: + unlock_device_hotplug(); + return rc; +} +EXPORT_SYMBOL_GPL(add_memory_driver_managed); + #ifdef CONFIG_MEMORY_HOTREMOVE /* * Confirm all pages in a range [start, end) belong to the same zone (skipping -- cgit v1.2.3 From 3fe4f4991a2a818277445bd5b8b289305b7dd15d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:44 -0700 Subject: kexec_file: don't place kexec images on IORESOURCE_MEM_DRIVER_MANAGED Memory flagged with IORESOURCE_MEM_DRIVER_MANAGED is special - it won't be part of the initial memmap of the kexec kernel and not all memory might be accessible. Don't place any kexec images onto it. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Pankaj Gupta Cc: Wei Yang Cc: Baoquan He Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Tatashin Cc: Dan Williams Link: http://lkml.kernel.org/r/20200508084217.9160-4-david@redhat.com Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index faa74d5f6941..bb05fd52de85 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -540,6 +540,11 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) unsigned long sz = end - start + 1; /* Returning 0 will take to next memory range */ + + /* Don't use memory that will be detected and handled by a driver. 
*/ + if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + return 0; + if (sz < kbuf->memsz) return 0; -- cgit v1.2.3 From 8a725e4694b52ffad755500277d36f3b2eb34755 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:48 -0700 Subject: device-dax: add memory via add_memory_driver_managed() Currently, when adding memory, we create entries in /sys/firmware/memmap/ as "System RAM". This will lead to kexec-tools to add that memory to the fixed-up initial memmap for a kexec kernel (loaded via kexec_load()). The memory will be considered initial System RAM by the kexec'd kernel and can no longer be reconfigured. This is not what happens during a real reboot. Let's add our memory via add_memory_driver_managed() now, so we won't create entries in /sys/firmware/memmap/ and indicate the memory as "System RAM (kmem)" in /proc/iomem. This allows everybody (especially kexec-tools) to identify that this memory is special and has to be treated differently than ordinary (hotplugged) System RAM. Before configuring the namespace: [root@localhost ~]# cat /proc/iomem ... 140000000-33fffffff : Persistent Memory 140000000-33fffffff : namespace0.0 3280000000-32ffffffff : PCI Bus 0000:00 After configuring the namespace: [root@localhost ~]# cat /proc/iomem ... 140000000-33fffffff : Persistent Memory 140000000-1481fffff : namespace0.0 148200000-33fffffff : dax0.0 3280000000-32ffffffff : PCI Bus 0000:00 After loading kmem before this change: [root@localhost ~]# cat /proc/iomem ... 140000000-33fffffff : Persistent Memory 140000000-1481fffff : namespace0.0 150000000-33fffffff : dax0.0 150000000-33fffffff : System RAM 3280000000-32ffffffff : PCI Bus 0000:00 After loading kmem after this change: [root@localhost ~]# cat /proc/iomem ... 140000000-33fffffff : Persistent Memory 140000000-1481fffff : namespace0.0 150000000-33fffffff : dax0.0 150000000-33fffffff : System RAM (kmem) 3280000000-32ffffffff : PCI Bus 0000:00 After a proper reboot: [root@localhost ~]# cat /proc/iomem ... 
140000000-33fffffff : Persistent Memory 140000000-1481fffff : namespace0.0 148200000-33fffffff : dax0.0 3280000000-32ffffffff : PCI Bus 0000:00 Within the kexec kernel before this change: [root@localhost ~]# cat /proc/iomem ... 140000000-33fffffff : Persistent Memory 140000000-1481fffff : namespace0.0 150000000-33fffffff : System RAM 3280000000-32ffffffff : PCI Bus 0000:00 Within the kexec kernel after this change: [root@localhost ~]# cat /proc/iomem ... 140000000-33fffffff : Persistent Memory 140000000-1481fffff : namespace0.0 148200000-33fffffff : dax0.0 3280000000-32ffffffff : PCI Bus 0000:00 /sys/firmware/memmap/ before this change: 0000000000000000-000000000009fc00 (System RAM) 000000000009fc00-00000000000a0000 (Reserved) 00000000000f0000-0000000000100000 (Reserved) 0000000000100000-00000000bffdf000 (System RAM) 00000000bffdf000-00000000c0000000 (Reserved) 00000000feffc000-00000000ff000000 (Reserved) 00000000fffc0000-0000000100000000 (Reserved) 0000000100000000-0000000140000000 (System RAM) 0000000150000000-0000000340000000 (System RAM) /sys/firmware/memmap/ after a proper reboot: 0000000000000000-000000000009fc00 (System RAM) 000000000009fc00-00000000000a0000 (Reserved) 00000000000f0000-0000000000100000 (Reserved) 0000000000100000-00000000bffdf000 (System RAM) 00000000bffdf000-00000000c0000000 (Reserved) 00000000feffc000-00000000ff000000 (Reserved) 00000000fffc0000-0000000100000000 (Reserved) 0000000100000000-0000000140000000 (System RAM) /sys/firmware/memmap/ after this change: 0000000000000000-000000000009fc00 (System RAM) 000000000009fc00-00000000000a0000 (Reserved) 00000000000f0000-0000000000100000 (Reserved) 0000000000100000-00000000bffdf000 (System RAM) 00000000bffdf000-00000000c0000000 (Reserved) 00000000feffc000-00000000ff000000 (Reserved) 00000000fffc0000-0000000100000000 (Reserved) 0000000100000000-0000000140000000 (System RAM) kexec-tools already seem to basically ignore any System RAM that's not on top level when searching for areas to place kexec 
images - but also for determining crash areas to dump via kdump. Changing the resource name won't have an impact. Handle unloading of the driver after memory hotremove failed properly, by duplicating the string if necessary. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Acked-by: Pankaj Gupta Cc: Michal Hocko Cc: Pankaj Gupta Cc: Wei Yang Cc: Baoquan He Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Tatashin Cc: Dan Williams Link: http://lkml.kernel.org/r/20200508084217.9160-5-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/dax/dax-private.h | 1 + drivers/dax/kmem.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 3107ce80e809..16850d5388ab 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -44,6 +44,7 @@ struct dax_region { * @dev - device core * @pgmap - pgmap for memmap setup / lifetime (driver owned) * @dax_mem_res: physical address range of hotadded DAX memory + * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed() */ struct dev_dax { struct dax_region *region; diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 1e678bdf5aed..275aa5f87399 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -14,6 +14,11 @@ #include "dax-private.h" #include "bus.h" +/* Memory resource name used for add_memory_driver_managed(). */ +static const char *kmem_name; +/* Set if any memory will remain added when the driver will be unloaded. */ +static bool any_hotremove_failed; + int dev_dax_kmem_probe(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -70,7 +75,12 @@ int dev_dax_kmem_probe(struct device *dev) */ new_res->flags = IORESOURCE_SYSTEM_RAM; - rc = add_memory(numa_node, new_res->start, resource_size(new_res)); + /* + * Ensure that future kexec'd kernels will not treat this as RAM + * automatically. 
+ */ + rc = add_memory_driver_managed(numa_node, new_res->start, + resource_size(new_res), kmem_name); if (rc) { release_resource(new_res); kfree(new_res); @@ -100,6 +110,7 @@ static int dev_dax_kmem_remove(struct device *dev) */ rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size); if (rc) { + any_hotremove_failed = true; dev_err(dev, "DAX region %pR cannot be hotremoved until the next reboot\n", res); @@ -124,6 +135,7 @@ static int dev_dax_kmem_remove(struct device *dev) * permanently pinned as reserved by the unreleased * request_mem_region(). */ + any_hotremove_failed = true; return 0; } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -137,12 +149,24 @@ static struct dax_device_driver device_dax_kmem_driver = { static int __init dax_kmem_init(void) { - return dax_driver_register(&device_dax_kmem_driver); + int rc; + + /* Resource name is permanently allocated if any hotremove fails. */ + kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL); + if (!kmem_name) + return -ENOMEM; + + rc = dax_driver_register(&device_dax_kmem_driver); + if (rc) + kfree_const(kmem_name); + return rc; } static void __exit dax_kmem_exit(void) { dax_driver_unregister(&device_dax_kmem_driver); + if (!any_hotremove_failed) + kfree_const(kmem_name); } MODULE_AUTHOR("Intel Corporation"); -- cgit v1.2.3 From b59d02ed086907e2d4ea3ecf0720f0cbd54a2601 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 4 Jun 2020 16:48:51 -0700 Subject: mm/memory_hotplug: disable the functionality for 32b Memory hotplug is broken for 32b systems at least since c6f03e2903c9 ("mm, memory_hotplug: remove zone restrictions") which has considerably reworked how memory can be associated with movable/kernel zones. The same is not really trivial to achieve in 32b where only lowmem is the kernel zone. While we can tweak this immediate problem around there are likely other land mines hidden at other places. It is also quite dubious that there is a real usecase for the memory hotplug on 32b in the first place.
Low memory is just too small to be hotplugable (for hot add) and generally unusable for hotremove. Adding more memory to highmem is also dubious because it would increase the low mem or vmalloc space pressure for memmaps. Restrict the functionality to 64b systems. This will help future development to focus on usecases that have real life application. We can remove this restriction in future in presence of a real life usecase of course but until then make it explicit that hotplug on 32b is broken and requires a non trivial amount of work to fix. Robin said: "32-bit Arm doesn't support memory hotplug, and as far as I'm aware there's little likelihood of it ever wanting to. FWIW it looks like SuperH is the only pure-32-bit architecture to have hotplug support at all" Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Acked-by: Baoquan He Cc: Wei Yang Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Robin Murphy Cc: Vamshi K Sthambamkadi Link: http://lkml.kernel.org/r/20200218100532.GA4151@dhcp22.suse.cz Link: https://bugzilla.kernel.org/show_bug.cgi?id=206401 Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index 5b28240d2af8..cffc276fa19c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -158,6 +158,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG + depends on 64BIT || BROKEN select NUMA_KEEP_MEMINFO if NUMA config MEMORY_HOTPLUG_SPARSE -- cgit v1.2.3 From 57e86fa16a707db9e05dec77eb88a398f05a022b Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Thu, 4 Jun 2020 16:48:55 -0700 Subject: mm: replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; 
struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") [akpm@linux-foundation.org: fix build] Signed-off-by: chenqiwu Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: Wei Yang Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Michal Hocko Cc: Pankaj Gupta Cc: Yang Shi Cc: Qian Cai Cc: Baoquan He Link: http://lkml.kernel.org/r/1586599916-15456-1-git-send-email-qiwuchen55@gmail.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5bfc36320e3c..1744081a34d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1726,7 +1726,7 @@ struct frame_vector { unsigned int nr_frames; /* Number of frames stored in ptrs array */ bool got_ref; /* Did we pin pages by getting page ref? */ bool is_pfns; /* Does array contain pages or pfns? */ - void *ptrs[0]; /* Array of pinned pfns / pages. Use + void *ptrs[]; /* Array of pinned pfns / pages. 
Use * pfns_vector_pages() or pfns_vector_pfns() * for access */ }; -- cgit v1.2.3 From 52cfc24578c32aa4d604fcb38e081ed0eb4cfb5c Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:48:58 -0700 Subject: mm/memory_hotplug: fix a typo in comment "recoreded"->"recorded" There is a typo in comment, fix it. s/recoreded/recorded Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: Ralph Campbell Link: http://lkml.kernel.org/r/20200410160328.13843-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c82722c3fe32..c4d5c45820d0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1337,7 +1337,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, } /* - * Check all pages in range, recoreded as memory resource, are isolated. + * Check all pages in range, recorded as memory resource, are isolated. */ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, -- cgit v1.2.3 From 457aef949de9efe9cf96a2452f1f372bc158f777 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:01 -0700 Subject: mm: ksm: fix a typo in comment "alreaady"->"already" There is a typo in comment, fix it. Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: Ralph Campbell Link: http://lkml.kernel.org/r/20200410162427.13927-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/ksm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 281c00129a2e..18c5d005bd01 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -612,7 +612,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, * Move the old stable node to the second dimension * queued in the hlist_dup. 
The invariant is that all * dup stable_nodes in the chain->hlist point to pages - * that are wrprotected and have the exact same + * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); @@ -1148,7 +1148,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, /* * No need to check ksm_use_zero_pages here: we can only have a - * zero_page here if ksm_use_zero_pages was enabled alreaady. + * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { get_page(kpage); @@ -1608,7 +1608,7 @@ again: * continue. All KSM pages belonging to the * stable_node dups in a stable_node chain * have the same content and they're - * wrprotected at all times. Any will work + * write protected at all times. Any will work * fine to continue the walk. */ tree_page = get_ksm_page(stable_node_any, @@ -1843,7 +1843,7 @@ again: * continue. All KSM pages belonging to the * stable_node dups in a stable_node chain * have the same content and they're - * wrprotected at all times. Any will work + * write protected at all times. Any will work * fine to continue the walk. */ tree_page = get_ksm_page(stable_node_any, @@ -2001,7 +2001,7 @@ static void stable_tree_append(struct rmap_item *rmap_item, * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check - * for other negative values as an undeflow if detected here + * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ -- cgit v1.2.3 From b4f315b40d43aec9234556ce678e3c3469cc838e Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:04 -0700 Subject: mm: mmap: fix a typo in comment "compatbility"->"compatibility" There is a typo in comment, fix it. 
Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: Ralph Campbell Link: http://lkml.kernel.org/r/20200410163206.14016-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index f609e9ec4a25..39bd60c20a82 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1207,7 +1207,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } /* - * Rough compatbility check to quickly see if it's even worth looking + * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ -- cgit v1.2.3 From 7c8de3588972eddc0b4fb6f71be470b12b171d9d Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:07 -0700 Subject: mm/hugetlb: fix a typos in comments [akpm@linux-foundation.org: coding style fixes] Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: Ralph Campbell Link: http://lkml.kernel.org/r/20200410163714.14085-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ac0d7bbc0692..dcb34d7f5562 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -85,7 +85,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) spin_unlock(&spool->lock); /* If no pages are used, and no other handles to the subpool - * remain, give up any reservations mased on minimum size and + * remain, give up any reservations based on minimum size and * free the subpool */ if (free) { if (spool->min_hpages != -1) @@ -133,7 +133,7 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) * the request. Otherwise, return the number of pages by which the * global pools must be adjusted (upward). 
The returned value may * only be different than the passed value (delta) in the case where - * a subpool minimum size must be manitained. + * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) @@ -473,7 +473,7 @@ out_of_memory: * * Return the number of new huge pages added to the map. This number is greater * than or equal to zero. If file_region entries needed to be allocated for - * this operation and we were not able to allocate, it ruturns -ENOMEM. + * this operation and we were not able to allocate, it returns -ENOMEM. * region_add of regions of length 1 never allocate file_regions and cannot * fail; region_chg will always allocate at least 1 entry and a region_add for * 1 page will only require at most 1 entry. @@ -988,7 +988,7 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) * We know VM_NORESERVE is not set. Therefore, there SHOULD * be a region map for all pages. The only situation where * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reverves to + * fallocate. In this case, there really are no reserves to * use. This situation is indicated if chg != 0. */ if (chg) @@ -1519,7 +1519,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic * hugepages and clear the PG_reserved bit from all tail pages - * too. Otherwse drivers using get_user_pages() to access tail + * too. Otherwise drivers using get_user_pages() to access tail * pages may get the reference counting wrong if they see * PG_reserved set on a tail page (despite the head page not * having PG_reserved set). 
Enforcing this consistency between @@ -4579,9 +4579,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * entry could be a migration/hwpoison entry at this point, so this * check prevents the kernel from going below assuming that we have - * a active hugepage in pagecache. This goto expects the 2nd page fault, - * and is_hugetlb_entry_(migration|hwpoisoned) check will properly - * handle it. + * an active hugepage in pagecache. This goto expects the 2nd page + * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will + * properly handle it. */ if (!pte_present(entry)) goto out_mutex; -- cgit v1.2.3 From 55b65a57c2542165933611a7abb6e22b797acac0 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:10 -0700 Subject: mm/vmsan: fix some typos in comment There are some typos, fix them. s/regsitration/registration s/santity/sanity s/decremeting/decrementing Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: Ralph Campbell Link: http://lkml.kernel.org/r/20200411071544.16222-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3792dd19788c..b6d84326bdf2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -682,7 +682,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, freed += ret; /* * Bail out if someone want to register a new shrinker to - * prevent the regsitration from being stalled for long periods + * prevent the registration from being stalled for long periods * by parallel ongoing shrinking. */ if (rwsem_is_contended(&shrinker_rwsem)) { @@ -1613,7 +1613,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) /* * Update LRU sizes after isolating pages. The LRU size updates must - * be complete before mem_cgroup_update_lru_size due to a santity check. + * be complete before mem_cgroup_update_lru_size due to a sanity check. 
*/ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsigned long *nr_zone_taken) @@ -2371,7 +2371,7 @@ out: /* * Minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards, avoiding decremeting + * reclaim moving forwards, avoiding decrementing * sc->priority further than desirable. */ scan = max(scan, SWAP_CLUSTER_MAX); -- cgit v1.2.3 From f386775510bf35d8ad28ad77e8d82524957abde7 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:13 -0700 Subject: mm/compaction: fix a typo in comment "pessemistic"->"pessimistic" There is a typo in comment, fix it. Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Reviewed-by: Ralph Campbell Link: http://lkml.kernel.org/r/20200411070307.16021-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 14d2fe231ea4..fd988b7e5f2b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1401,7 +1401,7 @@ fast_isolate_freepages(struct compact_control *cc) if (scan_start) { /* * Use the highest PFN found above min. If one was - * not found, be pessemistic for direct compaction + * not found, be pessimistic for direct compaction * and use the min mark. */ if (highest) { -- cgit v1.2.3 From df1758d9f28539091b9b9636b8485697b043b80b Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:16 -0700 Subject: mm/memblock: fix a typo in comment "implict"->"implicit" There is a typo in comment, fix it.
Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Reviewed-by: Ralph Campbell Link: http://lkml.kernel.org/r/20200411070701.16097-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index 743659d88fc4..39aceafc57f6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -78,7 +78,7 @@ * * memblock_alloc*() - these functions return the **virtual** address * of the allocated memory. * - * Note, that both API variants use implict assumptions about allowed + * Note, that both API variants use implicit assumptions about allowed * memory ranges and the fallback methods. Consult the documentation * of memblock_alloc_internal() and memblock_alloc_range_nid() * functions for more elaborate description. -- cgit v1.2.3 From 3dc5f032c4baae1b41977fab649c104015e3dcff Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:19 -0700 Subject: mm/list_lru: fix a typo in comment "numbesr"->"numbers" There is a typo in comment, fix it. Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Reviewed-by: Ralph Campbell Link: http://lkml.kernel.org/r/20200411071041.16161-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/list_lru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 4d5294c39bba..9222910ab1cb 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -213,7 +213,7 @@ restart: /* * decrement nr_to_walk first so that we don't livelock if we - * get stuck on large numbesr of LRU_RETRY items + * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; -- cgit v1.2.3 From ffceeb62fce4b819f295bc724b413d62364f8d16 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:22 -0700 Subject: mm/filemap: fix a typo in comment "unneccssary"->"unnecessary" There is a typo in comment, fix it. 
Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Reviewed-by: Ralph Campbell Link: http://lkml.kernel.org/r/20200411065141.15936-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 455990621989..b1a41890d80e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1256,7 +1256,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); * instead. * * The read of PG_waiters has to be after (or concurrently with) PG_locked - * being cleared, but a memory barrier should be unneccssary since it is + * being cleared, but a memory barrier should be unnecessary since it is * in the same byte as PG_locked. */ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) -- cgit v1.2.3 From 404f3ecfd86b14c2087901f11360d70bea05523e Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:25 -0700 Subject: mm/frontswap: fix some typos in frontswap.c There are some typos in comment, fix them. s/Fortunatly/Fortunately s/taked/taken s/necessory/necessary s/shink/shrink Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200411064009.15727-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/frontswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/frontswap.c b/mm/frontswap.c index 60bb20e8a951..bfa3a339253e 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -87,7 +87,7 @@ static inline void inc_frontswap_invalidates(void) { } * * This would not guards us against the user deciding to call swapoff right as * we are calling the backend to initialize (so swapon is in action). - * Fortunatly for us, the swapon_mutex has been taked by the callee so we are + * Fortunately for us, the swapon_mutex has been taken by the callee so we are * OK. 
The other scenario where calls to frontswap_store (called via * swap_writepage) is racing with frontswap_invalidate_area (called via * swapoff) is again guarded by the swap subsystem. @@ -413,8 +413,8 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, } /* - * Used to check if it's necessory and feasible to unuse pages. - * Return 1 when nothing to do, 0 when need to shink pages, + * Used to check if it's necessary and feasible to unuse pages. + * Return 1 when nothing to do, 0 when need to shrink pages, * error code when there is an error. */ static int __frontswap_shrink(unsigned long target_pages, -- cgit v1.2.3 From b8f2935f72448940b1ea8f5caf7fa984a4775fbc Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:28 -0700 Subject: mm, memcg: fix some typos in memcontrol.c There are some typos in comment, fix them. s/responsiblity/responsibility s/oflline/offline Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200411064246.15781-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5381afb23d58..3dde78f5b918 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3186,7 +3186,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, * Test whether @memcg has children, dead or alive. Note that this * function doesn't care whether @memcg has use_hierarchy enabled and * returns %true if there are child csses according to the cgroup - * hierarchy. Testing use_hierarchy is the caller's responsiblity. + * hierarchy. Testing use_hierarchy is the caller's responsibility. */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { @@ -4838,7 +4838,7 @@ static struct cftype mem_cgroup_legacy_files[] = { * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of * memory-controlled cgroups to 64k. 
* - * However, there usually are many references to the oflline CSS after + * However, there usually are many references to the offline CSS after * the cgroup has been destroyed, such as page cache or reclaimable * slab objects, that don't need to hang on to the ID. We want to keep * those dead CSS from occupying IDs, or we might quickly exhaust the -- cgit v1.2.3 From 68956ccb6ca96bd8873c1a0d30d9749094090922 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:31 -0700 Subject: mm: fix a typo in comment "strucure"->"structure" There is a typo in comment, fix it. Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Reviewed-by: Ralph Campbell Link: http://lkml.kernel.org/r/20200411064723.15855-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 9117bca90f4b..791e4b5a807c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -132,7 +132,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages_nodemask() for the fast path, and might be later changed - * in __alloc_pages_slowpath(). All other functions pass the whole strucure + * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { -- cgit v1.2.3 From 0d645ed19cf9452827d69b733beeff58ed32ea56 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:34 -0700 Subject: mm/slub: fix a typo in comment "disambiguiation"->"disambiguation" There is a typo in comment, fix it. 
Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Acked-by: David Rientjes Link: http://lkml.kernel.org/r/20200411002247.14468-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index d52487919278..b8f798b50d44 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2013,7 +2013,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, #ifdef CONFIG_PREEMPTION /* - * Calculate the next globally unique transaction for disambiguiation + * Calculate the next globally unique transaction for disambiguation * during cmpxchg. The transactions start with the cpu number and are then * incremented by CONFIG_NR_CPUS. */ -- cgit v1.2.3 From 2e6787d380620e87b7d0ccbc0e52f7024a49efd1 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:37 -0700 Subject: mm/sparse: fix a typo in comment "convienence"->"convenience" There is a typo in comment, fix it. Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200411002955.14545-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index 1aee5a481571..6284328cd9f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -288,7 +288,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) /* * Mark all memblocks as present using memory_present(). This is a - * convienence function that is useful for a number of arches + * convenience function that is useful for a number of arches * to mark all of the systems memory as present during initialization. */ void __init memblocks_present(void) -- cgit v1.2.3 From e0857cf5ac107b663f1b43ac8e02fdcd9284ad72 Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:40 -0700 Subject: mm/page-writeback: fix a typo in comment "effictive"->"effective" There is a typo in comment, fix it. 
Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200411003513.14613-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d79ed1f88c7a..28b3e7a67565 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -257,7 +257,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, * requiring writeback. * * This number of dirtyable pages is the base value of which the - * user-configurable dirty ratio is the effictive number of pages that + * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * -- cgit v1.2.3 From 985ba004be29b857f3475eea961af5ac6ea5fa4d Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Thu, 4 Jun 2020 16:49:43 -0700 Subject: mm/memory: fix a typo in comment "attampt"->"attempt" There is a typo in comment, fix it. Signed-off-by: Ethon Paul Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200411004043.14686-1-ethp@qq.com Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 60c279295fce..d97e8848892d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2467,7 +2467,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, } /* - * The same page can be mapped back since last copy attampt. + * The same page can be mapped back since last copy attempt. * Try to copy again under PTL.
*/ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { -- cgit v1.2.3 From fa1f68cc88f1dce6b7bb37628eb7c25c96593183 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Thu, 4 Jun 2020 16:49:46 -0700 Subject: mm: use false for bool variable Fixes coccicheck warnings: mm/zbud.c:246:1-20: WARNING: Assignment of 0/1 to bool variable mm/mremap.c:777:2-8: WARNING: Assignment of 0/1 to bool variable mm/huge_memory.c:525:9-10: WARNING: return of 0/1 in function 'is_transparent_hugepage' with return type bool Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/1586835930-47076-1-git-send-email-zou_wei@huawei.com Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- mm/mremap.c | 2 +- mm/zbud.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e8669885232f..4368e964d2aa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -522,7 +522,7 @@ void prep_transhuge_page(struct page *page) bool is_transparent_hugepage(struct page *page) { if (!PageCompound(page)) - return 0; + return false; page = compound_head(page); return is_huge_zero_page(page) || diff --git a/mm/mremap.c b/mm/mremap.c index 6aa6ea605068..dccb7a4471a0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -785,7 +785,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, out: if (offset_in_page(ret)) { vm_unacct_memory(charged); - locked = 0; + locked = false; } if (downgraded) up_read(&current->mm->mmap_sem); diff --git a/mm/zbud.c b/mm/zbud.c index de5dd4ddaa82..bc93aa4e46fc 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -243,7 +243,7 @@ static struct zbud_header *init_zbud_page(struct page *page) zhdr->last_chunks = 0; INIT_LIST_HEAD(&zhdr->buddy); INIT_LIST_HEAD(&zhdr->lru); - zhdr->under_reclaim = 0; + zhdr->under_reclaim = false; return zhdr; } -- cgit v1.2.3 From 2b7874490243e014112100925405c4a17a8c40aa Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 4 Jun
2020 16:49:49 -0700 Subject: include/linux/mm.h: return true in cpupid_pid_unset() Fix the following coccicheck warning: include/linux/mm.h:1371:8-9: WARNING: return of 0/1 in function 'cpupid_pid_unset' with return type bool Signed-off-by: Jason Yan Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200422071816.48879-1-yanaijie@huawei.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1744081a34d4..86adc71a972f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1373,7 +1373,7 @@ static inline int cpu_pid_to_cpupid(int nid, int pid) static inline bool cpupid_pid_unset(int cpupid) { - return 1; + return true; } static inline void page_cpupid_reset_last(struct page *page) -- cgit v1.2.3 From 276aa42e9ff3a9dcea6a91d515916c653c5f9c6d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 4 Jun 2020 16:49:52 -0700 Subject: zcomp: Use ARRAY_SIZE() for backends list Instead of keeping NULL terminated array switch to use ARRAY_SIZE() which helps to further clean up. 
Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Acked-by: Minchan Kim Cc: Sergey Senozhatsky Cc: Jens Axboe Cc: Andy Shevchenko Link: http://lkml.kernel.org/r/20200508100758.51644-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5ee8e3fae551..33e3b76c4fa9 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -29,7 +29,6 @@ static const char * const backends[] = { #if IS_ENABLED(CONFIG_CRYPTO_ZSTD) "zstd", #endif - NULL }; static void zcomp_strm_free(struct zcomp_strm *zstrm) @@ -64,7 +63,7 @@ bool zcomp_available_algorithm(const char *comp) { int i; - i = __sysfs_match_string(backends, -1, comp); + i = sysfs_match_string(backends, comp); if (i >= 0) return true; @@ -83,9 +82,9 @@ ssize_t zcomp_available_show(const char *comp, char *buf) { bool known_algorithm = false; ssize_t sz = 0; - int i = 0; + int i; - for (; backends[i]; i++) { + for (i = 0; i < ARRAY_SIZE(backends); i++) { if (!strcmp(comp, backends[i])) { known_algorithm = true; sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, -- cgit v1.2.3 From 8977a27b663ebb3bc745a1e7f91b2927b4522861 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 4 Jun 2020 16:49:55 -0700 Subject: proc: rename "catch" function argument "catch" is reserved keyword in C++, rename it to something both gcc and g++ accept. Rename "ign" for symmetry. 
Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200331210905.GA31680@avx2 Signed-off-by: Linus Torvalds --- fs/proc/array.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 8e16f14bb05a..c4ac25fdb230 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -248,8 +248,8 @@ void render_sigset_t(struct seq_file *m, const char *header, seq_putc(m, '\n'); } -static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, - sigset_t *catch) +static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign, + sigset_t *sigcatch) { struct k_sigaction *k; int i; @@ -257,9 +257,9 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, k = p->sighand->action; for (i = 1; i <= _NSIG; ++i, ++k) { if (k->sa.sa_handler == SIG_IGN) - sigaddset(ign, i); + sigaddset(sigign, i); else if (k->sa.sa_handler != SIG_DFL) - sigaddset(catch, i); + sigaddset(sigcatch, i); } } -- cgit v1.2.3 From de83dbd97f173650a602c5e356025b732173ecc4 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 4 Jun 2020 16:49:58 -0700 Subject: user.c: make uidhash_table static Fix the following sparse warning: kernel/user.c:85:19: warning: symbol 'uidhash_table' was not declared. Should it be static? 
Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Andrew Morton Cc: David Howells Cc: Greg Kroah-Hartman Cc: Rasmus Villemoes Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200413082146.22737-1-yanaijie@huawei.com Signed-off-by: Linus Torvalds --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/user.c b/kernel/user.c index 5235d7f49982..b1635d94a1f2 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(init_user_ns); #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; -struct hlist_head uidhash_table[UIDHASH_SZ]; +static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is -- cgit v1.2.3 From 0c78c013762142bfe8fce34e7e968f83f0a4b891 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 4 Jun 2020 16:50:01 -0700 Subject: get_maintainer: add email addresses from .yaml files .yaml files can contain maintainer/author addresses and it seems unlikely or unnecessary that individual MAINTAINER file section entries for each .yaml file will be created. So add the email addresses found in .yaml files to the default get_maintainer output. The email addresses are marked with "(in file)" when using the "--roles" or "--rolestats" options. 
Miscellanea: o Change $file_emails to $email_file_emails to avoid visual naming conflicts with @file_emails Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Tested-by: Sam Ravnborg Acked-by: Sam Ravnborg Link: http://lkml.kernel.org/r/e85006456d9dbae55286c67ac5263668a72f5b58.1588022228.git.joe@perches.com Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 6cbcd1a3e113..6d973f3685f9 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -57,7 +57,7 @@ my $status = 0; my $letters = ""; my $keywords = 1; my $sections = 0; -my $file_emails = 0; +my $email_file_emails = 0; my $from_filename = 0; my $pattern_depth = 0; my $self_test = undef; @@ -69,6 +69,12 @@ my $vcs_used = 0; my $exit = 0; +my @files = (); +my @fixes = (); # If a patch description includes Fixes: lines +my @range = (); +my @keyword_tvi = (); +my @file_emails = (); + my %commit_author_hash; my %commit_signer_hash; @@ -266,7 +272,7 @@ if (!GetOptions( 'pattern-depth=i' => \$pattern_depth, 'k|keywords!' => \$keywords, 'sections!' => \$sections, - 'fe|file-emails!' => \$file_emails, + 'fe|file-emails!' 
=> \$email_file_emails, 'f|file' => \$from_filename, 'find-maintainer-files' => \$find_maintainer_files, 'mpath|maintainer-path=s' => \$maintainer_path, @@ -424,6 +430,22 @@ sub read_all_maintainer_files { } } +sub maintainers_in_file { + my ($file) = @_; + + return if ($file =~ m@\bMAINTAINERS$@); + + if (-f $file && ($email_file_emails || $file =~ /\.yaml$/)) { + open(my $f, '<', $file) + or die "$P: Can't open $file: $!\n"; + my $text = do { local($/) ; <$f> }; + close($f); + + my @poss_addr = $text =~ m$[A-Za-zÀ-ÿ\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g; + push(@file_emails, clean_file_emails(@poss_addr)); + } +} + # # Read mail address map # @@ -504,12 +526,6 @@ sub read_mailmap { ## use the filenames on the command line or find the filenames in the patchfiles -my @files = (); -my @fixes = (); # If a patch description includes Fixes: lines -my @range = (); -my @keyword_tvi = (); -my @file_emails = (); - if (!@ARGV) { push(@ARGV, "&STDIN"); } @@ -527,7 +543,7 @@ foreach my $file (@ARGV) { $file =~ s/^\Q${cur_path}\E//; #strip any absolute path $file =~ s/^\Q${lk_path}\E//; #or the path to the lk tree push(@files, $file); - if ($file ne "MAINTAINERS" && -f $file && ($keywords || $file_emails)) { + if ($file ne "MAINTAINERS" && -f $file && $keywords) { open(my $f, '<', $file) or die "$P: Can't open $file: $!\n"; my $text = do { local($/) ; <$f> }; @@ -539,10 +555,6 @@ foreach my $file (@ARGV) { } } } - if ($file_emails) { - my @poss_addr = $text =~ m$[A-Za-zÀ-ÿ\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g; - push(@file_emails, clean_file_emails(@poss_addr)); - } } } else { my $file_cnt = @files; @@ -923,6 +935,8 @@ sub get_maintainers { print("\n"); } } + + maintainers_in_file($file); } if ($keywords) { @@ -1835,7 +1849,7 @@ tm toggle maintainers tg toggle git entries tl toggle open list entries ts toggle subscriber list entries -f emails in file 
[$file_emails] +f emails in file [$email_file_emails] k keywords in file [$keywords] r remove duplicates [$email_remove_duplicates] p# pattern match depth [$pattern_depth] @@ -1960,7 +1974,7 @@ EOT bool_invert(\$email_git_all_signature_types); $rerun = 1; } elsif ($sel eq "f") { - bool_invert(\$file_emails); + bool_invert(\$email_file_emails); $rerun = 1; } elsif ($sel eq "r") { bool_invert(\$email_remove_duplicates); -- cgit v1.2.3 From e33c9fe8b80ca86392a35ffff81fbfb6a54d2d22 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 4 Jun 2020 16:50:04 -0700 Subject: get_maintainer: fix unexpected behavior for path/to//file (double slashes) get_maintainer behaves differently if there is a double sequential forward slash in a filename because the total number of slashes in a filename is used to match MAINTAINERS file patterns. For example: (with double slash) $ ./scripts/get_maintainer.pl -f drivers/gpu/drm//lima David Airlie (maintainer:DRM DRIVERS) Daniel Vetter (maintainer:DRM DRIVERS,commit_signer:3/42=7%) Qiang Yu (commit_signer:36/42=86%,authored:24/42=57%) Vasily Khoruzhick (commit_signer:26/42=62%) Krzysztof Kozlowski (commit_signer:5/42=12%,authored:5/42=12%) Emil Velikov (commit_signer:4/42=10%) dri-devel@lists.freedesktop.org (open list:DRM DRIVERS) linux-kernel@vger.kernel.org (open list) (without double slash) $ ./scripts/get_maintainer.pl -f drivers/gpu/drm/lima Qiang Yu (maintainer:DRM DRIVERS FOR LIMA) David Airlie (maintainer:DRM DRIVERS) Daniel Vetter (maintainer:DRM DRIVERS) dri-devel@lists.freedesktop.org (open list:DRM DRIVERS FOR LIMA) lima@lists.freedesktop.org (moderated list:DRM DRIVERS FOR LIMA) linux-kernel@vger.kernel.org (open list) So reduce consecutive double slashes to a single slash by using File::Spec->canonpath(). from: https://perldoc.perl.org/File/Spec/Unix.html canonpath() No physical check on the filesystem, but a logical cleanup of a path. On UNIX eliminates successive slashes and successive "/.". 
Reported-by: Emil Velikov Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/9a18b611813bb409fef15bc8927adab79eb9be43.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 6d973f3685f9..484d2fbf5921 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -19,6 +19,7 @@ my $V = '0.26'; use Getopt::Long qw(:config no_auto_abbrev); use Cwd; use File::Find; +use File::Spec::Functions; my $cur_path = fastgetcwd() . '/'; my $lk_path = "./"; @@ -532,6 +533,7 @@ if (!@ARGV) { foreach my $file (@ARGV) { if ($file ne "&STDIN") { + $file = canonpath($file); ##if $file is a directory and it lacks a trailing slash, add one if ((-d $file)) { $file =~ s@([^/])$@$1/@; -- cgit v1.2.3 From 9ac17575804024fb3d5692cad7afc08929bab981 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 4 Jun 2020 16:50:08 -0700 Subject: lib/math: avoid trailing newline hidden in pr_fmt() pr_xxx() functions usually have a newline at the end of the logging message. Here, this newline is added via the 'pr_fmt' macro. In order to be more consistent with other files, use a more standard convention and put these newlines back in the messages themselves and remove it from the pr_fmt macro. While at it, use __func__ instead of hardcoding a function name in the last message. 
Signed-off-by: Christophe JAILLET Signed-off-by: Andrew Morton Reviewed-by: Andy Shevchenko Cc: Mauro Carvalho Chehab Cc: Andrew Morton Cc: Greg Kroah-Hartman Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200409163234.22830-1-christophe.jaillet@wanadoo.fr Signed-off-by: Linus Torvalds --- lib/math/prime_numbers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/math/prime_numbers.c b/lib/math/prime_numbers.c index 052f5b727be7..d42cebf7407f 100644 --- a/lib/math/prime_numbers.c +++ b/lib/math/prime_numbers.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define pr_fmt(fmt) "prime numbers: " fmt "\n" +#define pr_fmt(fmt) "prime numbers: " fmt #include #include @@ -253,7 +253,7 @@ static void dump_primes(void) if (buf) bitmap_print_to_pagebuf(true, buf, p->primes, p->sz); - pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s", + pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s\n", p->last, p->sz, p->primes[BITS_TO_LONGS(p->sz) - 1], buf); rcu_read_unlock(); @@ -273,7 +273,7 @@ static int selftest(unsigned long max) bool fast = is_prime_number(x); if (slow != fast) { - pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!", + pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!\n", x, slow ? "yes" : "no", fast ? 
"yes" : "no"); goto err; } @@ -282,14 +282,14 @@ static int selftest(unsigned long max) continue; if (next_prime_number(last) != x) { - pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu", + pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu\n", last, x, next_prime_number(last)); goto err; } last = x; } - pr_info("selftest(%lu) passed, last prime was %lu", x, last); + pr_info("%s(%lu) passed, last prime was %lu\n", __func__, x, last); return 0; err: -- cgit v1.2.3 From 07887358993d48571f0f3a25cfce715564b35587 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 4 Jun 2020 16:50:11 -0700 Subject: lib: Add might_fault() to strncpy_from_user. When updating a piece of broken logic from using get_user to strncpy_from_user, we noticed that a warning which is expected when calling a function that might fault from an atomic context with pagefaults enabled disappeared. Not having this warning in place can lead to calling strncpy_from_user from an atomic context and eventually kernel crashes/stack corruption. Signed-off-by: KP Singh Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Jann Horn Cc: Christophe Leroy Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20200414225705.255711-1-kpsingh@chromium.org Signed-off-by: Linus Torvalds --- lib/strncpy_from_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index b90ec550183a..34696a348864 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -98,6 +98,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) { unsigned long max_addr, src_addr; + might_fault(); if (unlikely(count <= 0)) return 0; -- cgit v1.2.3 From 02223e36f315be7e647aa4fbb09a8f8b5a2fa43e Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 4 Jun 2020 16:50:14 -0700 Subject: lib/test_lockup.c: make test_inode static Fix the following sparse warning: lib/test_lockup.c:145:14: warning: symbol 'test_inode' was not declared. 
Should it be static? Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200417074021.46411-1-yanaijie@huawei.com Signed-off-by: Linus Torvalds --- lib/test_lockup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_lockup.c b/lib/test_lockup.c index ea09ca335b21..419fbaceba73 100644 --- a/lib/test_lockup.c +++ b/lib/test_lockup.c @@ -142,7 +142,7 @@ module_param(reallocate_pages, bool, 0400); MODULE_PARM_DESC(reallocate_pages, "free and allocate pages between iterations"); struct file *test_file; -struct inode *test_inode; +static struct inode *test_inode; static char test_file_path[256]; module_param_string(file_path, test_file_path, sizeof(test_file_path), 0400); MODULE_PARM_DESC(file_path, "file path to test"); -- cgit v1.2.3 From acaab7335bd6f0c0b54ce3a00bd7f18222ce0f5f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Jun 2020 16:50:17 -0700 Subject: lib/zlib: remove outdated and incorrect pre-increment optimization The zlib inflate code has an old micro-optimization based on the assumption that for pre-increment memory accesses, the compiler will generate code that fits better into the processor's pipeline than what would be generated for post-increment memory accesses. This optimization was already removed in upstream zlib in 2016: https://github.com/madler/zlib/commit/9aaec95e8211 This optimization causes UB according to C99, which says in section 6.5.6 "Additive operators": "If both the pointer operand and the result point to elements of the same array object, or one past the last element of the array object, the evaluation shall not produce an overflow; otherwise, the behavior is undefined". This UB is not only a theoretical concern, but can also cause trouble for future work on compiler-based sanitizers. According to the zlib commit, this optimization also is not optimal anymore with modern compilers. 
Replace uses of OFF, PUP and UP_UNALIGNED with their definitions in the POSTINC case, and remove the macro definitions, just like in the upstream patch. Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Cc: Mikhail Zaslonko Link: http://lkml.kernel.org/r/20200507123112.252723-1-jannh@google.com Signed-off-by: Linus Torvalds --- lib/zlib_inflate/inffast.c | 91 ++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 56 deletions(-) diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c index 2c13ecc5bb2c..ed1f3df27260 100644 --- a/lib/zlib_inflate/inffast.c +++ b/lib/zlib_inflate/inffast.c @@ -10,17 +10,6 @@ #ifndef ASMINF -/* Allow machine dependent optimization for post-increment or pre-increment. - Based on testing to date, - Pre-increment preferred for: - - PowerPC G3 (Adler) - - MIPS R5000 (Randers-Pehrson) - Post-increment preferred for: - - none - No measurable difference: - - Pentium III (Anderson) - - M68060 (Nikl) - */ union uu { unsigned short us; unsigned char b[2]; @@ -38,16 +27,6 @@ get_unaligned16(const unsigned short *p) return mm.us; } -#ifdef POSTINC -# define OFF 0 -# define PUP(a) *(a)++ -# define UP_UNALIGNED(a) get_unaligned16((a)++) -#else -# define OFF 1 -# define PUP(a) *++(a) -# define UP_UNALIGNED(a) get_unaligned16(++(a)) -#endif - /* Decode literal, length, and distance codes and write out the resulting literal and match bytes until either not enough input or output is @@ -115,9 +94,9 @@ void inflate_fast(z_streamp strm, unsigned start) /* copy state to local variables */ state = (struct inflate_state *)strm->state; - in = strm->next_in - OFF; + in = strm->next_in; last = in + (strm->avail_in - 5); - out = strm->next_out - OFF; + out = strm->next_out; beg = out - (start - strm->avail_out); end = out + (strm->avail_out - 257); #ifdef INFLATE_STRICT @@ -138,9 +117,9 @@ void inflate_fast(z_streamp strm, unsigned start) input data or output space */ do { if (bits < 15) { - hold += (unsigned 
long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } this = lcode[hold & lmask]; @@ -150,14 +129,14 @@ void inflate_fast(z_streamp strm, unsigned start) bits -= op; op = (unsigned)(this.op); if (op == 0) { /* literal */ - PUP(out) = (unsigned char)(this.val); + *out++ = (unsigned char)(this.val); } else if (op & 16) { /* length base */ len = (unsigned)(this.val); op &= 15; /* number of extra bits */ if (op) { if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } len += (unsigned)hold & ((1U << op) - 1); @@ -165,9 +144,9 @@ void inflate_fast(z_streamp strm, unsigned start) bits -= op; } if (bits < 15) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } this = dcode[hold & dmask]; @@ -180,10 +159,10 @@ void inflate_fast(z_streamp strm, unsigned start) dist = (unsigned)(this.val); op &= 15; /* number of extra bits */ if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } } @@ -205,13 +184,13 @@ void inflate_fast(z_streamp strm, unsigned start) state->mode = BAD; break; } - from = window - OFF; + from = window; if (write == 0) { /* very common case */ from += wsize - op; if (op < len) { /* some from window */ len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } @@ -222,14 +201,14 @@ void inflate_fast(z_streamp strm, unsigned start) if (op < len) { /* some from end of window */ len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); - from = window - OFF; + from = window; if (write < len) 
{ /* some from start of window */ op = write; len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } @@ -240,21 +219,21 @@ void inflate_fast(z_streamp strm, unsigned start) if (op < len) { /* some from window */ len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } } while (len > 2) { - PUP(out) = PUP(from); - PUP(out) = PUP(from); - PUP(out) = PUP(from); + *out++ = *from++; + *out++ = *from++; + *out++ = *from++; len -= 3; } if (len) { - PUP(out) = PUP(from); + *out++ = *from++; if (len > 1) - PUP(out) = PUP(from); + *out++ = *from++; } } else { @@ -264,29 +243,29 @@ void inflate_fast(z_streamp strm, unsigned start) from = out - dist; /* copy direct from output */ /* minimum length is three */ /* Align out addr */ - if (!((long)(out - 1 + OFF) & 1)) { - PUP(out) = PUP(from); + if (!((long)(out - 1) & 1)) { + *out++ = *from++; len--; } - sout = (unsigned short *)(out - OFF); + sout = (unsigned short *)(out); if (dist > 2) { unsigned short *sfrom; - sfrom = (unsigned short *)(from - OFF); + sfrom = (unsigned short *)(from); loops = len >> 1; do #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - PUP(sout) = PUP(sfrom); + *sout++ = *sfrom++; #else - PUP(sout) = UP_UNALIGNED(sfrom); + *sout++ = get_unaligned16(sfrom++); #endif while (--loops); - out = (unsigned char *)sout + OFF; - from = (unsigned char *)sfrom + OFF; + out = (unsigned char *)sout; + from = (unsigned char *)sfrom; } else { /* dist == 1 or dist == 2 */ unsigned short pat16; - pat16 = *(sout-1+OFF); + pat16 = *(sout-1); if (dist == 1) { union uu mm; /* copy one char pattern to both bytes */ @@ -296,12 +275,12 @@ void inflate_fast(z_streamp strm, unsigned start) } loops = len >> 1; do - PUP(sout) = pat16; + *sout++ = pat16; while (--loops); - out = (unsigned char *)sout + OFF; + out = (unsigned char *)sout; } if (len & 1) - PUP(out) = PUP(from); + *out++ = *from++; } } else if 
((op & 64) == 0) { /* 2nd level distance code */ @@ -336,8 +315,8 @@ void inflate_fast(z_streamp strm, unsigned start) hold &= (1U << bits) - 1; /* update state and return */ - strm->next_in = in + OFF; - strm->next_out = out + OFF; + strm->next_in = in; + strm->next_out = out; strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); strm->avail_out = (unsigned)(out < end ? 257 + (end - out) : 257 - (out - end)); -- cgit v1.2.3 From a818e526cb4e5f8b06f4d648b88b269fbdd85bdd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 4 Jun 2020 16:50:20 -0700 Subject: lib/percpu-refcount.c: use a more common logging style Remove the trailing newline from the used-once pr_fmt and add it to the single use of pr_ in this code to use a more common logging style. Miscellanea: o Use %lu in the pr_debug format and remove the unnecessary cast Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Cc: Christophe JAILLET Link: http://lkml.kernel.org/r/47372467902a047c03b0fd29aab56e0c38d3f848.camel@perches.com Signed-off-by: Linus Torvalds --- lib/percpu-refcount.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 8d092609928e..0ba686b8fe57 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define pr_fmt(fmt) "%s: " fmt "\n", __func__ +#define pr_fmt(fmt) "%s: " fmt, __func__ #include #include @@ -141,8 +141,8 @@ static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) for_each_possible_cpu(cpu) count += *per_cpu_ptr(percpu_count, cpu); - pr_debug("global %ld percpu %ld", - atomic_long_read(&ref->count), (long)count); + pr_debug("global %lu percpu %lu\n", + atomic_long_read(&ref->count), count); /* * It's crucial that we sum the percpu counters _before_ adding the sum -- cgit v1.2.3 From 63d7f8167fe65891617d5eca6314eca46029955c Mon Sep 17 00:00:00 2001 From: Tan Hu Date: Thu, 4 Jun 2020 16:50:23 -0700 Subject: 
lib/flex_proportions.c: cleanup __fprop_inc_percpu_max If the given type has fraction smaller than max_frac/FPROP_FRAC_BASE, the code could be modified to call __fprop_inc_percpu() directly and easier to understand. After this patch, fprop_reflect_period_percpu() will be called twice, and quickly return on the pl->period == p->period test, so it would not result in a significant performance downside. Thanks for Jan's guidance. Signed-off-by: Tan Hu Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Cc: Cc: Yi Wang Cc: Link: http://lkml.kernel.org/r/1589004753-27554-1-git-send-email-tan.hu@zte.com.cn Signed-off-by: Linus Torvalds --- lib/flex_proportions.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 7852bfff50b1..451543937524 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -266,8 +266,7 @@ void __fprop_inc_percpu_max(struct fprop_global *p, if (numerator > (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT) return; - } else - fprop_reflect_period_percpu(p, pl); - percpu_counter_add_batch(&pl->events, 1, PROP_BATCH); - percpu_counter_add(&p->events, 1); + } + + __fprop_inc_percpu(p, pl); } -- cgit v1.2.3 From c348c16305280fe3e6c1186378f96c8634c149f9 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Thu, 4 Jun 2020 16:50:27 -0700 Subject: lib: make a test module with set/clear bit Test some bit clears/sets to make sure assembly doesn't change, and that the set_bit and clear_bit functions work and don't cause sparse warnings. Instruct Kbuild to build this file with extra warning level -Wextra, to catch new issues, and also doesn't hurt to build with C=1. This was used to test changes to arch/x86/include/asm/bitops.h. In particular, sparse (C=1) was very concerned when the last bit before a natural boundary, like 7, or 31, was being tested, as this causes sign extension (0xffffff7f) for instance when clearing bit 7.
Recommended usage: make defconfig scripts/config -m CONFIG_TEST_BITOPS make modules_prepare make C=1 W=1 lib/test_bitops.ko objdump -S -d lib/test_bitops.ko insmod lib/test_bitops.ko rmmod lib/test_bitops.ko , there should be no compiler/sparse warnings and no error messages in log. Link: http://lkml.kernel.org/r/20200310221747.2848474-2-jesse.brandeburg@intel.com Signed-off-by: Jesse Brandeburg Reviewed-by: Andy Shevchenko Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Cc: Borislav Petkov Cc: Rasmus Villemoes Cc: Dan Williams Cc: Peter Zijlstra Cc: Wei Yang Cc: Christian Brauner Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 13 +++++++++ lib/Makefile | 2 ++ lib/test_bitops.c | 60 ++++++++++++++++++++++++++++++++++++++ tools/testing/selftests/lib/config | 1 + 4 files changed, 76 insertions(+) create mode 100644 lib/test_bitops.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 30302c9b0f83..b9a450b1fbf7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2022,6 +2022,19 @@ config TEST_LKM If unsure, say N. +config TEST_BITOPS + tristate "Test module for compilation of clear_bit/set_bit operations" + depends on m + help + This builds the "test_bitops" module that is much like the + TEST_LKM module except that it does a basic exercise of the + clear_bit and set_bit macros to make sure there are no compiler + warnings from C=1 sparse checker or -Wextra compilations. It has + no dependencies and doesn't run or load unless explicitly requested + by name. for example: modprobe test_bitops. + + If unsure, say N.
+ config TEST_VMALLOC tristate "Test module for stress/performance analysis of vmalloc allocator" default n diff --git a/lib/Makefile b/lib/Makefile index 5adf8949a757..32f19b4d1d2a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -56,6 +56,8 @@ obj-y += kstrtox.o obj-$(CONFIG_FIND_BIT_BENCHMARK) += find_bit_benchmark.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o +obj-$(CONFIG_TEST_BITOPS) += test_bitops.o +CFLAGS_test_bitops.o += -Werror obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_IDA) += test_ida.o diff --git a/lib/test_bitops.c b/lib/test_bitops.c new file mode 100644 index 000000000000..fd50b3ae4a14 --- /dev/null +++ b/lib/test_bitops.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Intel Corporation + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +/* a tiny module only meant to test set/clear_bit */ + +/* use an enum because thats the most common BITMAP usage */ +enum bitops_fun { + BITOPS_4 = 4, + BITOPS_7 = 7, + BITOPS_11 = 11, + BITOPS_31 = 31, + BITOPS_88 = 88, + BITOPS_LAST = 255, + BITOPS_LENGTH = 256 +}; + +static DECLARE_BITMAP(g_bitmap, BITOPS_LENGTH); + +static int __init test_bitops_startup(void) +{ + pr_warn("Loaded test module\n"); + set_bit(BITOPS_4, g_bitmap); + set_bit(BITOPS_7, g_bitmap); + set_bit(BITOPS_11, g_bitmap); + set_bit(BITOPS_31, g_bitmap); + set_bit(BITOPS_88, g_bitmap); + return 0; +} + +static void __exit test_bitops_unstartup(void) +{ + int bit_set; + + clear_bit(BITOPS_4, g_bitmap); + clear_bit(BITOPS_7, g_bitmap); + clear_bit(BITOPS_11, g_bitmap); + clear_bit(BITOPS_31, g_bitmap); + clear_bit(BITOPS_88, g_bitmap); + + bit_set = find_first_bit(g_bitmap, BITOPS_LAST); + if (bit_set != BITOPS_LAST) + pr_err("ERROR: FOUND SET BIT %d\n", bit_set); + + pr_warn("Unloaded test module\n"); +} + +module_init(test_bitops_startup); 
+module_exit(test_bitops_unstartup); + +MODULE_AUTHOR("Jesse Brandeburg "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Bit testing module"); diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index 14a77ea4a8da..b80ee3f6e265 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -2,3 +2,4 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m CONFIG_PRIME_NUMBERS=m CONFIG_TEST_STRSCPY=m +CONFIG_TEST_BITOPS=m -- cgit v1.2.3 From bd93f003b7462ae39a43c531abca37fe7073b866 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Jun 2020 16:50:30 -0700 Subject: include/linux/bitops.h: avoid clang shift-count-overflow warnings Clang normally does not warn about certain issues in inline functions when it only happens in an eliminated code path. However if something else goes wrong, it does tend to complain about the definition of hweight_long() on 32-bit targets: include/linux/bitops.h:75:41: error: shift count >= width of type [-Werror,-Wshift-count-overflow] return sizeof(w) == 4 ? hweight32(w) : hweight64(w); ^~~~~~~~~~~~ include/asm-generic/bitops/const_hweight.h:29:49: note: expanded from macro 'hweight64' define hweight64(w) (__builtin_constant_p(w) ? 
__const_hweight64(w) : __arch_hweight64(w)) ^~~~~~~~~~~~~~~~~~~~ include/asm-generic/bitops/const_hweight.h:21:76: note: expanded from macro '__const_hweight64' define __const_hweight64(w) (__const_hweight32(w) + __const_hweight32((w) >> 32)) ^ ~~ include/asm-generic/bitops/const_hweight.h:20:49: note: expanded from macro '__const_hweight32' define __const_hweight32(w) (__const_hweight16(w) + __const_hweight16((w) >> 16)) ^ include/asm-generic/bitops/const_hweight.h:19:72: note: expanded from macro '__const_hweight16' define __const_hweight16(w) (__const_hweight8(w) + __const_hweight8((w) >> 8 )) ^ include/asm-generic/bitops/const_hweight.h:12:9: note: expanded from macro '__const_hweight8' (!!((w) & (1ULL << 2))) + \ Adding an explicit cast to __u64 avoids that warning and makes it easier to read other output. Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Acked-by: Christian Brauner Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Josh Poimboeuf Cc: Nick Desaulniers Link: http://lkml.kernel.org/r/20200505135513.65265-1-arnd@arndb.de Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 9acf654f0b19..99f2ac30b1d9 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -72,7 +72,7 @@ static inline int get_bitmask_order(unsigned int count) static __always_inline unsigned long hweight_long(unsigned long w) { - return sizeof(w) == 4 ? hweight32(w) : hweight64(w); + return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** -- cgit v1.2.3 From 7ccf41a89cb0c178dca31bce4836b8fed2694d71 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 4 Jun 2020 16:50:33 -0700 Subject: checkpatch: additional MAINTAINER section entry ordering checks There is a preferred order for the entries in MAINTAINERS sections. 
See commits 3b50142d8528 ("MAINTAINERS: sort field names for all entries") and 6680125ea5a2 ("MAINTAINERS: list the section entries in the preferred order") Add checkpatch tests to try to keep that ordering. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Acked-by: Andy Shevchenko Link: http://lkml.kernel.org/r/17677130b3ca62d79817e6a22546bad39d7e81b4.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 45 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index bf9e0e87a6ef..10d75da04947 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3062,14 +3062,43 @@ sub process { #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } -# check for MAINTAINERS entries that don't have the right form - if ($realfile =~ /^MAINTAINERS$/ && - $rawline =~ /^\+[A-Z]:/ && - $rawline !~ /^\+[A-Z]:\t\S/) { - if (WARN("MAINTAINERS_STYLE", - "MAINTAINERS entries use one tab after TYPE:\n" . $herecurr) && - $fix) { - $fixed[$fixlinenr] =~ s/^(\+[A-Z]):\s*/$1:\t/; +# check MAINTAINERS entries + if ($realfile =~ /^MAINTAINERS$/) { +# check MAINTAINERS entries for the right form + if ($rawline =~ /^\+[A-Z]:/ && + $rawline !~ /^\+[A-Z]:\t\S/) { + if (WARN("MAINTAINERS_STYLE", + "MAINTAINERS entries use one tab after TYPE:\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/^(\+[A-Z]):\s*/$1:\t/; + } + } +# check MAINTAINERS entries for the right ordering too + my $preferred_order = 'MRLSWQBCPTFXNK'; + if ($rawline =~ /^\+[A-Z]:/ && + $prevrawline =~ /^[\+ ][A-Z]:/) { + $rawline =~ /^\+([A-Z]):\s*(.*)/; + my $cur = $1; + my $curval = $2; + $prevrawline =~ /^[\+ ]([A-Z]):\s*(.*)/; + my $prev = $1; + my $prevval = $2; + my $curindex = index($preferred_order, $cur); + my $previndex = index($preferred_order, $prev); + if ($curindex < 0) { + WARN("MAINTAINERS_STYLE", + "Unknown MAINTAINERS entry type: '$cur'\n" . 
$herecurr); + } else { + if ($previndex >= 0 && $curindex < $previndex) { + WARN("MAINTAINERS_STYLE", + "Misordered MAINTAINERS entry - list '$cur:' before '$prev:'\n" . $hereprev); + } elsif ((($prev eq 'F' && $cur eq 'F') || + ($prev eq 'X' && $cur eq 'X')) && + ($prevval cmp $curval) > 0) { + WARN("MAINTAINERS_STYLE", + "Misordered MAINTAINERS entry - list file patterns in alphabetic order\n" . $hereprev); + } + } } } -- cgit v1.2.3 From a55ee0cc09a4e3ee6c4443afdbff53639672178f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 4 Jun 2020 16:50:36 -0700 Subject: checkpatch: look for c99 comments in ctx_locate_comment Some checks look for comments around a specific function like read_barrier_depends. Extend the check to support both c89 and c99 comment styles. c89 /* comment */ or c99 // comment For c99 comments, only look at 3 single lines, the line being scanned, the line above and the line below the line being scanned rather than the patch diff context. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Tested-by: Paul E.
McKenney Cc: Marco Elver Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andy Whitcroft Cc: Will Deacon Link: http://lkml.kernel.org/r/65cb075435d2f385a53c77571b491b2b09faaf8e.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 10d75da04947..880fe8639fb6 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1676,8 +1676,16 @@ sub ctx_statement_level { sub ctx_locate_comment { my ($first_line, $end_line) = @_; + # If c99 comment on the current line, or the line before or after + my ($current_comment) = ($rawlines[$end_line - 1] =~ m@^\+.*(//.*$)@); + return $current_comment if (defined $current_comment); + ($current_comment) = ($rawlines[$end_line - 2] =~ m@^[\+ ].*(//.*$)@); + return $current_comment if (defined $current_comment); + ($current_comment) = ($rawlines[$end_line] =~ m@^[\+ ].*(//.*$)@); + return $current_comment if (defined $current_comment); + # Catch a comment on the end of the line itself. - my ($current_comment) = ($rawlines[$end_line - 1] =~ m@.*(/\*.*\*/)\s*(?:\\\s*)?$@); + ($current_comment) = ($rawlines[$end_line - 1] =~ m@.*(/\*.*\*/)\s*(?:\\\s*)?$@); return $current_comment if (defined $current_comment); # Look through the context and try and figure out if there is a -- cgit v1.2.3 From 32f30ca9f19df77eced0ec029ce9dcfb24ff045b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 4 Jun 2020 16:50:40 -0700 Subject: checkpatch: disallow --git and --file/--fix Don't allow these options to be combined. 
Miscellanea: o Add missing $P: to some die("reason message") output Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/3dc7bdaa58490f5906efc11a4d6113e42a087723.camel@perches.com Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 880fe8639fb6..f1e925d36b26 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -246,6 +246,8 @@ list_types(0) if ($list_types); $fix = 1 if ($fix_inplace); $check_orig = $check; +die "$P: --git cannot be used with --file or --fix\n" if ($git && ($file || $fix)); + my $exit = 0; my $perl_version_ok = 1; @@ -269,11 +271,11 @@ if ($color =~ /^[01]$/) { } elsif ($color =~ /^auto$/i) { $color = (-t STDOUT); } else { - die "Invalid color mode: $color\n"; + die "$P: Invalid color mode: $color\n"; } # skip TAB size 1 to avoid additional checks on $tabsize - 1 -die "Invalid TAB size: $tabsize\n" if ($tabsize < 2); +die "$P: Invalid TAB size: $tabsize\n" if ($tabsize < 2); sub hash_save_array_words { my ($hashRef, $arrayRef) = @_; -- cgit v1.2.3 From c7f574d0e9f9e8c3655b6bb06e69e37d341956d3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 4 Jun 2020 16:50:43 -0700 Subject: checkpatch: use patch subject when reading from stdin While "git am" can apply an mbox file containing multiple patches (e.g. as created by b4[1], or a patch bundle downloaded from patchwork), checkpatch does not have proper support for that. When operating on an mbox, checkpatch will merge all detected tags, and complain falsely about duplicates: WARNING: Duplicate signature As modifying checkpatch to reset state in between each patch is a lot of work, a simple solution is splitting the mbox into individual patches, and invoking checkpatch for each of them. 
Fortunately checkpatch can read a patch from stdin, so the classic "formail" tool can be used to split the mbox, and pipe all individual patches to checkpatch: formail -s scripts/checkpatch.pl < my-mbox However, when reading a patch file from standard input, checkpatch calls it "Your patch", and reports its state as: Your patch has style problems, please review. or: Your patch has no obvious style problems and is ready for submission. Hence it can be difficult to identify which patches need to be reviewed and improved. Fix this by replacing "Your patch" by (the first line of) the email subject, if present. Note that "git mailsplit" can also be used to split an mbox, but it will create individual files for each patch, thus requiring cleanup afterwards. Formail does not have this disadvantage. [1] https://git.kernel.org/pub/scm/utils/b4/b4.git [joe@perches.com: reduce cpu usage] Link: http://lkml.kernel.org/r/c9d89bb24c7414142414c60371e210fdcf4617d2.camel@perches.com Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Cc: Joe Perches Cc: Konstantin Ryabitsev Link: http://lkml.kernel.org/r/20200505132613.17452-1-geert+renesas@glider.be Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f1e925d36b26..10a0e035a787 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1062,6 +1062,7 @@ for my $filename (@ARGV) { while (<$FILE>) { chomp; push(@rawlines, $_); + $vname = qq("$1") if ($filename eq '-' && $_ =~ m/^Subject:\s+(.+)/i); } close($FILE); -- cgit v1.2.3 From 852991dd3a7374a15b21b904117272f57939463c Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Thu, 4 Jun 2020 16:50:46 -0700 Subject: fs/binfmt_elf: remove redundant elf_map ifndef The ifndef was added a long time ago to support archs that would define their own mapping function. 
The last user was the metag arch which was removed from the tree, and as such there are no users left. Let's kill it. Signed-off-by: Anthony Iliopoulos Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200402161543.4119-1-ailiop@suse.com Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8945671fe0e5..91b09a105a6d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -353,8 +353,6 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, return 0; } -#ifndef elf_map - static unsigned long elf_map(struct file *filep, unsigned long addr, const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) @@ -394,8 +392,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -#endif /* !elf_map */ - static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) { int i, first_idx = -1, last_idx = -1; -- cgit v1.2.3 From 51da9dfb7f20911ae4e79e9b412a9c2d4c373d4b Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 4 Jun 2020 16:50:49 -0700 Subject: elfnote: mark all .note sections SHF_ALLOC ELFNOTE_START allows callers to specify flags for .pushsection assembler directives. All callsites but ELF_NOTE use "a" for SHF_ALLOC. For vdso's that explicitly use ELF_NOTE_START and BUILD_SALT, the same section is specified twice after preprocessing, once with "a" flag, once without. Example: .pushsection .note.Linux, "a", @note ; .pushsection .note.Linux, "", @note ; While GNU as allows this ordering, it warns for the opposite ordering, making these directives position dependent. We'd prefer not to precisely match this behavior in Clang's integrated assembler. Instead, the non __ASSEMBLY__ definition of ELF_NOTE uses __attribute__((section(".note.Linux"))) which is created with SHF_ALLOC, so let's make the __ASSEMBLY__ definition of ELF_NOTE consistent with C and just always use "a" flag. 
This allows Clang to assemble a working mainline (5.6) kernel via: $ make CC=clang AS=clang Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Reviewed-by: Nathan Chancellor Reviewed-by: Fangrui Song Cc: Jeremy Fitzhardinge Cc: Thomas Gleixner Cc: Vincenzo Frascino Link: https://github.com/ClangBuiltLinux/linux/issues/913 Link: http://lkml.kernel.org/r/20200325231250.99205-1-ndesaulniers@google.com Debugged-by: Ilie Halip Signed-off-by: Linus Torvalds --- include/linux/elfnote.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 594d4e78654f..69b136e4dd2b 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -54,7 +54,7 @@ .popsection ; #define ELFNOTE(name, type, desc) \ - ELFNOTE_START(name, type, "") \ + ELFNOTE_START(name, type, "a") \ desc ; \ ELFNOTE_END -- cgit v1.2.3 From ada4ab7af1a67db7eb8596709071a97afb2bf7de Mon Sep 17 00:00:00 2001 From: Chris Down Date: Thu, 4 Jun 2020 16:50:53 -0700 Subject: init: allow distribution configuration of default init Some init systems (eg. systemd) have init at their own paths, for example, /usr/lib/systemd/systemd. A compatibility symlink to one of the hardcoded init paths is provided by another package, usually named something like systemd-sysvcompat or similar. Currently distro maintainers who are hands-off on the bootloader are more or less required to include those compatibility links as part of their base distribution, because it's hard to migrate away from them since there's a risk some users will not get the message to set init= on the kernel command line appropriately. Moreover, for distributions where the init system is something the distribution itself is opinionated about (eg. Arch, which has systemd in the required `base` package), we could usually reasonably configure this ahead of time when building the distribution kernel. However, we currently simply don't have any way to configure the kernel to do this. 
Here's an example discussion where removing sysvcompat was discussed by distro maintainers[0]. This patch adds a new Kconfig tunable, CONFIG_DEFAULT_INIT, which if set is tried before the hardcoded fallback list. So the order of precedence is now thus: 1. init= on command line (on failure: panic) 2. CONFIG_DEFAULT_INIT (on failure: try #3) 3. Hardcoded fallback list (on failure: panic) This new config parameter will allow distribution maintainers to move away from these compatibility links safely, without having to worry that their users might not have the right init=. There are also two other benefits of this over having the distribution maintain a symlink: 1. One of the value propositions over simply having distributions maintain a /sbin/init symlink via a package is that it also frees distributions which have a preferred default, but not mandatory, init system from having their package manager fight with their users for control of /{s,}bin/init. Instead, the distribution simply makes their preference known in CONFIG_DEFAULT_INIT, and if the user installs another init system and uninstalls the default one they can still make use of /{s,}bin/init and friends for their own uses. This makes more cases Just Work(tm) without the user having to perform extra configuration via init=. 2. Since before this we don't know which path the distribution actually _intends_ to serve init from, we don't pr_err if it is simply missing, and usually will just silently put the user in a /bin/sh shell. Now that the distribution can make a declaration of intent, we can be more vocal when this init system fails to launch for any reason, even if it's simply because no file exists at that location, speeding up the palaver of init/mount dependency/etc debugging a bit. 
[0]: https://lists.archlinux.org/pipermail/arch-dev-public/2019-January/029435.html Signed-off-by: Chris Down Signed-off-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: Masami Hiramatsu Link: http://lkml.kernel.org/r/20200522160234.GA1487022@chrisdown.name Signed-off-by: Linus Torvalds --- init/Kconfig | 10 ++++++++++ init/main.c | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index fdb4f52609c6..2d12d38cdd88 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -260,6 +260,16 @@ config KERNEL_UNCOMPRESSED endchoice +config DEFAULT_INIT + string "Default init path" + default "" + help + This option determines the default init for the system if no init= + option is passed on the kernel command line. If the requested path is + not present, we will still then move on to attempting further + locations (e.g. /sbin/init, etc). If this is empty, we will just use + the fallback list when init= is not passed. + config DEFAULT_HOSTNAME string "Default hostname" default "(none)" diff --git a/init/main.c b/init/main.c index df32f67214d2..76df62fc3e2c 100644 --- a/init/main.c +++ b/init/main.c @@ -1433,6 +1433,16 @@ static int __ref kernel_init(void *unused) panic("Requested init %s failed (error %d).", execute_command, ret); } + + if (CONFIG_DEFAULT_INIT[0] != '\0') { + ret = run_init_process(CONFIG_DEFAULT_INIT); + if (ret) + pr_err("Default init %s failed (error %d)\n", + CONFIG_DEFAULT_INIT, ret); + else + return 0; + } + if (!try_to_run_init_process("/sbin/init") || !try_to_run_init_process("/etc/init") || !try_to_run_init_process("/bin/init") || -- cgit v1.2.3 From b1b65750b8db67834482f758fc385bfa7560d228 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 4 Jun 2020 16:50:56 -0700 Subject: fat: don't allow to mount if the FAT length == 0 If FAT length == 0, the image doesn't have any data. And it can be the cause of overlapping the root dir and FAT entries. Also Windows treats it as invalid format. 
Reported-by: syzbot+6f1624f937d9d6911e2d@syzkaller.appspotmail.com Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Cc: Marco Elver Cc: Dmitry Vyukov Link: http://lkml.kernel.org/r/87r1wz8mrd.fsf@mail.parknet.co.jp Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index e6e68b2274a5..a0cf99debb1e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1519,6 +1519,12 @@ static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b, goto out; } + if (bpb->fat_fat_length == 0 && bpb->fat32_length == 0) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus number of FAT sectors"); + goto out; + } + error = 0; out: -- cgit v1.2.3 From 898310032b96c198014a8bbace0fd26259b2db77 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 4 Jun 2020 16:50:59 -0700 Subject: fat: improve the readahead for FAT entries Current readahead for FAT entries is very simple but is having some flaws, so it is not working well for some environments. This patch improves the readahead more or less. 
The key points of modification are, - make the readahead size tunable by using bdi->ra_pages - care the bdi->io_pages to avoid the small size I/O request - update readahead window before fully exhausting With this patch, on slow USB connected 2TB hdd: [before] 383.18sec [after] 51.03sec Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Tested-by: hyeongseok.kim Reviewed-by: hyeongseok.kim Link: http://lkml.kernel.org/r/87d08e1dlh.fsf@mail.parknet.co.jp Signed-off-by: Linus Torvalds --- fs/fat/fatent.c | 103 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 3647c65a0f48..bbfe18c07417 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -632,20 +632,80 @@ error: } EXPORT_SYMBOL_GPL(fat_free_clusters); -/* 128kb is the whole sectors for FAT12 and FAT16 */ -#define FAT_READA_SIZE (128 * 1024) +struct fatent_ra { + sector_t cur; + sector_t limit; + + unsigned int ra_blocks; + sector_t ra_advance; + sector_t ra_next; + sector_t ra_limit; +}; -static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, - unsigned long reada_blocks) +static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra, + struct fat_entry *fatent, int ent_limit) { - const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; - sector_t blocknr; - int i, offset; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const struct fatent_operations *ops = sbi->fatent_ops; + sector_t blocknr, block_end; + int offset; + /* + * This is the sequential read, so ra_pages * 2 (but try to + * align the optimal hardware IO size). 
+ * [BTW, 128kb covers the whole sectors for FAT12 and FAT16] + */ + unsigned long ra_pages = sb->s_bdi->ra_pages; + unsigned int reada_blocks; + if (ra_pages > sb->s_bdi->io_pages) + ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages); + reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1); + + /* Initialize the range for sequential read */ ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + ops->ent_blocknr(sb, ent_limit - 1, &offset, &block_end); + ra->cur = 0; + ra->limit = (block_end + 1) - blocknr; - for (i = 0; i < reada_blocks; i++) - sb_breadahead(sb, blocknr + i); + /* Advancing the window at half size */ + ra->ra_blocks = reada_blocks >> 1; + ra->ra_advance = ra->cur; + ra->ra_next = ra->cur; + ra->ra_limit = ra->cur + min_t(sector_t, reada_blocks, ra->limit); +} + +/* Assuming to be called before reading a new block (increments ->cur). */ +static void fat_ent_reada(struct super_block *sb, struct fatent_ra *ra, + struct fat_entry *fatent) +{ + if (ra->ra_next >= ra->ra_limit) + return; + + if (ra->cur >= ra->ra_advance) { + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const struct fatent_operations *ops = sbi->fatent_ops; + struct blk_plug plug; + sector_t blocknr, diff; + int offset; + + ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + + diff = blocknr - ra->cur; + blk_start_plug(&plug); + /* + * FIXME: we would want to directly use the bio with + * pages to reduce the number of segments. 
+ */ + for (; ra->ra_next < ra->ra_limit; ra->ra_next++) + sb_breadahead(sb, ra->ra_next + diff); + blk_finish_plug(&plug); + + /* Advance the readahead window */ + ra->ra_advance += ra->ra_blocks; + ra->ra_limit += min_t(sector_t, + ra->ra_blocks, ra->limit - ra->ra_limit); + } + ra->cur++; } int fat_count_free_clusters(struct super_block *sb) @@ -653,27 +713,20 @@ int fat_count_free_clusters(struct super_block *sb) struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; - unsigned long reada_blocks, reada_mask, cur_block; + struct fatent_ra fatent_ra; int err = 0, free; lock_fat(sbi); if (sbi->free_clusters != -1 && sbi->free_clus_valid) goto out; - reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; - reada_mask = reada_blocks - 1; - cur_block = 0; - free = 0; fatent_init(&fatent); fatent_set_entry(&fatent, FAT_START_ENT); + fat_ra_init(sb, &fatent_ra, &fatent, sbi->max_cluster); while (fatent.entry < sbi->max_cluster) { /* readahead of fat blocks */ - if ((cur_block & reada_mask) == 0) { - unsigned long rest = sbi->fat_length - cur_block; - fat_ent_reada(sb, &fatent, min(reada_blocks, rest)); - } - cur_block++; + fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) @@ -707,9 +760,9 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range) struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; + struct fatent_ra fatent_ra; u64 ent_start, ent_end, minlen, trimmed = 0; u32 free = 0; - unsigned long reada_blocks, reada_mask, cur_block = 0; int err = 0; /* @@ -727,19 +780,13 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range) if (ent_end >= sbi->max_cluster) ent_end = sbi->max_cluster - 1; - reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; - reada_mask = reada_blocks - 1; - fatent_init(&fatent); lock_fat(sbi); fatent_set_entry(&fatent, ent_start); + fat_ra_init(sb, 
&fatent_ra, &fatent, ent_end + 1); while (fatent.entry <= ent_end) { /* readahead of fat blocks */ - if ((cur_block & reada_mask) == 0) { - unsigned long rest = sbi->fat_length - cur_block; - fat_ent_reada(sb, &fatent, min(reada_blocks, rest)); - } - cur_block++; + fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) -- cgit v1.2.3 From a3963015787d0a3bd9c2d91b62de5ca70413dc3b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 4 Jun 2020 16:51:02 -0700 Subject: fs/seq_file.c: seq_read: Update pr_info_ratelimited Use a more common logging style. Add and use pr_fmt, coalesce the format string, align arguments, use better grammar. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Cc: Vasily Averin Link: http://lkml.kernel.org/r/96ff603230ca1bd60034c36519be3930c3a3a226.camel@perches.com Signed-off-by: Linus Torvalds --- fs/seq_file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index 70f5fdf99bf6..4e6239f33c06 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,6 +6,8 @@ * initial implementation -- AV, Oct 2001. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -233,9 +235,8 @@ Fill: p = m->op->next(m, p, &m->index); if (pos == m->index) { - pr_info_ratelimited("buggy seq_file .next function %ps " - "did not updated position index\n", - m->op->next); + pr_info_ratelimited("buggy .next function %ps did not update position index\n", + m->op->next); m->index++; } if (!p || IS_ERR(p)) { -- cgit v1.2.3 From d2c0e6e91c7990c67921005f44f9b2b326ff2906 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 4 Jun 2020 16:51:05 -0700 Subject: include/linux/seq_file.h: introduce DEFINE_SEQ_ATTRIBUTE() helper macro Patch series "seq_file: Introduce DEFINE_SEQ_ATTRIBUTE() helper macro". 
As discussed in https://lore.kernel.org/lkml/20191129222310.GA3712618@kroah.com/, we could introduce a new helper macro to reduce lots of boilerplate code. vmstat and kprobes are the examples which are converted to use it; if this is accepted, I will send out more cleanups. This patch (of 3): Introduce DEFINE_SEQ_ATTRIBUTE() helper macro to decrease code duplication. [akpm@linux-foundation.org: coding style fixes] Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton Cc: Greg KH Cc: Ingo Molnar Cc: Kefeng Wang Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Al Viro Link: http://lkml.kernel.org/r/20200509064031.181091-1-wangkefeng.wang@huawei.com Link: http://lkml.kernel.org/r/20200509064031.181091-2-wangkefeng.wang@huawei.com Signed-off-by: Linus Torvalds --- include/linux/seq_file.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 1672cf6f7614..813614d4b71f 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -145,6 +145,25 @@ void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); +#define DEFINE_SEQ_ATTRIBUTE(__name) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + int ret = seq_open(file, &__name ## _sops); \ + if (!ret && inode->i_private) { \ + struct seq_file *seq_f = file->private_data; \ + seq_f->private = inode->i_private; \ + } \ + return ret; \ +} \ + \ +static const struct file_operations __name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = __name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = seq_release, \ +} + #define DEFINE_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ -- cgit v1.2.3 From 01a995600275779d7b06c3b82fd50f6cb50c601c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date:
Thu, 4 Jun 2020 16:51:08 -0700 Subject: mm/vmstat.c: convert to use DEFINE_SEQ_ATTRIBUTE macro Use DEFINE_SEQ_ATTRIBUTE macro to simplify the code. Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Greg KH Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Al Viro Link: http://lkml.kernel.org/r/20200509064031.181091-3-wangkefeng.wang@huawei.com Signed-off-by: Linus Torvalds --- mm/vmstat.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index a7db29f7e5f7..3fb23a21f6dd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2069,24 +2069,14 @@ static int unusable_show(struct seq_file *m, void *arg) return 0; } -static const struct seq_operations unusable_op = { +static const struct seq_operations unusable_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = unusable_show, }; -static int unusable_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &unusable_op); -} - -static const struct file_operations unusable_file_ops = { - .open = unusable_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(unusable); static void extfrag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) @@ -2121,24 +2111,14 @@ static int extfrag_show(struct seq_file *m, void *arg) return 0; } -static const struct seq_operations extfrag_op = { +static const struct seq_operations extfrag_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = extfrag_show, }; -static int extfrag_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &extfrag_op); -} - -static const struct file_operations extfrag_file_ops = { - .open = extfrag_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(extfrag); static int __init extfrag_debug_init(void) { @@ -2147,10 +2127,10 @@ static int __init 
extfrag_debug_init(void) extfrag_debug_root = debugfs_create_dir("extfrag", NULL); debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, - &unusable_file_ops); + &unusable_fops); debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, - &extfrag_file_ops); + &extfrag_fops); return 0; } -- cgit v1.2.3 From eac2cece45074e372f78a459c7bb2d7207b72736 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 4 Jun 2020 16:51:11 -0700 Subject: kernel/kprobes.c: convert to use DEFINE_SEQ_ATTRIBUTE macro Use DEFINE_SEQ_ATTRIBUTE macro to simplify the code. Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Greg KH Cc: Ingo Molnar Cc: Al Viro Link: http://lkml.kernel.org/r/20200509064031.181091-4-wangkefeng.wang@huawei.com Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0fbdee78266b..50cd84f53df0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2475,24 +2475,14 @@ static int show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static const struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_sops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, .show = show_kprobe_addr }; -static int kprobes_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobes_seq_ops); -} - -static const struct file_operations debugfs_kprobes_operations = { - .open = kprobes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobes); /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) @@ -2529,24 +2519,13 @@ static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) mutex_unlock(&kprobe_mutex); } -static const 
struct seq_operations kprobe_blacklist_seq_ops = { +static const struct seq_operations kprobe_blacklist_sops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; - -static int kprobe_blacklist_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobe_blacklist_seq_ops); -} - -static const struct file_operations debugfs_kprobe_blacklist_ops = { - .open = kprobe_blacklist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist); static int arm_all_kprobes(void) { @@ -2705,13 +2684,12 @@ static int __init debugfs_kprobe_init(void) dir = debugfs_create_dir("kprobes", NULL); - debugfs_create_file("list", 0400, dir, NULL, - &debugfs_kprobes_operations); + debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops); debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, - &debugfs_kprobe_blacklist_ops); + &kprobe_blacklist_fops); return 0; } -- cgit v1.2.3 From 986db2d14a6dca6456b63b4f5c410ae2aab4ec9d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jun 2020 16:51:14 -0700 Subject: exec: simplify the copy_strings_kernel calling convention copy_strings_kernel is always used with a single argument, adjust the calling convention to that. 
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexander Viro Link: http://lkml.kernel.org/r/20200501104105.2621149-2-hch@lst.de Signed-off-by: Linus Torvalds --- fs/binfmt_em86.c | 6 +++--- fs/binfmt_misc.c | 4 ++-- fs/binfmt_script.c | 6 +++--- fs/exec.c | 13 ++++++------- include/linux/binfmts.h | 3 +-- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 466497860c62..f33fa668c91f 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -68,15 +68,15 @@ static int load_em86(struct linux_binprm *bprm) * user environment and arguments are stored. */ remove_arg_zero(bprm); - retval = copy_strings_kernel(1, &bprm->filename, bprm); + retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) return retval; bprm->argc++; if (i_arg) { - retval = copy_strings_kernel(1, &i_arg, bprm); + retval = copy_string_kernel(i_arg, bprm); if (retval < 0) return retval; bprm->argc++; } - retval = copy_strings_kernel(1, &i_name, bprm); + retval = copy_string_kernel(i_name, bprm); if (retval < 0) return retval; bprm->argc++; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index cdb45829354d..b15257d8ff5e 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -190,13 +190,13 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->file = NULL; } /* make argv[1] be the path to the binary */ - retval = copy_strings_kernel(1, &bprm->interp, bprm); + retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) goto error; bprm->argc++; /* add the interp as argv[0] */ - retval = copy_strings_kernel(1, &fmt->interpreter, bprm); + retval = copy_string_kernel(fmt->interpreter, bprm); if (retval < 0) goto error; bprm->argc++; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index e9e6a6f4a35f..c4fb7f52a46e 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -117,17 +117,17 @@ static int load_script(struct linux_binprm *bprm) retval = remove_arg_zero(bprm); if (retval) return retval; - retval 
= copy_strings_kernel(1, &bprm->interp, bprm); + retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) return retval; bprm->argc++; if (i_arg) { - retval = copy_strings_kernel(1, &i_arg, bprm); + retval = copy_string_kernel(i_arg, bprm); if (retval < 0) return retval; bprm->argc++; } - retval = copy_strings_kernel(1, &i_name, bprm); + retval = copy_string_kernel(i_name, bprm); if (retval) return retval; bprm->argc++; diff --git a/fs/exec.c b/fs/exec.c index 2c465119affc..4814b26a56fb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -588,24 +588,23 @@ out: } /* - * Like copy_strings, but get argv and its values from kernel memory. + * Copy and argument/environment string from the kernel to the processes stack. */ -int copy_strings_kernel(int argc, const char *const *__argv, - struct linux_binprm *bprm) +int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int r; mm_segment_t oldfs = get_fs(); struct user_arg_ptr argv = { - .ptr.native = (const char __user *const __user *)__argv, + .ptr.native = (const char __user *const __user *)&arg, }; set_fs(KERNEL_DS); - r = copy_strings(argc, argv, bprm); + r = copy_strings(1, argv, bprm); set_fs(oldfs); return r; } -EXPORT_SYMBOL(copy_strings_kernel); +EXPORT_SYMBOL(copy_string_kernel); #ifdef CONFIG_MMU @@ -1865,7 +1864,7 @@ static int __do_execve_file(int fd, struct filename *filename, if (retval < 0) goto out; - retval = copy_strings_kernel(1, &bprm->filename, bprm); + retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index a345d9fed3d8..3d3afe094c97 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -144,8 +144,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, extern int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location); extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm); -extern int copy_strings_kernel(int argc, const char *const 
*argv, - struct linux_binprm *bprm); +int copy_string_kernel(const char *arg, struct linux_binprm *bprm); extern void install_exec_creds(struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t); -- cgit v1.2.3 From 762a3af6faa0682e5b30b67b1db156c7df55f2c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jun 2020 16:51:18 -0700 Subject: exec: open code copy_string_kernel Currently copy_string_kernel is just a wrapper around copy_strings that simplifies the calling conventions and uses set_fs to allow passing a kernel pointer. But due to the fact that we only need to handle a single kernel argument pointer, the logic can be significantly simplified while getting rid of the set_fs. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexander Viro Link: http://lkml.kernel.org/r/20200501104105.2621149-3-hch@lst.de Signed-off-by: Linus Torvalds --- fs/exec.c | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 4814b26a56fb..03a85e3c49bb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -592,17 +592,42 @@ out: */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { - int r; - mm_segment_t oldfs = get_fs(); - struct user_arg_ptr argv = { - .ptr.native = (const char __user *const __user *)&arg, - }; + int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; + unsigned long pos = bprm->p; + + if (len == 0) + return -EFAULT; + if (!valid_arg_len(bprm, len)) + return -E2BIG; + + /* We're going to work our way backwards.
*/ + arg += len; + bprm->p -= len; + if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin) + return -E2BIG; + + while (len > 0) { + unsigned int bytes_to_copy = min_t(unsigned int, len, + min_not_zero(offset_in_page(pos), PAGE_SIZE)); + struct page *page; + char *kaddr; - set_fs(KERNEL_DS); - r = copy_strings(1, argv, bprm); - set_fs(oldfs); + pos -= bytes_to_copy; + arg -= bytes_to_copy; + len -= bytes_to_copy; - return r; + page = get_arg_page(bprm, pos, 1); + if (!page) + return -E2BIG; + kaddr = kmap_atomic(page); + flush_arg_page(bprm, pos & PAGE_MASK, page); + memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); + flush_kernel_dcache_page(page); + kunmap_atomic(kaddr); + put_arg_page(page); + } + + return 0; } EXPORT_SYMBOL(copy_string_kernel); -- cgit v1.2.3 From e1c3cdb26ab881b77486dc50370356a349077c74 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Thu, 4 Jun 2020 16:51:21 -0700 Subject: rapidio: avoid data race between file operation callbacks and mport_cdev_add(). Fields of md(mport_dev) are set after cdev_device_add(). However, the file operation callbacks can be called after cdev_device_add() and therefore accesses to fields of md in the callbacks can race with the rest of the mport_cdev_add() function. One such example is INIT_LIST_HEAD(&md->portwrites) in mport_cdev_add(), the list is initialised after cdev_device_add(). This can race with list_add_tail(&pw_filter->md_node,&md->portwrites) in rio_mport_add_pw_filter() which is called by unlocked_ioctl. To avoid such data races use cdev_device_add() after initializing md. Found by Linux Driver Verification project (linuxtesting.org). 
Signed-off-by: Madhuparna Bhowmik Signed-off-by: Andrew Morton Acked-by: Alexandre Bounine Cc: Matt Porter Cc: Dan Carpenter Cc: Mike Marshall Cc: Thomas Gleixner Cc: Ira Weiny Cc: Allison Randal Cc: Pavel Andrianov Link: http://lkml.kernel.org/r/20200426112950.1803-1-madhuparnabhowmik10@gmail.com Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 10af330153b5..0b85a80ae7ef 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -2384,13 +2384,6 @@ static struct mport_dev *mport_cdev_add(struct rio_mport *mport) cdev_init(&md->cdev, &mport_fops); md->cdev.owner = THIS_MODULE; - ret = cdev_device_add(&md->cdev, &md->dev); - if (ret) { - rmcd_error("Failed to register mport %d (err=%d)", - mport->id, ret); - goto err_cdev; - } - INIT_LIST_HEAD(&md->doorbells); spin_lock_init(&md->db_lock); INIT_LIST_HEAD(&md->portwrites); @@ -2410,6 +2403,13 @@ static struct mport_dev *mport_cdev_add(struct rio_mport *mport) #else md->properties.transfer_mode |= RIO_TRANSFER_MODE_TRANSFER; #endif + + ret = cdev_device_add(&md->cdev, &md->dev); + if (ret) { + rmcd_error("Failed to register mport %d (err=%d)", + mport->id, ret); + goto err_cdev; + } ret = rio_query_mport(mport, &attr); if (!ret) { md->properties.flags = attr.flags; -- cgit v1.2.3 From 67446283d89467edae5ad9f632ee31f0fbae35fe Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 4 Jun 2020 16:51:24 -0700 Subject: rapidio: convert get_user_pages() --> pin_user_pages() This code was using get_user_pages_fast(), in a "Case 2" scenario (DMA/RDMA), using the categorization from [1]. That means that it's time to convert the get_user_pages_fast() + put_page() calls to pin_user_pages_fast() + unpin_user_pages() calls. 
There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. [1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Cc: Matt Porter Cc: Alexandre Bounine Cc: Sumit Semwal Cc: Dan Carpenter Link: http://lkml.kernel.org/r/20200517235620.205225-3-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 0b85a80ae7ef..451608e960a1 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -572,14 +572,12 @@ static void dma_req_free(struct kref *ref) struct mport_dma_req *req = container_of(ref, struct mport_dma_req, refcount); struct mport_cdev_priv *priv = req->priv; - unsigned int i; dma_unmap_sg(req->dmach->device->dev, req->sgt.sgl, req->sgt.nents, req->dir); sg_free_table(&req->sgt); if (req->page_list) { - for (i = 0; i < req->nr_pages; i++) - put_page(req->page_list[i]); + unpin_user_pages(req->page_list, req->nr_pages); kfree(req->page_list); } @@ -815,7 +813,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, struct mport_dma_req *req; struct mport_dev *md = priv->md; struct dma_chan *chan; - int i, ret; + int ret; int nents; if (xfer->length == 0) @@ -862,7 +860,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, goto err_req; } - pinned = get_user_pages_fast( + pinned = pin_user_pages_fast( (unsigned long)xfer->loc_addr & PAGE_MASK, nr_pages, dir == DMA_FROM_DEVICE ? 
FOLL_WRITE : 0, @@ -870,7 +868,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, if (pinned != nr_pages) { if (pinned < 0) { - rmcd_error("get_user_pages_unlocked err=%ld", + rmcd_error("pin_user_pages_fast err=%ld", pinned); nr_pages = 0; } else @@ -951,8 +949,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, err_pg: if (!req->page_list) { - for (i = 0; i < nr_pages; i++) - put_page(page_list[i]); + unpin_user_pages(page_list, nr_pages); kfree(page_list); } err_req: -- cgit v1.2.3 From 54e200ab40fc14c863bcc80a51e20b7906608fce Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Thu, 4 Jun 2020 16:51:27 -0700 Subject: kernel/relay.c: handle alloc_percpu returning NULL in relay_open alloc_percpu() may return NULL, which means chan->buf may be set to NULL. In that case, when we do *per_cpu_ptr(chan->buf, ...), we dereference an invalid pointer: BUG: Unable to handle kernel data access at 0x7dae0000 Faulting instruction address: 0xc0000000003f3fec ... NIP relay_open+0x29c/0x600 LR relay_open+0x270/0x600 Call Trace: relay_open+0x264/0x600 (unreliable) __blk_trace_setup+0x254/0x600 blk_trace_setup+0x68/0xa0 sg_ioctl+0x7bc/0x2e80 do_vfs_ioctl+0x13c/0x1300 ksys_ioctl+0x94/0x130 sys_ioctl+0x48/0xb0 system_call+0x5c/0x68 Check if alloc_percpu returns NULL. This was found by syzkaller both on x86 and powerpc, and the reproducer it found on powerpc is capable of hitting the issue as an unprivileged user. 
Fixes: 017c59c042d0 ("relay: Use per CPU constructs for the relay channel buffer pointers") Reported-by: syzbot+1e925b4b836afe85a1c6@syzkaller-ppc64.appspotmail.com Reported-by: syzbot+587b2421926808309d21@syzkaller-ppc64.appspotmail.com Reported-by: syzbot+58320b7171734bf79d26@syzkaller.appspotmail.com Reported-by: syzbot+d6074fb08bdb2e010520@syzkaller.appspotmail.com Signed-off-by: Daniel Axtens Signed-off-by: Andrew Morton Reviewed-by: Michael Ellerman Reviewed-by: Andrew Donnellan Acked-by: David Rientjes Cc: Akash Goel Cc: Andrew Donnellan Cc: Guenter Roeck Cc: Salvatore Bonaccorso Cc: [4.10+] Link: http://lkml.kernel.org/r/20191219121256.26480-1-dja@axtens.net Signed-off-by: Linus Torvalds --- kernel/relay.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/relay.c b/kernel/relay.c index 90c7a002436d..dc82705e1cff 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -581,6 +581,11 @@ struct rchan *relay_open(const char *base_filename, return NULL; chan->buf = alloc_percpu(struct rchan_buf *); + if (!chan->buf) { + kfree(chan); + return NULL; + } + chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; -- cgit v1.2.3 From 341a7213e5c1ce274cc0f02270054905800ea660 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Thu, 4 Jun 2020 16:51:30 -0700 Subject: kernel/relay.c: fix read_pos error when multiple readers When reading, read_pos should start with bytes_consumed, not file->f_pos. Because when there is more than one reader, the read_pos corresponding to file->f_pos may have been consumed, which will cause the data that has been consumed to be read and the bytes_consumed update error. 
Signed-off-by: Pengcheng Yang Signed-off-by: Andrew Morton Reviewed-by: Jens Axboe Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: Al Viro Link: http://lkml.kernel.org/r/1579691175-28949-1-git-send-email-yangpc@wangsu.com Signed-off-by: Linus Torvalds --- kernel/relay.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index dc82705e1cff..204867220f8a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -996,14 +996,14 @@ static void relay_file_read_consume(struct rchan_buf *buf, /* * relay_file_read_avail - boolean, are there unconsumed bytes available? */ -static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; size_t consumed = buf->subbufs_consumed; - relay_file_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, 0, 0); consumed = buf->subbufs_consumed; @@ -1064,23 +1064,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read - * @read_pos: file read position * @buf: relay channel buffer * - * If the @read_pos is in the middle of padding, return the + * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value.
*/ -static size_t relay_file_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t consumed = buf->subbufs_consumed % n_subbufs; + size_t read_pos = consumed * subbuf_size + buf->bytes_consumed; - if (!read_pos) - read_pos = consumed * subbuf_size + buf->bytes_consumed; read_subbuf = read_pos / subbuf_size; padding = buf->padding[read_subbuf]; padding_start = (read_subbuf + 1) * subbuf_size - padding; @@ -1136,10 +1133,10 @@ static ssize_t relay_file_read(struct file *filp, do { void *from; - if (!relay_file_read_avail(buf, *ppos)) + if (!relay_file_read_avail(buf)) break; - read_start = relay_file_read_start_pos(*ppos, buf); + read_start = relay_file_read_start_pos(buf); avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) break; -- cgit v1.2.3 From 804eb64615a405b3765fb3618bc07d7d95809add Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:51:34 -0700 Subject: selftests/x86/pkeys: move selftests to arch-neutral directory Patch series "selftests, powerpc, x86: Memory Protection Keys", v19. Memory protection keys enables an application to protect its address space from inadvertent access by its own code. This feature is now enabled on powerpc and has been available since 4.16-rc1. The patches move the selftests to arch neutral directory and enhance their test coverage. Tested on powerpc64 and x86_64 (Skylake-SP). This patch (of 24): Move selftest files from tools/testing/selftests/x86/ to tools/testing/selftests/vm/. Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Ingo Molnar Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. 
Nunes do Rosario" Cc: Michal Hocko Cc: "Aneesh Kumar K.V" Cc: Michal Suchanek Cc: Michael Ellerman Cc: Shuah Khan Link: http://lkml.kernel.org/r/14d25194c3e2e652e0047feec4487e269e76e8c9.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/pkey-helpers.h | 219 ++++ tools/testing/selftests/vm/protection_keys.c | 1506 +++++++++++++++++++++++++ tools/testing/selftests/x86/.gitignore | 1 - tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/pkey-helpers.h | 219 ---- tools/testing/selftests/x86/protection_keys.c | 1506 ------------------------- 8 files changed, 1728 insertions(+), 1727 deletions(-) create mode 100644 tools/testing/selftests/vm/pkey-helpers.h create mode 100644 tools/testing/selftests/vm/protection_keys.c delete mode 100644 tools/testing/selftests/x86/pkey-helpers.h delete mode 100644 tools/testing/selftests/x86/protection_keys.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 8df6a074e370..849e8226395a 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -10,6 +10,7 @@ mlock2-tests mremap_dontunmap on-fault-limit transhuge-stress +protection_keys userfaultfd mlock-intersect-test mlock-random-test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 9f18440080ef..c3b559ea97c5 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -15,6 +15,7 @@ TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_populate TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests +TEST_GEN_FILES += protection_keys TEST_GEN_FILES += mremap_dontunmap TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h new file mode 100644 index 
000000000000..254e5436bdd9 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PKEYS_HELPER_H +#define _PKEYS_HELPER_H +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NR_PKEYS 16 +#define PKRU_BITS_PER_PKEY 2 + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 +extern int dprint_in_signal; +extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; +static inline void sigsafe_printf(const char *format, ...) +{ + va_list ap; + + if (!dprint_in_signal) { + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + } else { + int ret; + /* + * No printf() functions are signal-safe. + * They deadlock easily. Write the format + * string to get some output, even if + * incomplete. + */ + ret = write(1, format, strlen(format)); + if (ret < 0) + exit(1); + } +} +#define dprintf_level(level, args...) do { \ + if (level <= DEBUG_LEVEL) \ + sigsafe_printf(args); \ +} while (0) +#define dprintf0(args...) dprintf_level(0, args) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) 
dprintf_level(4, args) + +extern unsigned int shadow_pkru; +static inline unsigned int __rdpkru(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned int pkru; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkru = eax; + return pkru; +} + +static inline unsigned int _rdpkru(int line) +{ + unsigned int pkru = __rdpkru(); + + dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n", + line, pkru, shadow_pkru); + assert(pkru == shadow_pkru); + + return pkru; +} + +#define rdpkru() _rdpkru(__LINE__) + +static inline void __wrpkru(unsigned int pkru) +{ + unsigned int eax = pkru; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkru == __rdpkru()); +} + +static inline void wrpkru(unsigned int pkru) +{ + dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); + /* will do the shadow check for us: */ + rdpkru(); + __wrpkru(pkru); + shadow_pkru = pkru; + dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru()); +} + +/* + * These are technically racy. since something could + * change PKRU between the read and the write. 
+ */ +static inline void __pkey_access_allow(int pkey, int do_allow) +{ + unsigned int pkru = rdpkru(); + int bit = pkey * 2; + + if (do_allow) + pkru &= (1<mmap (see exit_mmap()), so make sure it is immune to pkeys + * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel + * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks + * + * Compile like this: + * gcc -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm + * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pkey-helpers.h" + +int iteration_nr = 1; +int test_nr; + +unsigned int shadow_pkru; + +#define HPAGE_SIZE (1UL<<21) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) +#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) +#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + +int dprint_in_signal; +char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + exit(__LINE__); \ + } \ +} while (0) + +void cat_into_file(char *str, char *file) +{ + int fd = open(file, O_RDWR); + int ret; + + dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); + /* + * these need to be raw because they are called under + * pkey_assert() + */ + if (fd < 0) { + fprintf(stderr, "error opening '%s'\n", str); + perror("error: "); + exit(__LINE__); + } + + ret = write(fd, str, strlen(str)); + if (ret != strlen(str)) { + perror("write to file failed"); + fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); + exit(__LINE__); + } + close(fd); +} + +#if CONTROL_TRACING > 0 +static int warned_tracing; +int tracing_root_ok(void) +{ + if (geteuid() != 0) { + if (!warned_tracing) + fprintf(stderr, "WARNING: not run as root, " + "can not do tracing control\n"); + warned_tracing = 1; + return 0; + } + return 1; +} +#endif + +void tracing_on(void) +{ +#if CONTROL_TRACING > 0 +#define TRACEDIR "/sys/kernel/debug/tracing" + char pidstr[32]; + + if (!tracing_root_ok()) + return; + + sprintf(pidstr, "%d", getpid()); + cat_into_file("0", TRACEDIR "/tracing_on"); + cat_into_file("\n", TRACEDIR "/trace"); + if (1) { + cat_into_file("function_graph", TRACEDIR "/current_tracer"); + cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); + } else { + cat_into_file("nop", TRACEDIR "/current_tracer"); + } + cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); + cat_into_file("1", TRACEDIR "/tracing_on"); + dprintf1("enabled tracing\n"); +#endif +} + +void tracing_off(void) +{ +#if CONTROL_TRACING > 0 + if (!tracing_root_ok()) + return; + 
cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); +#endif +} + +void abort_hooks(void) +{ + fprintf(stderr, "running %s()...\n", __func__); + tracing_off(); +#ifdef SLEEP_ON_ABORT + sleep(SLEEP_ON_ABORT); +#endif +} + +static inline void __page_o_noops(void) +{ + /* 8-bytes of instruction * 512 bytes = 1 page */ + asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); +} + +/* + * This attempts to have roughly a page of instructions followed by a few + * instructions that do a write, and another page of instructions. That + * way, we are pretty sure that the write is in the second page of + * instructions and has at least a page of padding behind it. + * + * *That* lets us be sure to madvise() away the write instruction, which + * will then fault, which makes sure that the fault code handles + * execute-only memory properly. + */ +__attribute__((__aligned__(PAGE_SIZE))) +void lots_o_noops_around_write(int *write_to_me) +{ + dprintf3("running %s()\n", __func__); + __page_o_noops(); + /* Assume this happens in the second page of instructions: */ + *write_to_me = __LINE__; + /* pad out by another page: */ + __page_o_noops(); + dprintf3("%s() done\n", __func__); +} + +/* Define some kernel-like types */ +#define u8 uint8_t +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t + +#ifdef __i386__ + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif + +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x14 + +#else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif + +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 + +#endif + +void dump_mem(void *dumpme, int len_bytes) +{ + char *c = (void *)dumpme; + int i; + + for (i = 0; i < len_bytes; i += sizeof(u64)) { + u64 *ptr = (u64 *)(c + i); + dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, 
*ptr); + } +} + +/* Failed address bound checks: */ +#ifndef SEGV_BNDERR +# define SEGV_BNDERR 3 +#endif + +#ifndef SEGV_PKUERR +# define SEGV_PKUERR 4 +#endif + +static char *si_code_str(int si_code) +{ + if (si_code == SEGV_MAPERR) + return "SEGV_MAPERR"; + if (si_code == SEGV_ACCERR) + return "SEGV_ACCERR"; + if (si_code == SEGV_BNDERR) + return "SEGV_BNDERR"; + if (si_code == SEGV_PKUERR) + return "SEGV_PKUERR"; + return "UNKNOWN"; +} + +int pkru_faults; +int last_si_pkey = -1; +void signal_handler(int signum, siginfo_t *si, void *vucontext) +{ + ucontext_t *uctxt = vucontext; + int trapno; + unsigned long ip; + char *fpregs; + u32 *pkru_ptr; + u64 siginfo_pkey; + u32 *si_pkey_ptr; + int pkru_offset; + fpregset_t fpregset; + + dprint_in_signal = 1; + dprintf1(">>>>===============SIGSEGV============================\n"); + dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, + __rdpkru(), shadow_pkru); + + trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; + ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + fpregset = uctxt->uc_mcontext.fpregs; + fpregs = (void *)fpregset; + + dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__, + trapno, ip, si_code_str(si->si_code), si->si_code); +#ifdef __i386__ + /* + * 32-bit has some extra padding so that userspace can tell whether + * the XSTATE header is present in addition to the "legacy" FPU + * state. We just assume that it is here. + */ + fpregs += 0x70; +#endif + pkru_offset = pkru_xstate_offset(); + pkru_ptr = (void *)(&fpregs[pkru_offset]); + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); + /* + * If we got a PKRU fault, we *HAVE* to have at least one bit set in + * here. 
+ */ + dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset()); + if (DEBUG_LEVEL > 4) + dump_mem(pkru_ptr - 128, 256); + pkey_assert(*pkru_ptr); + + if ((si->si_code == SEGV_MAPERR) || + (si->si_code == SEGV_ACCERR) || + (si->si_code == SEGV_BNDERR)) { + printf("non-PK si_code, exiting...\n"); + exit(4); + } + + si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset); + dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); + dump_mem((u8 *)si_pkey_ptr - 8, 24); + siginfo_pkey = *si_pkey_ptr; + pkey_assert(siginfo_pkey < NR_PKEYS); + last_si_pkey = siginfo_pkey; + + dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); + /* need __rdpkru() version so we do not do shadow_pkru checking */ + dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); + dprintf1("pkey from siginfo: %jx\n", siginfo_pkey); + *(u64 *)pkru_ptr = 0x00000000; + dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); + pkru_faults++; + dprintf1("<<<<==================================================\n"); + dprint_in_signal = 0; +} + +int wait_all_children(void) +{ + int status; + return waitpid(-1, &status, 0); +} + +void sig_chld(int x) +{ + dprint_in_signal = 1; + dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); + dprint_in_signal = 0; +} + +void setup_sigsegv_handler(void) +{ + int r, rs; + struct sigaction newact; + struct sigaction oldact; + + /* #PF is mapped to sigsegv */ + int signum = SIGSEGV; + + newact.sa_handler = 0; + newact.sa_sigaction = signal_handler; + + /*sigset_t - signals to block while in the handler */ + /* get the old signal mask. 
*/ + rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); + pkey_assert(rs == 0); + + /* call sa_sigaction, not sa_handler*/ + newact.sa_flags = SA_SIGINFO; + + newact.sa_restorer = 0; /* void(*)(), obsolete */ + r = sigaction(signum, &newact, &oldact); + r = sigaction(SIGALRM, &newact, &oldact); + pkey_assert(r == 0); +} + +void setup_handlers(void) +{ + signal(SIGCHLD, &sig_chld); + setup_sigsegv_handler(); +} + +pid_t fork_lazy_child(void) +{ + pid_t forkret; + + forkret = fork(); + pkey_assert(forkret >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); + + if (!forkret) { + /* in the child */ + while (1) { + dprintf1("child sleeping...\n"); + sleep(30); + } + } + return forkret; +} + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u32 pkru = __rdpkru(); + u32 shifted_pkru; + u32 masked_pkru; + + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkru: %x\n", __func__, pkru); + + shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY)); + dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru); + masked_pkru = shifted_pkru & mask; + dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other high bits. 
+ */ + return masked_pkru; +} + +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u32 old_pkru = __rdpkru(); + u32 new_pkru; + + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); + + /* copy old pkru */ + new_pkru = old_pkru; + /* mask out bits from pkey in old value: */ + new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY)); + /* OR in new bits for pkey: */ + new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY)); + + __wrpkru(new_pkru); + + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n", + __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru); + return 0; +} + +void pkey_disable_set(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u32 orig_pkru = rdpkru(); + + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /*pkru and flags have the same format */ + shadow_pkru |= flags << (pkey * 2); + dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru); + + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); + if (flags) + pkey_assert(rdpkru() > orig_pkru); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = hw_pkey_get(pkey, syscall_flags); + u32 orig_pkru = rdpkru(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 
+ + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, 0); + /* pkru and flags have the same format */ + shadow_pkru &= ~(flags << (pkey * 2)); + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); + if (flags) + assert(rdpkru() > orig_pkru); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); +} + +int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int sret; + + dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, + ptr, size, orig_prot, pkey); + + errno = 0; + sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); + if (errno) { + dprintf2("SYS_mprotect_key sret: %d\n", sret); + dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); + dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); + if (DEBUG_LEVEL >= 2) + perror("SYS_mprotect_pkey"); + } + return sret; +} + +int sys_pkey_alloc(unsigned long flags, unsigned long init_val) +{ + int ret = syscall(SYS_pkey_alloc, flags, init_val); + dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", + __func__, flags, init_val, ret, errno); + return ret; +} + +int alloc_pkey(void) +{ + int ret; + unsigned long init_val = 0x0; + + dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n", + __LINE__, __rdpkru(), shadow_pkru); + ret = sys_pkey_alloc(0, init_val); + /* + * pkey_alloc() sets PKRU, so we need to reflect it in + * shadow_pkru: + */ + dprintf4("alloc_pkey()::%d, 
ret: %d pkru: 0x%x shadow: 0x%x\n", + __LINE__, ret, __rdpkru(), shadow_pkru); + if (ret) { + /* clear both the bits: */ + shadow_pkru &= ~(0x3 << (ret * 2)); + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", + __LINE__, ret, __rdpkru(), shadow_pkru); + /* + * move the new state in from init_val + * (remember, we cheated and init_val == pkru format) + */ + shadow_pkru |= (init_val << (ret * 2)); + } + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", + __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno); + /* for shadow checking: */ + rdpkru(); + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", + __LINE__, ret, __rdpkru(), shadow_pkru); + return ret; +} + +int sys_pkey_free(unsigned long pkey) +{ + int ret = syscall(SYS_pkey_free, pkey); + dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); + return ret; +} + +/* + * I had a bug where pkey bits could be set by mprotect() but + * not cleared. This ensures we get lots of random bit sets + * and clears on the vma and pte pkey bits. 
+ */ +int alloc_random_pkey(void) +{ + int max_nr_pkey_allocs; + int ret; + int i; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + int random_index; + memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + + /* allocate every possible key and make a note of which ones we got */ + max_nr_pkey_allocs = NR_PKEYS; + max_nr_pkey_allocs = 1; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + + pkey_assert(nr_alloced > 0); + /* select a random one out of the allocated ones */ + random_index = rand() % nr_alloced; + ret = alloced_pkeys[random_index]; + /* now zero it out so we don't free it next */ + alloced_pkeys[random_index] = 0; + + /* go through the allocated ones that we did not want and free them */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, + __LINE__, ret, __rdpkru(), shadow_pkru); + return ret; +} + +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int nr_iterations = random() % 100; + int ret; + + while (0) { + int rpkey = alloc_random_pkey(); + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); + dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", + ptr, size, orig_prot, pkey, ret); + if (nr_iterations-- < 0) + break; + + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, + __LINE__, ret, __rdpkru(), shadow_pkru); + sys_pkey_free(rpkey); + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, + __LINE__, ret, __rdpkru(), shadow_pkru); + } + pkey_assert(pkey < NR_PKEYS); + + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); + dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", + ptr, size, orig_prot, pkey, ret); + pkey_assert(!ret); + dprintf1("%s()::%d, ret: %d 
pkru: 0x%x shadow: 0x%x\n", __func__, + __LINE__, ret, __rdpkru(), shadow_pkru); + return ret; +} + +struct pkey_malloc_record { + void *ptr; + long size; + int prot; +}; +struct pkey_malloc_record *pkey_malloc_records; +struct pkey_malloc_record *pkey_last_malloc_record; +long nr_pkey_malloc_records; +void record_pkey_malloc(void *ptr, long size, int prot) +{ + long i; + struct pkey_malloc_record *rec = NULL; + + for (i = 0; i < nr_pkey_malloc_records; i++) { + rec = &pkey_malloc_records[i]; + /* find a free record */ + if (rec) + break; + } + if (!rec) { + /* every record is full */ + size_t old_nr_records = nr_pkey_malloc_records; + size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); + size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); + dprintf2("new_nr_records: %zd\n", new_nr_records); + dprintf2("new_size: %zd\n", new_size); + pkey_malloc_records = realloc(pkey_malloc_records, new_size); + pkey_assert(pkey_malloc_records != NULL); + rec = &pkey_malloc_records[nr_pkey_malloc_records]; + /* + * realloc() does not initialize memory, so zero it from + * the first new record all the way to the end. 
+ */ + for (i = 0; i < new_nr_records - old_nr_records; i++) + memset(rec + i, 0, sizeof(*rec)); + } + dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", + (int)(rec - pkey_malloc_records), rec, ptr, size); + rec->ptr = ptr; + rec->size = size; + rec->prot = prot; + pkey_last_malloc_record = rec; + nr_pkey_malloc_records++; +} + +void free_pkey_malloc(void *ptr) +{ + long i; + int ret; + dprintf3("%s(%p)\n", __func__, ptr); + for (i = 0; i < nr_pkey_malloc_records; i++) { + struct pkey_malloc_record *rec = &pkey_malloc_records[i]; + dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", + ptr, i, rec, rec->ptr, rec->size); + if ((ptr < rec->ptr) || + (ptr >= rec->ptr + rec->size)) + continue; + + dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", + ptr, i, rec, rec->ptr, rec->size); + nr_pkey_malloc_records--; + ret = munmap(rec->ptr, rec->size); + dprintf3("munmap ret: %d\n", ret); + pkey_assert(!ret); + dprintf3("clearing rec->ptr, rec: %p\n", rec); + rec->ptr = NULL; + dprintf3("done clearing rec->ptr, rec: %p\n", rec); + return; + } + pkey_assert(false); +} + + +void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + rdpkru(); + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + rdpkru(); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) +{ + int ret; + void *ptr; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + /* + * Guarantee we can fit at least one huge page in the resulting + * allocation by allocating space for 2: + */ + size = ALIGN_UP(size, HPAGE_SIZE * 2); + ptr = mmap(NULL, size, 
PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + record_pkey_malloc(ptr, size, prot); + mprotect_pkey(ptr, size, prot, pkey); + + dprintf1("unaligned ptr: %p\n", ptr); + ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); + dprintf1(" aligned ptr: %p\n", ptr); + ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); + dprintf1("MADV_HUGEPAGE ret: %d\n", ret); + ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); + dprintf1("MADV_WILLNEED ret: %d\n", ret); + memset(ptr, 0, HPAGE_SIZE); + + dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); + return ptr; +} + +int hugetlb_setup_ok; +#define GET_NR_HUGE_PAGES 10 +void setup_hugetlbfs(void) +{ + int err; + int fd; + char buf[] = "123"; + + if (geteuid() != 0) { + fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); + return; + } + + cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); + + /* + * Now go make sure that we got the pages and that they + * are 2M pages. Someone might have made 1G the default. 
+ */ + fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); + if (fd < 0) { + perror("opening sysfs 2M hugetlb config"); + return; + } + + /* -1 to guarantee leaving the trailing \0 */ + err = read(fd, buf, sizeof(buf)-1); + close(fd); + if (err <= 0) { + perror("reading sysfs 2M hugetlb config"); + return; + } + + if (atoi(buf) != GET_NR_HUGE_PAGES) { + fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", + buf, GET_NR_HUGE_PAGES); + return; + } + + hugetlb_setup_ok = 1; +} + +void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) +{ + void *ptr; + int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; + + if (!hugetlb_setup_ok) + return PTR_ERR_ENOTSUP; + + dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); + size = ALIGN_UP(size, HPAGE_SIZE * 2); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); + pkey_assert(ptr != (void *)-1); + mprotect_pkey(ptr, size, prot, pkey); + + record_pkey_malloc(ptr, size, prot); + + dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); + return ptr; +} + +void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) +{ + void *ptr; + int fd; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + fd = open("/dax/foo", O_RDWR); + pkey_assert(fd >= 0); + + ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); + pkey_assert(ptr != (void *)-1); + + mprotect_pkey(ptr, size, prot, pkey); + + record_pkey_malloc(ptr, size, prot); + + dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); + close(fd); + return ptr; +} + +void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { + + malloc_pkey_with_mprotect, + malloc_pkey_anon_huge, + malloc_pkey_hugetlb +/* can not do direct with the pkey_mprotect() API: + malloc_pkey_mmap_direct, + malloc_pkey_mmap_dax, +*/ +}; + +void *malloc_pkey(long size, int prot, u16 pkey) +{ + void *ret; + static int malloc_type; + int nr_malloc_types = 
ARRAY_SIZE(pkey_malloc); + + pkey_assert(pkey < NR_PKEYS); + + while (1) { + pkey_assert(malloc_type < nr_malloc_types); + + ret = pkey_malloc[malloc_type](size, prot, pkey); + pkey_assert(ret != (void *)-1); + + malloc_type++; + if (malloc_type >= nr_malloc_types) + malloc_type = (random()%nr_malloc_types); + + /* try again if the malloc_type we tried is unsupported */ + if (ret == PTR_ERR_ENOTSUP) + continue; + + break; + } + + dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, + size, prot, pkey, ret); + return ret; +} + +int last_pkru_faults; +#define UNKNOWN_PKEY -2 +void expected_pk_fault(int pkey) +{ + dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", + __func__, last_pkru_faults, pkru_faults); + dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); + pkey_assert(last_pkru_faults + 1 == pkru_faults); + + /* + * For exec-only memory, we do not know the pkey in + * advance, so skip this check. + */ + if (pkey != UNKNOWN_PKEY) + pkey_assert(last_si_pkey == pkey); + + /* + * The signal handler shold have cleared out PKRU to let the + * test program continue. We now have to restore it. 
+ */ + if (__rdpkru() != 0) + pkey_assert(0); + + __wrpkru(shadow_pkru); + dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n", + __func__, shadow_pkru); + last_pkru_faults = pkru_faults; + last_si_pkey = -1; +} + +#define do_not_expect_pk_fault(msg) do { \ + if (last_pkru_faults != pkru_faults) \ + dprintf0("unexpected PK fault: %s\n", msg); \ + pkey_assert(last_pkru_faults == pkru_faults); \ +} while (0) + +int test_fds[10] = { -1 }; +int nr_test_fds; +void __save_test_fd(int fd) +{ + pkey_assert(fd >= 0); + pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); + test_fds[nr_test_fds] = fd; + nr_test_fds++; +} + +int get_test_read_fd(void) +{ + int test_fd = open("/etc/passwd", O_RDONLY); + __save_test_fd(test_fd); + return test_fd; +} + +void close_test_fds(void) +{ + int i; + + for (i = 0; i < nr_test_fds; i++) { + if (test_fds[i] < 0) + continue; + close(test_fds[i]); + test_fds[i] = -1; + } + nr_test_fds = 0; +} + +#define barrier() __asm__ __volatile__("": : :"memory") +__attribute__((noinline)) int read_ptr(int *ptr) +{ + /* + * Keep GCC from optimizing this away somehow + */ + barrier(); + return *ptr; +} + +void test_read_of_write_disabled_region(int *ptr, u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling write access to PKEY[1], doing read\n"); + pkey_write_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + dprintf1("\n"); +} +void test_read_of_access_disabled_region(int *ptr, u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); + rdpkru(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pk_fault(pkey); +} +void test_write_of_write_disabled_region(int *ptr, u16 pkey) +{ + dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pk_fault(pkey); +} +void test_write_of_access_disabled_region(int *ptr, u16 pkey) +{ + 
dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pk_fault(pkey); +} +void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) +{ + int ret; + int test_fd = get_test_read_fd(); + + dprintf1("disabling access to PKEY[%02d], " + "having kernel read() to buffer\n", pkey); + pkey_access_deny(pkey); + ret = read(test_fd, ptr, 1); + dprintf1("read ret: %d\n", ret); + pkey_assert(ret); +} +void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) +{ + int ret; + int test_fd = get_test_read_fd(); + + pkey_write_deny(pkey); + ret = read(test_fd, ptr, 100); + dprintf1("read ret: %d\n", ret); + if (ret < 0 && (DEBUG_LEVEL > 0)) + perror("verbose read result (OK for this to be bad)"); + pkey_assert(ret); +} + +void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) +{ + int pipe_ret, vmsplice_ret; + struct iovec iov; + int pipe_fds[2]; + + pipe_ret = pipe(pipe_fds); + + pkey_assert(pipe_ret == 0); + dprintf1("disabling access to PKEY[%02d], " + "having kernel vmsplice from buffer\n", pkey); + pkey_access_deny(pkey); + iov.iov_base = ptr; + iov.iov_len = PAGE_SIZE; + vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); + dprintf1("vmsplice() ret: %d\n", vmsplice_ret); + pkey_assert(vmsplice_ret == -1); + + close(pipe_fds[0]); + close(pipe_fds[1]); +} + +void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) +{ + int ignored = 0xdada; + int futex_ret; + int some_int = __LINE__; + + dprintf1("disabling write to PKEY[%02d], " + "doing futex gunk in buffer\n", pkey); + *ptr = some_int; + pkey_write_deny(pkey); + futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, + &ignored, ignored); + if (DEBUG_LEVEL > 0) + perror("futex"); + dprintf1("futex() ret: %d\n", futex_ret); +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) +{ + int err; + int i; + + /* Note: 0 
is the default pkey, so don't mess with it */ + for (i = 1; i < NR_PKEYS; i++) { + if (pkey == i) + continue; + + dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); + err = sys_pkey_free(i); + pkey_assert(err); + + err = sys_pkey_free(i); + pkey_assert(err); + + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); + pkey_assert(err); + } +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) +{ + int err; + int bad_pkey = NR_PKEYS+99; + + /* pass a known-invalid pkey in: */ + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); + pkey_assert(err); +} + +void become_child(void) +{ + pid_t forkret; + + forkret = fork(); + pkey_assert(forkret >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); + + if (!forkret) { + /* in the child */ + return; + } + exit(0); +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_alloc_exhaust(int *ptr, u16 pkey) +{ + int err; + int allocated_pkeys[NR_PKEYS] = {0}; + int nr_allocated_pkeys = 0; + int i; + + for (i = 0; i < NR_PKEYS*3; i++) { + int new_pkey; + dprintf1("%s() alloc loop: %d\n", __func__, i); + new_pkey = alloc_pkey(); + dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__, + __LINE__, err, __rdpkru(), shadow_pkru); + rdpkru(); /* for shadow checking */ + dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); + if ((new_pkey == -1) && (errno == ENOSPC)) { + dprintf2("%s() failed to allocate pkey after %d tries\n", + __func__, nr_allocated_pkeys); + } else { + /* + * Ensure the number of successes never + * exceeds the number of keys supported + * in the hardware. + */ + pkey_assert(nr_allocated_pkeys < NR_PKEYS); + allocated_pkeys[nr_allocated_pkeys++] = new_pkey; + } + + /* + * Make sure that allocation state is properly + * preserved across fork(). 
+ */ + if (i == NR_PKEYS*2) + become_child(); + } + + dprintf3("%s()::%d\n", __func__, __LINE__); + + /* + * There are 16 pkeys supported in hardware. Three are + * allocated by the time we get here: + * 1. The default key (0) + * 2. One possibly consumed by an execute-only mapping. + * 3. One allocated by the test code and passed in via + * 'pkey' to this function. + * Ensure that we can allocate at least another 13 (16-3). + */ + pkey_assert(i >= NR_PKEYS-3); + + for (i = 0; i < nr_allocated_pkeys; i++) { + err = sys_pkey_free(allocated_pkeys[i]); + pkey_assert(!err); + rdpkru(); /* for shadow checking */ + } +} + +/* + * pkey 0 is special. It is allocated by default, so you do not + * have to call pkey_alloc() to use it first. Make sure that it + * is usable. + */ +void test_mprotect_with_pkey_0(int *ptr, u16 pkey) +{ + long size; + int prot; + + assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. + */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + prot = pkey_last_malloc_record->prot; + + /* Use pkey 0 */ + mprotect_pkey(ptr, size, prot, 0); + + /* Make sure that we can set it back to the original pkey. */ + mprotect_pkey(ptr, size, prot, pkey); +} + +void test_ptrace_of_child(int *ptr, u16 pkey) +{ + __attribute__((__unused__)) int peek_result; + pid_t child_pid; + void *ignored = 0; + long ret; + int status; + /* + * This is the "control" for our little expermient. Make sure + * we can always access it when ptracing. + */ + int *plain_ptr_unaligned = malloc(HPAGE_SIZE); + int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); + + /* + * Fork a child which is an exact copy of this process, of course. + * That means we can do all of our tests via ptrace() and then plain + * memory access and ensure they work differently. 
+ */ + child_pid = fork_lazy_child(); + dprintf1("[%d] child pid: %d\n", getpid(), child_pid); + + ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); + if (ret) + perror("attach"); + dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); + pkey_assert(ret != -1); + ret = waitpid(child_pid, &status, WUNTRACED); + if ((ret != child_pid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitpid result %ld stat %x\n", + ret, status); + pkey_assert(0); + } + dprintf2("waitpid ret: %ld\n", ret); + dprintf2("waitpid status: %d\n", status); + + pkey_access_deny(pkey); + pkey_write_deny(pkey); + + /* Write access, untested for now: + ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); + pkey_assert(ret != -1); + dprintf1("poke at %p: %ld\n", peek_at, ret); + */ + + /* + * Try to access the pkey-protected "ptr" via ptrace: + */ + ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); + /* expect it to work, without an error: */ + pkey_assert(ret != -1); + /* Now access from the current task, and expect an exception: */ + peek_result = read_ptr(ptr); + expected_pk_fault(pkey); + + /* + * Try to access the NON-pkey-protected "plain_ptr" via ptrace: + */ + ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); + /* expect it to work, without an error: */ + pkey_assert(ret != -1); + /* Now access from the current task, and expect NO exception: */ + peek_result = read_ptr(plain_ptr); + do_not_expect_pk_fault("read plain pointer after ptrace"); + + ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); + pkey_assert(ret != -1); + + ret = kill(child_pid, SIGKILL); + pkey_assert(ret != -1); + + wait(&status); + + free(plain_ptr_unaligned); +} + +void *get_pointer_to_instructions(void) +{ + void *p1; + + p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); + dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); + /* lots_o_noops_around_write should be page-aligned already */ + assert(p1 == &lots_o_noops_around_write); + + /* Point 'p1' at the 
*second* page of the function: */ + p1 += PAGE_SIZE; + + /* + * Try to ensure we fault this in on next touch to ensure + * we get an instruction fault as opposed to a data one + */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + + return p1; +} + +void test_executing_on_unreadable_memory(int *ptr, u16 pkey) +{ + void *p1; + int scratch; + int ptr_contents; + int ret; + + p1 = get_pointer_to_instructions(); + lots_o_noops_around_write(&scratch); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); + pkey_assert(!ret); + pkey_access_deny(pkey); + + dprintf2("pkru: %x\n", rdpkru()); + + /* + * Make sure this is an *instruction* fault + */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + lots_o_noops_around_write(&scratch); + do_not_expect_pk_fault("executing on PROT_EXEC memory"); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pk_fault(pkey); +} + +void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) +{ + void *p1; + int scratch; + int ptr_contents; + int ret; + + dprintf1("%s() start\n", __func__); + + p1 = get_pointer_to_instructions(); + lots_o_noops_around_write(&scratch); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + + /* Use a *normal* mprotect(), not mprotect_pkey(): */ + ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); + pkey_assert(!ret); + + dprintf2("pkru: %x\n", rdpkru()); + + /* Make sure this is an *instruction* fault */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + lots_o_noops_around_write(&scratch); + do_not_expect_pk_fault("executing on PROT_EXEC memory"); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pk_fault(UNKNOWN_PKEY); + + /* + * Put the memory back to non-PROT_EXEC. Should clear the + * exec-only pkey off the VMA and allow it to be readable + * again. 
Go to PROT_NONE first to check for a kernel bug + * that did not clear the pkey when doing PROT_NONE. + */ + ret = mprotect(p1, PAGE_SIZE, PROT_NONE); + pkey_assert(!ret); + + ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); + pkey_assert(!ret); + ptr_contents = read_ptr(p1); + do_not_expect_pk_fault("plain read on recently PROT_EXEC area"); +} + +void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) +{ + int size = PAGE_SIZE; + int sret; + + if (cpu_has_pku()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return; + } + + sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); + pkey_assert(sret < 0); +} + +void (*pkey_tests[])(int *ptr, u16 pkey) = { + test_read_of_write_disabled_region, + test_read_of_access_disabled_region, + test_write_of_write_disabled_region, + test_write_of_access_disabled_region, + test_kernel_write_of_access_disabled_region, + test_kernel_write_of_write_disabled_region, + test_kernel_gup_of_access_disabled_region, + test_kernel_gup_write_to_write_disabled_region, + test_executing_on_unreadable_memory, + test_implicit_mprotect_exec_only_memory, + test_mprotect_with_pkey_0, + test_ptrace_of_child, + test_pkey_syscalls_on_non_allocated_pkey, + test_pkey_syscalls_bad_args, + test_pkey_alloc_exhaust, +}; + +void run_tests_once(void) +{ + int *ptr; + int prot = PROT_READ|PROT_WRITE; + + for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { + int pkey; + int orig_pkru_faults = pkru_faults; + + dprintf1("======================\n"); + dprintf1("test %d preparing...\n", test_nr); + + tracing_on(); + pkey = alloc_random_pkey(); + dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); + ptr = malloc_pkey(PAGE_SIZE, prot, pkey); + dprintf1("test %d starting...\n", test_nr); + pkey_tests[test_nr](ptr, pkey); + dprintf1("freeing test memory: %p\n", ptr); + free_pkey_malloc(ptr); + sys_pkey_free(pkey); + + dprintf1("pkru_faults: %d\n", pkru_faults); + dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults); + + 
tracing_off(); + close_test_fds(); + + printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); + dprintf1("======================\n\n"); + } + iteration_nr++; +} + +void pkey_setup_shadow(void) +{ + shadow_pkru = __rdpkru(); +} + +int main(void) +{ + int nr_iterations = 22; + + setup_handlers(); + + printf("has pku: %d\n", cpu_has_pku()); + + if (!cpu_has_pku()) { + int size = PAGE_SIZE; + int *ptr; + + printf("running PKEY tests for unsupported CPU/OS\n"); + + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + assert(ptr != (void *)-1); + test_mprotect_pkey_on_unsupported_cpu(ptr, 1); + exit(0); + } + + pkey_setup_shadow(); + printf("startup pkru: %x\n", rdpkru()); + setup_hugetlbfs(); + + while (nr_iterations-- > 0) + run_tests_once(); + + printf("done (all tests OK)\n"); + return 0; +} diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore index 022a1f3b64ef..1aaef5bf119a 100644 --- a/tools/testing/selftests/x86/.gitignore +++ b/tools/testing/selftests/x86/.gitignore @@ -12,5 +12,4 @@ ldt_gdt iopl mpx-mini-test ioperm -protection_keys test_vdso diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5d49bfec1e9a..5f16821c7f63 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ - protection_keys test_vdso test_vsyscall mov_ss_trap \ + test_vdso test_vsyscall mov_ss_trap \ syscall_arg_fault TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h deleted file mode 100644 index 254e5436bdd9..000000000000 --- 
a/tools/testing/selftests/x86/pkey-helpers.h +++ /dev/null @@ -1,219 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PKEYS_HELPER_H -#define _PKEYS_HELPER_H -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NR_PKEYS 16 -#define PKRU_BITS_PER_PKEY 2 - -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 0 -#endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 -extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -static inline void sigsafe_printf(const char *format, ...) -{ - va_list ap; - - if (!dprint_in_signal) { - va_start(ap, format); - vprintf(format, ap); - va_end(ap); - } else { - int ret; - /* - * No printf() functions are signal-safe. - * They deadlock easily. Write the format - * string to get some output, even if - * incomplete. - */ - ret = write(1, format, strlen(format)); - if (ret < 0) - exit(1); - } -} -#define dprintf_level(level, args...) do { \ - if (level <= DEBUG_LEVEL) \ - sigsafe_printf(args); \ -} while (0) -#define dprintf0(args...) dprintf_level(0, args) -#define dprintf1(args...) dprintf_level(1, args) -#define dprintf2(args...) dprintf_level(2, args) -#define dprintf3(args...) dprintf_level(3, args) -#define dprintf4(args...) 
dprintf_level(4, args) - -extern unsigned int shadow_pkru; -static inline unsigned int __rdpkru(void) -{ - unsigned int eax, edx; - unsigned int ecx = 0; - unsigned int pkru; - - asm volatile(".byte 0x0f,0x01,0xee\n\t" - : "=a" (eax), "=d" (edx) - : "c" (ecx)); - pkru = eax; - return pkru; -} - -static inline unsigned int _rdpkru(int line) -{ - unsigned int pkru = __rdpkru(); - - dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n", - line, pkru, shadow_pkru); - assert(pkru == shadow_pkru); - - return pkru; -} - -#define rdpkru() _rdpkru(__LINE__) - -static inline void __wrpkru(unsigned int pkru) -{ - unsigned int eax = pkru; - unsigned int ecx = 0; - unsigned int edx = 0; - - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - asm volatile(".byte 0x0f,0x01,0xef\n\t" - : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkru == __rdpkru()); -} - -static inline void wrpkru(unsigned int pkru) -{ - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - /* will do the shadow check for us: */ - rdpkru(); - __wrpkru(pkru); - shadow_pkru = pkru; - dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru()); -} - -/* - * These are technically racy. since something could - * change PKRU between the read and the write. 
- */ -static inline void __pkey_access_allow(int pkey, int do_allow) -{ - unsigned int pkru = rdpkru(); - int bit = pkey * 2; - - if (do_allow) - pkru &= (1<mmap (see exit_mmap()), so make sure it is immune to pkeys - * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel - * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks - * - * Compile like this: - * gcc -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm - * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm - */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "pkey-helpers.h" - -int iteration_nr = 1; -int test_nr; - -unsigned int shadow_pkru; - -#define HPAGE_SIZE (1UL<<21) -#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) -#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) -#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) -#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) -#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) -#define __stringify_1(x...) #x -#define __stringify(x...) 
__stringify_1(x) - -#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) - -int dprint_in_signal; -char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; - -extern void abort_hooks(void); -#define pkey_assert(condition) do { \ - if (!(condition)) { \ - dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ - __FILE__, __LINE__, \ - test_nr, iteration_nr); \ - dprintf0("errno at assert: %d", errno); \ - abort_hooks(); \ - exit(__LINE__); \ - } \ -} while (0) - -void cat_into_file(char *str, char *file) -{ - int fd = open(file, O_RDWR); - int ret; - - dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); - /* - * these need to be raw because they are called under - * pkey_assert() - */ - if (fd < 0) { - fprintf(stderr, "error opening '%s'\n", str); - perror("error: "); - exit(__LINE__); - } - - ret = write(fd, str, strlen(str)); - if (ret != strlen(str)) { - perror("write to file failed"); - fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); - exit(__LINE__); - } - close(fd); -} - -#if CONTROL_TRACING > 0 -static int warned_tracing; -int tracing_root_ok(void) -{ - if (geteuid() != 0) { - if (!warned_tracing) - fprintf(stderr, "WARNING: not run as root, " - "can not do tracing control\n"); - warned_tracing = 1; - return 0; - } - return 1; -} -#endif - -void tracing_on(void) -{ -#if CONTROL_TRACING > 0 -#define TRACEDIR "/sys/kernel/debug/tracing" - char pidstr[32]; - - if (!tracing_root_ok()) - return; - - sprintf(pidstr, "%d", getpid()); - cat_into_file("0", TRACEDIR "/tracing_on"); - cat_into_file("\n", TRACEDIR "/trace"); - if (1) { - cat_into_file("function_graph", TRACEDIR "/current_tracer"); - cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); - } else { - cat_into_file("nop", TRACEDIR "/current_tracer"); - } - cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); - cat_into_file("1", TRACEDIR "/tracing_on"); - dprintf1("enabled tracing\n"); -#endif -} - -void tracing_off(void) -{ -#if CONTROL_TRACING > 0 - if (!tracing_root_ok()) - return; - 
cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); -#endif -} - -void abort_hooks(void) -{ - fprintf(stderr, "running %s()...\n", __func__); - tracing_off(); -#ifdef SLEEP_ON_ABORT - sleep(SLEEP_ON_ABORT); -#endif -} - -static inline void __page_o_noops(void) -{ - /* 8-bytes of instruction * 512 bytes = 1 page */ - asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); -} - -/* - * This attempts to have roughly a page of instructions followed by a few - * instructions that do a write, and another page of instructions. That - * way, we are pretty sure that the write is in the second page of - * instructions and has at least a page of padding behind it. - * - * *That* lets us be sure to madvise() away the write instruction, which - * will then fault, which makes sure that the fault code handles - * execute-only memory properly. - */ -__attribute__((__aligned__(PAGE_SIZE))) -void lots_o_noops_around_write(int *write_to_me) -{ - dprintf3("running %s()\n", __func__); - __page_o_noops(); - /* Assume this happens in the second page of instructions: */ - *write_to_me = __LINE__; - /* pad out by another page: */ - __page_o_noops(); - dprintf3("%s() done\n", __func__); -} - -/* Define some kernel-like types */ -#define u8 uint8_t -#define u16 uint16_t -#define u32 uint32_t -#define u64 uint64_t - -#ifdef __i386__ - -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 380 -#endif - -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 381 -# define SYS_pkey_free 382 -#endif - -#define REG_IP_IDX REG_EIP -#define si_pkey_offset 0x14 - -#else - -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 329 -#endif - -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 330 -# define SYS_pkey_free 331 -#endif - -#define REG_IP_IDX REG_RIP -#define si_pkey_offset 0x20 - -#endif - -void dump_mem(void *dumpme, int len_bytes) -{ - char *c = (void *)dumpme; - int i; - - for (i = 0; i < len_bytes; i += sizeof(u64)) { - u64 *ptr = (u64 *)(c + i); - dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, 
*ptr); - } -} - -/* Failed address bound checks: */ -#ifndef SEGV_BNDERR -# define SEGV_BNDERR 3 -#endif - -#ifndef SEGV_PKUERR -# define SEGV_PKUERR 4 -#endif - -static char *si_code_str(int si_code) -{ - if (si_code == SEGV_MAPERR) - return "SEGV_MAPERR"; - if (si_code == SEGV_ACCERR) - return "SEGV_ACCERR"; - if (si_code == SEGV_BNDERR) - return "SEGV_BNDERR"; - if (si_code == SEGV_PKUERR) - return "SEGV_PKUERR"; - return "UNKNOWN"; -} - -int pkru_faults; -int last_si_pkey = -1; -void signal_handler(int signum, siginfo_t *si, void *vucontext) -{ - ucontext_t *uctxt = vucontext; - int trapno; - unsigned long ip; - char *fpregs; - u32 *pkru_ptr; - u64 siginfo_pkey; - u32 *si_pkey_ptr; - int pkru_offset; - fpregset_t fpregset; - - dprint_in_signal = 1; - dprintf1(">>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, - __rdpkru(), shadow_pkru); - - trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; - ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; - fpregset = uctxt->uc_mcontext.fpregs; - fpregs = (void *)fpregset; - - dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__, - trapno, ip, si_code_str(si->si_code), si->si_code); -#ifdef __i386__ - /* - * 32-bit has some extra padding so that userspace can tell whether - * the XSTATE header is present in addition to the "legacy" FPU - * state. We just assume that it is here. - */ - fpregs += 0x70; -#endif - pkru_offset = pkru_xstate_offset(); - pkru_ptr = (void *)(&fpregs[pkru_offset]); - - dprintf1("siginfo: %p\n", si); - dprintf1(" fpregs: %p\n", fpregs); - /* - * If we got a PKRU fault, we *HAVE* to have at least one bit set in - * here. 
- */ - dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset()); - if (DEBUG_LEVEL > 4) - dump_mem(pkru_ptr - 128, 256); - pkey_assert(*pkru_ptr); - - if ((si->si_code == SEGV_MAPERR) || - (si->si_code == SEGV_ACCERR) || - (si->si_code == SEGV_BNDERR)) { - printf("non-PK si_code, exiting...\n"); - exit(4); - } - - si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset); - dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); - dump_mem((u8 *)si_pkey_ptr - 8, 24); - siginfo_pkey = *si_pkey_ptr; - pkey_assert(siginfo_pkey < NR_PKEYS); - last_si_pkey = siginfo_pkey; - - dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); - /* need __rdpkru() version so we do not do shadow_pkru checking */ - dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); - dprintf1("pkey from siginfo: %jx\n", siginfo_pkey); - *(u64 *)pkru_ptr = 0x00000000; - dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); - pkru_faults++; - dprintf1("<<<<==================================================\n"); - dprint_in_signal = 0; -} - -int wait_all_children(void) -{ - int status; - return waitpid(-1, &status, 0); -} - -void sig_chld(int x) -{ - dprint_in_signal = 1; - dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); - dprint_in_signal = 0; -} - -void setup_sigsegv_handler(void) -{ - int r, rs; - struct sigaction newact; - struct sigaction oldact; - - /* #PF is mapped to sigsegv */ - int signum = SIGSEGV; - - newact.sa_handler = 0; - newact.sa_sigaction = signal_handler; - - /*sigset_t - signals to block while in the handler */ - /* get the old signal mask. 
*/ - rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); - pkey_assert(rs == 0); - - /* call sa_sigaction, not sa_handler*/ - newact.sa_flags = SA_SIGINFO; - - newact.sa_restorer = 0; /* void(*)(), obsolete */ - r = sigaction(signum, &newact, &oldact); - r = sigaction(SIGALRM, &newact, &oldact); - pkey_assert(r == 0); -} - -void setup_handlers(void) -{ - signal(SIGCHLD, &sig_chld); - setup_sigsegv_handler(); -} - -pid_t fork_lazy_child(void) -{ - pid_t forkret; - - forkret = fork(); - pkey_assert(forkret >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); - - if (!forkret) { - /* in the child */ - while (1) { - dprintf1("child sleeping...\n"); - sleep(30); - } - } - return forkret; -} - -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x1 -#endif - -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif - -static u32 hw_pkey_get(int pkey, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 pkru = __rdpkru(); - u32 shifted_pkru; - u32 masked_pkru; - - dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", - __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkru: %x\n", __func__, pkru); - - shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY)); - dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru); - masked_pkru = shifted_pkru & mask; - dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru); - /* - * shift down the relevant bits to the lowest two, then - * mask off all the other high bits. 
- */ - return masked_pkru; -} - -static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 old_pkru = __rdpkru(); - u32 new_pkru; - - /* make sure that 'rights' only contains the bits we expect: */ - assert(!(rights & ~mask)); - - /* copy old pkru */ - new_pkru = old_pkru; - /* mask out bits from pkey in old value: */ - new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY)); - /* OR in new bits for pkey: */ - new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY)); - - __wrpkru(new_pkru); - - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n", - __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru); - return 0; -} - -void pkey_disable_set(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights; - u32 orig_pkru = rdpkru(); - - dprintf1("START->%s(%d, 0x%x)\n", __func__, - pkey, flags); - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); - assert(!ret); - /*pkru and flags have the same format */ - shadow_pkru |= flags << (pkey * 2); - dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru); - - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - pkey_assert(rdpkru() > orig_pkru); - dprintf1("END<---%s(%d, 0x%x)\n", __func__, - pkey, flags); -} - -void pkey_disable_clear(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u32 orig_pkru = rdpkru(); - - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 
- - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, 0); - /* pkru and flags have the same format */ - shadow_pkru &= ~(flags << (pkey * 2)); - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - assert(rdpkru() > orig_pkru); -} - -void pkey_write_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); -} -void pkey_write_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_WRITE); -} -void pkey_access_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); -} -void pkey_access_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); -} - -int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey) -{ - int sret; - - dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, - ptr, size, orig_prot, pkey); - - errno = 0; - sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); - if (errno) { - dprintf2("SYS_mprotect_key sret: %d\n", sret); - dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); - dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); - if (DEBUG_LEVEL >= 2) - perror("SYS_mprotect_pkey"); - } - return sret; -} - -int sys_pkey_alloc(unsigned long flags, unsigned long init_val) -{ - int ret = syscall(SYS_pkey_alloc, flags, init_val); - dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", - __func__, flags, init_val, ret, errno); - return ret; -} - -int alloc_pkey(void) -{ - int ret; - unsigned long init_val = 0x0; - - dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n", - __LINE__, __rdpkru(), shadow_pkru); - ret = sys_pkey_alloc(0, init_val); - /* - * pkey_alloc() sets PKRU, so we need to reflect it in - * shadow_pkru: - */ - dprintf4("alloc_pkey()::%d, 
ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); - if (ret) { - /* clear both the bits: */ - shadow_pkru &= ~(0x3 << (ret * 2)); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); - /* - * move the new state in from init_val - * (remember, we cheated and init_val == pkru format) - */ - shadow_pkru |= (init_val << (ret * 2)); - } - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); - dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno); - /* for shadow checking: */ - rdpkru(); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); - return ret; -} - -int sys_pkey_free(unsigned long pkey) -{ - int ret = syscall(SYS_pkey_free, pkey); - dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); - return ret; -} - -/* - * I had a bug where pkey bits could be set by mprotect() but - * not cleared. This ensures we get lots of random bit sets - * and clears on the vma and pte pkey bits. 
- */ -int alloc_random_pkey(void) -{ - int max_nr_pkey_allocs; - int ret; - int i; - int alloced_pkeys[NR_PKEYS]; - int nr_alloced = 0; - int random_index; - memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); - - /* allocate every possible key and make a note of which ones we got */ - max_nr_pkey_allocs = NR_PKEYS; - max_nr_pkey_allocs = 1; - for (i = 0; i < max_nr_pkey_allocs; i++) { - int new_pkey = alloc_pkey(); - if (new_pkey < 0) - break; - alloced_pkeys[nr_alloced++] = new_pkey; - } - - pkey_assert(nr_alloced > 0); - /* select a random one out of the allocated ones */ - random_index = rand() % nr_alloced; - ret = alloced_pkeys[random_index]; - /* now zero it out so we don't free it next */ - alloced_pkeys[random_index] = 0; - - /* go through the allocated ones that we did not want and free them */ - for (i = 0; i < nr_alloced; i++) { - int free_ret; - if (!alloced_pkeys[i]) - continue; - free_ret = sys_pkey_free(alloced_pkeys[i]); - pkey_assert(!free_ret); - } - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); - return ret; -} - -int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey) -{ - int nr_iterations = random() % 100; - int ret; - - while (0) { - int rpkey = alloc_random_pkey(); - ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); - dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", - ptr, size, orig_prot, pkey, ret); - if (nr_iterations-- < 0) - break; - - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); - sys_pkey_free(rpkey); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); - } - pkey_assert(pkey < NR_PKEYS); - - ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); - dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", - ptr, size, orig_prot, pkey, ret); - pkey_assert(!ret); - dprintf1("%s()::%d, ret: %d 
pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); - return ret; -} - -struct pkey_malloc_record { - void *ptr; - long size; - int prot; -}; -struct pkey_malloc_record *pkey_malloc_records; -struct pkey_malloc_record *pkey_last_malloc_record; -long nr_pkey_malloc_records; -void record_pkey_malloc(void *ptr, long size, int prot) -{ - long i; - struct pkey_malloc_record *rec = NULL; - - for (i = 0; i < nr_pkey_malloc_records; i++) { - rec = &pkey_malloc_records[i]; - /* find a free record */ - if (rec) - break; - } - if (!rec) { - /* every record is full */ - size_t old_nr_records = nr_pkey_malloc_records; - size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); - size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); - dprintf2("new_nr_records: %zd\n", new_nr_records); - dprintf2("new_size: %zd\n", new_size); - pkey_malloc_records = realloc(pkey_malloc_records, new_size); - pkey_assert(pkey_malloc_records != NULL); - rec = &pkey_malloc_records[nr_pkey_malloc_records]; - /* - * realloc() does not initialize memory, so zero it from - * the first new record all the way to the end. 
- */ - for (i = 0; i < new_nr_records - old_nr_records; i++) - memset(rec + i, 0, sizeof(*rec)); - } - dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", - (int)(rec - pkey_malloc_records), rec, ptr, size); - rec->ptr = ptr; - rec->size = size; - rec->prot = prot; - pkey_last_malloc_record = rec; - nr_pkey_malloc_records++; -} - -void free_pkey_malloc(void *ptr) -{ - long i; - int ret; - dprintf3("%s(%p)\n", __func__, ptr); - for (i = 0; i < nr_pkey_malloc_records; i++) { - struct pkey_malloc_record *rec = &pkey_malloc_records[i]; - dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", - ptr, i, rec, rec->ptr, rec->size); - if ((ptr < rec->ptr) || - (ptr >= rec->ptr + rec->size)) - continue; - - dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", - ptr, i, rec, rec->ptr, rec->size); - nr_pkey_malloc_records--; - ret = munmap(rec->ptr, rec->size); - dprintf3("munmap ret: %d\n", ret); - pkey_assert(!ret); - dprintf3("clearing rec->ptr, rec: %p\n", rec); - rec->ptr = NULL; - dprintf3("done clearing rec->ptr, rec: %p\n", rec); - return; - } - pkey_assert(false); -} - - -void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) -{ - void *ptr; - int ret; - - rdpkru(); - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - pkey_assert(ptr != (void *)-1); - ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); - pkey_assert(!ret); - record_pkey_malloc(ptr, size, prot); - rdpkru(); - - dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); - return ptr; -} - -void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) -{ - int ret; - void *ptr; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - /* - * Guarantee we can fit at least one huge page in the resulting - * allocation by allocating space for 2: - */ - size = ALIGN_UP(size, HPAGE_SIZE * 2); - ptr = mmap(NULL, size, 
PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - pkey_assert(ptr != (void *)-1); - record_pkey_malloc(ptr, size, prot); - mprotect_pkey(ptr, size, prot, pkey); - - dprintf1("unaligned ptr: %p\n", ptr); - ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); - dprintf1(" aligned ptr: %p\n", ptr); - ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); - dprintf1("MADV_HUGEPAGE ret: %d\n", ret); - ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); - dprintf1("MADV_WILLNEED ret: %d\n", ret); - memset(ptr, 0, HPAGE_SIZE); - - dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); - return ptr; -} - -int hugetlb_setup_ok; -#define GET_NR_HUGE_PAGES 10 -void setup_hugetlbfs(void) -{ - int err; - int fd; - char buf[] = "123"; - - if (geteuid() != 0) { - fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); - return; - } - - cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); - - /* - * Now go make sure that we got the pages and that they - * are 2M pages. Someone might have made 1G the default. 
- */ - fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); - if (fd < 0) { - perror("opening sysfs 2M hugetlb config"); - return; - } - - /* -1 to guarantee leaving the trailing \0 */ - err = read(fd, buf, sizeof(buf)-1); - close(fd); - if (err <= 0) { - perror("reading sysfs 2M hugetlb config"); - return; - } - - if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", - buf, GET_NR_HUGE_PAGES); - return; - } - - hugetlb_setup_ok = 1; -} - -void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) -{ - void *ptr; - int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; - - if (!hugetlb_setup_ok) - return PTR_ERR_ENOTSUP; - - dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); - size = ALIGN_UP(size, HPAGE_SIZE * 2); - pkey_assert(pkey < NR_PKEYS); - ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); - pkey_assert(ptr != (void *)-1); - mprotect_pkey(ptr, size, prot, pkey); - - record_pkey_malloc(ptr, size, prot); - - dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); - return ptr; -} - -void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) -{ - void *ptr; - int fd; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - fd = open("/dax/foo", O_RDWR); - pkey_assert(fd >= 0); - - ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); - pkey_assert(ptr != (void *)-1); - - mprotect_pkey(ptr, size, prot, pkey); - - record_pkey_malloc(ptr, size, prot); - - dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); - close(fd); - return ptr; -} - -void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { - - malloc_pkey_with_mprotect, - malloc_pkey_anon_huge, - malloc_pkey_hugetlb -/* can not do direct with the pkey_mprotect() API: - malloc_pkey_mmap_direct, - malloc_pkey_mmap_dax, -*/ -}; - -void *malloc_pkey(long size, int prot, u16 pkey) -{ - void *ret; - static int malloc_type; - int nr_malloc_types = 
ARRAY_SIZE(pkey_malloc); - - pkey_assert(pkey < NR_PKEYS); - - while (1) { - pkey_assert(malloc_type < nr_malloc_types); - - ret = pkey_malloc[malloc_type](size, prot, pkey); - pkey_assert(ret != (void *)-1); - - malloc_type++; - if (malloc_type >= nr_malloc_types) - malloc_type = (random()%nr_malloc_types); - - /* try again if the malloc_type we tried is unsupported */ - if (ret == PTR_ERR_ENOTSUP) - continue; - - break; - } - - dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, - size, prot, pkey, ret); - return ret; -} - -int last_pkru_faults; -#define UNKNOWN_PKEY -2 -void expected_pk_fault(int pkey) -{ - dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", - __func__, last_pkru_faults, pkru_faults); - dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); - pkey_assert(last_pkru_faults + 1 == pkru_faults); - - /* - * For exec-only memory, we do not know the pkey in - * advance, so skip this check. - */ - if (pkey != UNKNOWN_PKEY) - pkey_assert(last_si_pkey == pkey); - - /* - * The signal handler shold have cleared out PKRU to let the - * test program continue. We now have to restore it. 
- */ - if (__rdpkru() != 0) - pkey_assert(0); - - __wrpkru(shadow_pkru); - dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n", - __func__, shadow_pkru); - last_pkru_faults = pkru_faults; - last_si_pkey = -1; -} - -#define do_not_expect_pk_fault(msg) do { \ - if (last_pkru_faults != pkru_faults) \ - dprintf0("unexpected PK fault: %s\n", msg); \ - pkey_assert(last_pkru_faults == pkru_faults); \ -} while (0) - -int test_fds[10] = { -1 }; -int nr_test_fds; -void __save_test_fd(int fd) -{ - pkey_assert(fd >= 0); - pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); - test_fds[nr_test_fds] = fd; - nr_test_fds++; -} - -int get_test_read_fd(void) -{ - int test_fd = open("/etc/passwd", O_RDONLY); - __save_test_fd(test_fd); - return test_fd; -} - -void close_test_fds(void) -{ - int i; - - for (i = 0; i < nr_test_fds; i++) { - if (test_fds[i] < 0) - continue; - close(test_fds[i]); - test_fds[i] = -1; - } - nr_test_fds = 0; -} - -#define barrier() __asm__ __volatile__("": : :"memory") -__attribute__((noinline)) int read_ptr(int *ptr) -{ - /* - * Keep GCC from optimizing this away somehow - */ - barrier(); - return *ptr; -} - -void test_read_of_write_disabled_region(int *ptr, u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling write access to PKEY[1], doing read\n"); - pkey_write_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - dprintf1("\n"); -} -void test_read_of_access_disabled_region(int *ptr, u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); - rdpkru(); - pkey_access_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - expected_pk_fault(pkey); -} -void test_write_of_write_disabled_region(int *ptr, u16 pkey) -{ - dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); - pkey_write_deny(pkey); - *ptr = __LINE__; - expected_pk_fault(pkey); -} -void test_write_of_access_disabled_region(int *ptr, u16 pkey) -{ - 
dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); - pkey_access_deny(pkey); - *ptr = __LINE__; - expected_pk_fault(pkey); -} -void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) -{ - int ret; - int test_fd = get_test_read_fd(); - - dprintf1("disabling access to PKEY[%02d], " - "having kernel read() to buffer\n", pkey); - pkey_access_deny(pkey); - ret = read(test_fd, ptr, 1); - dprintf1("read ret: %d\n", ret); - pkey_assert(ret); -} -void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) -{ - int ret; - int test_fd = get_test_read_fd(); - - pkey_write_deny(pkey); - ret = read(test_fd, ptr, 100); - dprintf1("read ret: %d\n", ret); - if (ret < 0 && (DEBUG_LEVEL > 0)) - perror("verbose read result (OK for this to be bad)"); - pkey_assert(ret); -} - -void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) -{ - int pipe_ret, vmsplice_ret; - struct iovec iov; - int pipe_fds[2]; - - pipe_ret = pipe(pipe_fds); - - pkey_assert(pipe_ret == 0); - dprintf1("disabling access to PKEY[%02d], " - "having kernel vmsplice from buffer\n", pkey); - pkey_access_deny(pkey); - iov.iov_base = ptr; - iov.iov_len = PAGE_SIZE; - vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); - dprintf1("vmsplice() ret: %d\n", vmsplice_ret); - pkey_assert(vmsplice_ret == -1); - - close(pipe_fds[0]); - close(pipe_fds[1]); -} - -void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) -{ - int ignored = 0xdada; - int futex_ret; - int some_int = __LINE__; - - dprintf1("disabling write to PKEY[%02d], " - "doing futex gunk in buffer\n", pkey); - *ptr = some_int; - pkey_write_deny(pkey); - futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, - &ignored, ignored); - if (DEBUG_LEVEL > 0) - perror("futex"); - dprintf1("futex() ret: %d\n", futex_ret); -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) -{ - int err; - int i; - - /* Note: 0 
is the default pkey, so don't mess with it */ - for (i = 1; i < NR_PKEYS; i++) { - if (pkey == i) - continue; - - dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); - err = sys_pkey_free(i); - pkey_assert(err); - - err = sys_pkey_free(i); - pkey_assert(err); - - err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); - pkey_assert(err); - } -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) -{ - int err; - int bad_pkey = NR_PKEYS+99; - - /* pass a known-invalid pkey in: */ - err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); - pkey_assert(err); -} - -void become_child(void) -{ - pid_t forkret; - - forkret = fork(); - pkey_assert(forkret >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); - - if (!forkret) { - /* in the child */ - return; - } - exit(0); -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_alloc_exhaust(int *ptr, u16 pkey) -{ - int err; - int allocated_pkeys[NR_PKEYS] = {0}; - int nr_allocated_pkeys = 0; - int i; - - for (i = 0; i < NR_PKEYS*3; i++) { - int new_pkey; - dprintf1("%s() alloc loop: %d\n", __func__, i); - new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, err, __rdpkru(), shadow_pkru); - rdpkru(); /* for shadow checking */ - dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); - if ((new_pkey == -1) && (errno == ENOSPC)) { - dprintf2("%s() failed to allocate pkey after %d tries\n", - __func__, nr_allocated_pkeys); - } else { - /* - * Ensure the number of successes never - * exceeds the number of keys supported - * in the hardware. - */ - pkey_assert(nr_allocated_pkeys < NR_PKEYS); - allocated_pkeys[nr_allocated_pkeys++] = new_pkey; - } - - /* - * Make sure that allocation state is properly - * preserved across fork(). 
- */ - if (i == NR_PKEYS*2) - become_child(); - } - - dprintf3("%s()::%d\n", __func__, __LINE__); - - /* - * There are 16 pkeys supported in hardware. Three are - * allocated by the time we get here: - * 1. The default key (0) - * 2. One possibly consumed by an execute-only mapping. - * 3. One allocated by the test code and passed in via - * 'pkey' to this function. - * Ensure that we can allocate at least another 13 (16-3). - */ - pkey_assert(i >= NR_PKEYS-3); - - for (i = 0; i < nr_allocated_pkeys; i++) { - err = sys_pkey_free(allocated_pkeys[i]); - pkey_assert(!err); - rdpkru(); /* for shadow checking */ - } -} - -/* - * pkey 0 is special. It is allocated by default, so you do not - * have to call pkey_alloc() to use it first. Make sure that it - * is usable. - */ -void test_mprotect_with_pkey_0(int *ptr, u16 pkey) -{ - long size; - int prot; - - assert(pkey_last_malloc_record); - size = pkey_last_malloc_record->size; - /* - * This is a bit of a hack. But mprotect() requires - * huge-page-aligned sizes when operating on hugetlbfs. - * So, make sure that we use something that's a multiple - * of a huge page when we can. - */ - if (size >= HPAGE_SIZE) - size = HPAGE_SIZE; - prot = pkey_last_malloc_record->prot; - - /* Use pkey 0 */ - mprotect_pkey(ptr, size, prot, 0); - - /* Make sure that we can set it back to the original pkey. */ - mprotect_pkey(ptr, size, prot, pkey); -} - -void test_ptrace_of_child(int *ptr, u16 pkey) -{ - __attribute__((__unused__)) int peek_result; - pid_t child_pid; - void *ignored = 0; - long ret; - int status; - /* - * This is the "control" for our little expermient. Make sure - * we can always access it when ptracing. - */ - int *plain_ptr_unaligned = malloc(HPAGE_SIZE); - int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); - - /* - * Fork a child which is an exact copy of this process, of course. - * That means we can do all of our tests via ptrace() and then plain - * memory access and ensure they work differently. 
- */ - child_pid = fork_lazy_child(); - dprintf1("[%d] child pid: %d\n", getpid(), child_pid); - - ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); - if (ret) - perror("attach"); - dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); - pkey_assert(ret != -1); - ret = waitpid(child_pid, &status, WUNTRACED); - if ((ret != child_pid) || !(WIFSTOPPED(status))) { - fprintf(stderr, "weird waitpid result %ld stat %x\n", - ret, status); - pkey_assert(0); - } - dprintf2("waitpid ret: %ld\n", ret); - dprintf2("waitpid status: %d\n", status); - - pkey_access_deny(pkey); - pkey_write_deny(pkey); - - /* Write access, untested for now: - ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); - pkey_assert(ret != -1); - dprintf1("poke at %p: %ld\n", peek_at, ret); - */ - - /* - * Try to access the pkey-protected "ptr" via ptrace: - */ - ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); - /* expect it to work, without an error: */ - pkey_assert(ret != -1); - /* Now access from the current task, and expect an exception: */ - peek_result = read_ptr(ptr); - expected_pk_fault(pkey); - - /* - * Try to access the NON-pkey-protected "plain_ptr" via ptrace: - */ - ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); - /* expect it to work, without an error: */ - pkey_assert(ret != -1); - /* Now access from the current task, and expect NO exception: */ - peek_result = read_ptr(plain_ptr); - do_not_expect_pk_fault("read plain pointer after ptrace"); - - ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); - pkey_assert(ret != -1); - - ret = kill(child_pid, SIGKILL); - pkey_assert(ret != -1); - - wait(&status); - - free(plain_ptr_unaligned); -} - -void *get_pointer_to_instructions(void) -{ - void *p1; - - p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); - dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); - /* lots_o_noops_around_write should be page-aligned already */ - assert(p1 == &lots_o_noops_around_write); - - /* Point 'p1' at the 
*second* page of the function: */ - p1 += PAGE_SIZE; - - /* - * Try to ensure we fault this in on next touch to ensure - * we get an instruction fault as opposed to a data one - */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - - return p1; -} - -void test_executing_on_unreadable_memory(int *ptr, u16 pkey) -{ - void *p1; - int scratch; - int ptr_contents; - int ret; - - p1 = get_pointer_to_instructions(); - lots_o_noops_around_write(&scratch); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - - ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); - pkey_assert(!ret); - pkey_access_deny(pkey); - - dprintf2("pkru: %x\n", rdpkru()); - - /* - * Make sure this is an *instruction* fault - */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(pkey); -} - -void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) -{ - void *p1; - int scratch; - int ptr_contents; - int ret; - - dprintf1("%s() start\n", __func__); - - p1 = get_pointer_to_instructions(); - lots_o_noops_around_write(&scratch); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - - /* Use a *normal* mprotect(), not mprotect_pkey(): */ - ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); - pkey_assert(!ret); - - dprintf2("pkru: %x\n", rdpkru()); - - /* Make sure this is an *instruction* fault */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(UNKNOWN_PKEY); - - /* - * Put the memory back to non-PROT_EXEC. Should clear the - * exec-only pkey off the VMA and allow it to be readable - * again. 
Go to PROT_NONE first to check for a kernel bug - * that did not clear the pkey when doing PROT_NONE. - */ - ret = mprotect(p1, PAGE_SIZE, PROT_NONE); - pkey_assert(!ret); - - ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); - pkey_assert(!ret); - ptr_contents = read_ptr(p1); - do_not_expect_pk_fault("plain read on recently PROT_EXEC area"); -} - -void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) -{ - int size = PAGE_SIZE; - int sret; - - if (cpu_has_pku()) { - dprintf1("SKIP: %s: no CPU support\n", __func__); - return; - } - - sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); - pkey_assert(sret < 0); -} - -void (*pkey_tests[])(int *ptr, u16 pkey) = { - test_read_of_write_disabled_region, - test_read_of_access_disabled_region, - test_write_of_write_disabled_region, - test_write_of_access_disabled_region, - test_kernel_write_of_access_disabled_region, - test_kernel_write_of_write_disabled_region, - test_kernel_gup_of_access_disabled_region, - test_kernel_gup_write_to_write_disabled_region, - test_executing_on_unreadable_memory, - test_implicit_mprotect_exec_only_memory, - test_mprotect_with_pkey_0, - test_ptrace_of_child, - test_pkey_syscalls_on_non_allocated_pkey, - test_pkey_syscalls_bad_args, - test_pkey_alloc_exhaust, -}; - -void run_tests_once(void) -{ - int *ptr; - int prot = PROT_READ|PROT_WRITE; - - for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { - int pkey; - int orig_pkru_faults = pkru_faults; - - dprintf1("======================\n"); - dprintf1("test %d preparing...\n", test_nr); - - tracing_on(); - pkey = alloc_random_pkey(); - dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); - ptr = malloc_pkey(PAGE_SIZE, prot, pkey); - dprintf1("test %d starting...\n", test_nr); - pkey_tests[test_nr](ptr, pkey); - dprintf1("freeing test memory: %p\n", ptr); - free_pkey_malloc(ptr); - sys_pkey_free(pkey); - - dprintf1("pkru_faults: %d\n", pkru_faults); - dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults); - - 
tracing_off(); - close_test_fds(); - - printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); - dprintf1("======================\n\n"); - } - iteration_nr++; -} - -void pkey_setup_shadow(void) -{ - shadow_pkru = __rdpkru(); -} - -int main(void) -{ - int nr_iterations = 22; - - setup_handlers(); - - printf("has pku: %d\n", cpu_has_pku()); - - if (!cpu_has_pku()) { - int size = PAGE_SIZE; - int *ptr; - - printf("running PKEY tests for unsupported CPU/OS\n"); - - ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - assert(ptr != (void *)-1); - test_mprotect_pkey_on_unsupported_cpu(ptr, 1); - exit(0); - } - - pkey_setup_shadow(); - printf("startup pkru: %x\n", rdpkru()); - setup_hugetlbfs(); - - while (nr_iterations-- > 0) - run_tests_once(); - - printf("done (all tests OK)\n"); - return 0; -} -- cgit v1.2.3 From c4273c7f0ec34f7ab94332bf8279f19c04feca73 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:51:37 -0700 Subject: selftests/vm/pkeys: rename all references to pkru to a generic name This renames PKRU references to "pkey_reg" or "pkey" based on the usage. Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Reviewed-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. 
Nunes do Rosario" Cc: Ingo Molnar Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/2c6970bc6d2e99796cd5cc1101bd2ecf7eccb937.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-helpers.h | 85 +++++----- tools/testing/selftests/vm/protection_keys.c | 240 ++++++++++++++------------- 2 files changed, 170 insertions(+), 155 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 254e5436bdd9..d5779be4793f 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -14,7 +14,7 @@ #include #define NR_PKEYS 16 -#define PKRU_BITS_PER_PKEY 2 +#define PKEY_BITS_PER_PKEY 2 #ifndef DEBUG_LEVEL #define DEBUG_LEVEL 0 @@ -53,85 +53,88 @@ static inline void sigsafe_printf(const char *format, ...) #define dprintf3(args...) dprintf_level(3, args) #define dprintf4(args...) dprintf_level(4, args) -extern unsigned int shadow_pkru; -static inline unsigned int __rdpkru(void) +extern unsigned int shadow_pkey_reg; +static inline unsigned int __read_pkey_reg(void) { unsigned int eax, edx; unsigned int ecx = 0; - unsigned int pkru; + unsigned int pkey_reg; asm volatile(".byte 0x0f,0x01,0xee\n\t" : "=a" (eax), "=d" (edx) : "c" (ecx)); - pkru = eax; - return pkru; + pkey_reg = eax; + return pkey_reg; } -static inline unsigned int _rdpkru(int line) +static inline unsigned int _read_pkey_reg(int line) { - unsigned int pkru = __rdpkru(); + unsigned int pkey_reg = __read_pkey_reg(); - dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n", - line, pkru, shadow_pkru); - assert(pkru == shadow_pkru); + dprintf4("read_pkey_reg(line=%d) pkey_reg: %x shadow: %x\n", + line, pkey_reg, shadow_pkey_reg); + assert(pkey_reg == shadow_pkey_reg); - return pkru; + return pkey_reg; } -#define rdpkru() _rdpkru(__LINE__) +#define read_pkey_reg() _read_pkey_reg(__LINE__) -static inline 
void __wrpkru(unsigned int pkru) +static inline void __write_pkey_reg(unsigned int pkey_reg) { - unsigned int eax = pkru; + unsigned int eax = pkey_reg; unsigned int ecx = 0; unsigned int edx = 0; - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); + dprintf4("%s() changing %08x to %08x\n", __func__, + __read_pkey_reg(), pkey_reg); asm volatile(".byte 0x0f,0x01,0xef\n\t" : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkru == __rdpkru()); + assert(pkey_reg == __read_pkey_reg()); } -static inline void wrpkru(unsigned int pkru) +static inline void write_pkey_reg(unsigned int pkey_reg) { - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); + dprintf4("%s() changing %08x to %08x\n", __func__, + __read_pkey_reg(), pkey_reg); /* will do the shadow check for us: */ - rdpkru(); - __wrpkru(pkru); - shadow_pkru = pkru; - dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru()); + read_pkey_reg(); + __write_pkey_reg(pkey_reg); + shadow_pkey_reg = pkey_reg; + dprintf4("%s(%08x) pkey_reg: %08x\n", __func__, + pkey_reg, __read_pkey_reg()); } /* * These are technically racy. since something could - * change PKRU between the read and the write. + * change PKEY register between the read and the write. 
*/ static inline void __pkey_access_allow(int pkey, int do_allow) { - unsigned int pkru = rdpkru(); + unsigned int pkey_reg = read_pkey_reg(); int bit = pkey * 2; if (do_allow) - pkru &= (1<>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, - __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%x shadow: %x\n", __func__, __LINE__, + __read_pkey_reg(), shadow_pkey_reg); trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; @@ -289,19 +289,19 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) */ fpregs += 0x70; #endif - pkru_offset = pkru_xstate_offset(); - pkru_ptr = (void *)(&fpregs[pkru_offset]); + pkey_reg_offset = pkey_reg_xstate_offset(); + pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); dprintf1("siginfo: %p\n", si); dprintf1(" fpregs: %p\n", fpregs); /* - * If we got a PKRU fault, we *HAVE* to have at least one bit set in + * If we got a PKEY fault, we *HAVE* to have at least one bit set in * here. 
*/ - dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset()); + dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); if (DEBUG_LEVEL > 4) - dump_mem(pkru_ptr - 128, 256); - pkey_assert(*pkru_ptr); + dump_mem(pkey_reg_ptr - 128, 256); + pkey_assert(*pkey_reg_ptr); if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || @@ -317,13 +317,16 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) pkey_assert(siginfo_pkey < NR_PKEYS); last_si_pkey = siginfo_pkey; - dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); - /* need __rdpkru() version so we do not do shadow_pkru checking */ - dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); + dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + dprintf1("signal pkey_reg from pkey_reg: %08x\n", __read_pkey_reg()); dprintf1("pkey from siginfo: %jx\n", siginfo_pkey); - *(u64 *)pkru_ptr = 0x00000000; - dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); - pkru_faults++; + *(u64 *)pkey_reg_ptr = 0x00000000; + dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); + pkey_faults++; dprintf1("<<<<==================================================\n"); dprint_in_signal = 0; } @@ -402,45 +405,47 @@ pid_t fork_lazy_child(void) static u32 hw_pkey_get(int pkey, unsigned long flags) { u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 pkru = __rdpkru(); - u32 shifted_pkru; - u32 masked_pkru; + u32 pkey_reg = __read_pkey_reg(); + u32 shifted_pkey_reg; + u32 masked_pkey_reg; dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkru: %x\n", __func__, pkru); + dprintf2("%s() raw pkey_reg: %x\n", __func__, pkey_reg); - shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY)); - dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru); - masked_pkru = shifted_pkru & mask; - dprintf2("%s() masked 
pkru: %x\n", __func__, masked_pkru); + shifted_pkey_reg = (pkey_reg >> (pkey * PKEY_BITS_PER_PKEY)); + dprintf2("%s() shifted_pkey_reg: %x\n", __func__, shifted_pkey_reg); + masked_pkey_reg = shifted_pkey_reg & mask; + dprintf2("%s() masked pkey_reg: %x\n", __func__, masked_pkey_reg); /* * shift down the relevant bits to the lowest two, then * mask off all the other high bits. */ - return masked_pkru; + return masked_pkey_reg; } static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) { u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 old_pkru = __rdpkru(); - u32 new_pkru; + u32 old_pkey_reg = __read_pkey_reg(); + u32 new_pkey_reg; /* make sure that 'rights' only contains the bits we expect: */ assert(!(rights & ~mask)); - /* copy old pkru */ - new_pkru = old_pkru; + /* copy old pkey_reg */ + new_pkey_reg = old_pkey_reg; /* mask out bits from pkey in old value: */ - new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY)); + new_pkey_reg &= ~(mask << (pkey * PKEY_BITS_PER_PKEY)); /* OR in new bits for pkey: */ - new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY)); + new_pkey_reg |= (rights << (pkey * PKEY_BITS_PER_PKEY)); - __wrpkru(new_pkru); + __write_pkey_reg(new_pkey_reg); - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n", - __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru); + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" + " pkey_reg now: %x old_pkey_reg: %x\n", + __func__, pkey, rights, flags, 0, __read_pkey_reg(), + old_pkey_reg); return 0; } @@ -449,7 +454,7 @@ void pkey_disable_set(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights; - u32 orig_pkru = rdpkru(); + u32 orig_pkey_reg = read_pkey_reg(); dprintf1("START->%s(%d, 0x%x)\n", __func__, pkey, flags); @@ -465,9 +470,9 @@ void pkey_disable_set(int pkey, int flags) ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); assert(!ret); - /*pkru and flags have the same format */ - shadow_pkru |= flags << (pkey * 2); - 
dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru); + /* pkey_reg and flags have the same format */ + shadow_pkey_reg |= flags << (pkey * 2); + dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkey_reg); pkey_assert(ret >= 0); @@ -475,9 +480,9 @@ void pkey_disable_set(int pkey, int flags) dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, pkey, pkey, pkey_rights); - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); + dprintf1("%s(%d) pkey_reg: 0x%x\n", __func__, pkey, read_pkey_reg()); if (flags) - pkey_assert(rdpkru() > orig_pkru); + pkey_assert(read_pkey_reg() > orig_pkey_reg); dprintf1("END<---%s(%d, 0x%x)\n", __func__, pkey, flags); } @@ -487,7 +492,7 @@ void pkey_disable_clear(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u32 orig_pkru = rdpkru(); + u32 orig_pkey_reg = read_pkey_reg(); pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); @@ -498,17 +503,16 @@ void pkey_disable_clear(int pkey, int flags) pkey_rights |= flags; ret = hw_pkey_set(pkey, pkey_rights, 0); - /* pkru and flags have the same format */ - shadow_pkru &= ~(flags << (pkey * 2)); + shadow_pkey_reg &= ~(flags << (pkey * 2)); pkey_assert(ret >= 0); pkey_rights = hw_pkey_get(pkey, syscall_flags); dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, pkey, pkey, pkey_rights); - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); + dprintf1("%s(%d) pkey_reg: 0x%x\n", __func__, pkey, read_pkey_reg()); if (flags) - assert(rdpkru() > orig_pkru); + assert(read_pkey_reg() > orig_pkey_reg); } void pkey_write_allow(int pkey) @@ -561,33 +565,38 @@ int alloc_pkey(void) int ret; unsigned long init_val = 0x0; - dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n", - __LINE__, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%x shadow: %x\n", __func__, + __LINE__, __read_pkey_reg(), shadow_pkey_reg); ret = sys_pkey_alloc(0, init_val); /* - * pkey_alloc() sets PKRU, so we need to 
reflect it in - * shadow_pkru: + * pkey_alloc() sets PKEY register, so we need to reflect it in + * shadow_pkey_reg: */ - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); if (ret) { /* clear both the bits: */ - shadow_pkru &= ~(0x3 << (ret * 2)); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + shadow_pkey_reg &= ~(0x3 << (ret * 2)); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + __func__, + __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); /* * move the new state in from init_val - * (remember, we cheated and init_val == pkru format) + * (remember, we cheated and init_val == pkey_reg format) */ - shadow_pkru |= (init_val << (ret * 2)); + shadow_pkey_reg |= (init_val << (ret * 2)); } - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); - dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); /* for shadow checking: */ - rdpkru(); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + read_pkey_reg(); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); return ret; } @@ -638,8 +647,8 @@ int alloc_random_pkey(void) free_ret = sys_pkey_free(alloced_pkeys[i]); pkey_assert(!free_ret); } - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; 
} @@ -657,11 +666,13 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, if (nr_iterations-- < 0) break; - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); sys_pkey_free(rpkey); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); } pkey_assert(pkey < NR_PKEYS); @@ -669,8 +680,8 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", ptr, size, orig_prot, pkey, ret); pkey_assert(!ret); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -752,7 +763,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) void *ptr; int ret; - rdpkru(); + read_pkey_reg(); dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, size, prot, pkey); pkey_assert(pkey < NR_PKEYS); @@ -761,7 +772,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); pkey_assert(!ret); record_pkey_malloc(ptr, size, prot); - rdpkru(); + read_pkey_reg(); dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); return ptr; @@ -924,14 +935,14 @@ void *malloc_pkey(long size, int prot, u16 pkey) return ret; } -int last_pkru_faults; +int last_pkey_faults; #define UNKNOWN_PKEY -2 -void expected_pk_fault(int pkey) +void expected_pkey_fault(int pkey) { - dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", - __func__, last_pkru_faults, pkru_faults); + 
dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", + __func__, last_pkey_faults, pkey_faults); dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); - pkey_assert(last_pkru_faults + 1 == pkru_faults); + pkey_assert(last_pkey_faults + 1 == pkey_faults); /* * For exec-only memory, we do not know the pkey in @@ -941,23 +952,23 @@ void expected_pk_fault(int pkey) pkey_assert(last_si_pkey == pkey); /* - * The signal handler shold have cleared out PKRU to let the + * The signal handler shold have cleared out PKEY register to let the * test program continue. We now have to restore it. */ - if (__rdpkru() != 0) + if (__read_pkey_reg() != 0) pkey_assert(0); - __wrpkru(shadow_pkru); - dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n", - __func__, shadow_pkru); - last_pkru_faults = pkru_faults; + __write_pkey_reg(shadow_pkey_reg); + dprintf1("%s() set pkey_reg=%x to restore state after signal " + "nuked it\n", __func__, shadow_pkey_reg); + last_pkey_faults = pkey_faults; last_si_pkey = -1; } -#define do_not_expect_pk_fault(msg) do { \ - if (last_pkru_faults != pkru_faults) \ - dprintf0("unexpected PK fault: %s\n", msg); \ - pkey_assert(last_pkru_faults == pkru_faults); \ +#define do_not_expect_pkey_fault(msg) do { \ + if (last_pkey_faults != pkey_faults) \ + dprintf0("unexpected PKey fault: %s\n", msg); \ + pkey_assert(last_pkey_faults == pkey_faults); \ } while (0) int test_fds[10] = { -1 }; @@ -1015,25 +1026,25 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey) int ptr_contents; dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); - rdpkru(); + read_pkey_reg(); pkey_access_deny(pkey); ptr_contents = read_ptr(ptr); dprintf1("*ptr: %d\n", ptr_contents); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); pkey_write_deny(pkey); *ptr = __LINE__; - 
expected_pk_fault(pkey); + expected_pkey_fault(pkey); } void test_write_of_access_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); pkey_access_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) { @@ -1160,9 +1171,10 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) int new_pkey; dprintf1("%s() alloc loop: %d\n", __func__, i); new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, err, __rdpkru(), shadow_pkru); - rdpkru(); /* for shadow checking */ + dprintf4("%s()::%d, err: %d pkey_reg: 0x%x shadow: 0x%x\n", + __func__, __LINE__, err, __read_pkey_reg(), + shadow_pkey_reg); + read_pkey_reg(); /* for shadow checking */ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); if ((new_pkey == -1) && (errno == ENOSPC)) { dprintf2("%s() failed to allocate pkey after %d tries\n", @@ -1201,7 +1213,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) for (i = 0; i < nr_allocated_pkeys; i++) { err = sys_pkey_free(allocated_pkeys[i]); pkey_assert(!err); - rdpkru(); /* for shadow checking */ + read_pkey_reg(); /* for shadow checking */ } } @@ -1287,7 +1299,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect an exception: */ peek_result = read_ptr(ptr); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); /* * Try to access the NON-pkey-protected "plain_ptr" via ptrace: @@ -1297,7 +1309,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect NO exception: */ peek_result = read_ptr(plain_ptr); - do_not_expect_pk_fault("read plain pointer after ptrace"); + do_not_expect_pkey_fault("read plain pointer after ptrace"); ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); pkey_assert(ret != -1); @@ -1347,17 +1359,17 @@ void 
test_executing_on_unreadable_memory(int *ptr, u16 pkey) pkey_assert(!ret); pkey_access_deny(pkey); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %x\n", read_pkey_reg()); /* * Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); ptr_contents = read_ptr(p1); dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1378,15 +1390,15 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); pkey_assert(!ret); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkru: %x\n", read_pkey_reg()); /* Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); ptr_contents = read_ptr(p1); dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(UNKNOWN_PKEY); + expected_pkey_fault(UNKNOWN_PKEY); /* * Put the memory back to non-PROT_EXEC. 
Should clear the @@ -1400,7 +1412,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); pkey_assert(!ret); ptr_contents = read_ptr(p1); - do_not_expect_pk_fault("plain read on recently PROT_EXEC area"); + do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); } void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) @@ -1442,7 +1454,7 @@ void run_tests_once(void) for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { int pkey; - int orig_pkru_faults = pkru_faults; + int orig_pkey_faults = pkey_faults; dprintf1("======================\n"); dprintf1("test %d preparing...\n", test_nr); @@ -1457,8 +1469,8 @@ void run_tests_once(void) free_pkey_malloc(ptr); sys_pkey_free(pkey); - dprintf1("pkru_faults: %d\n", pkru_faults); - dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults); + dprintf1("pkey_faults: %d\n", pkey_faults); + dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); tracing_off(); close_test_fds(); @@ -1471,7 +1483,7 @@ void run_tests_once(void) void pkey_setup_shadow(void) { - shadow_pkru = __rdpkru(); + shadow_pkey_reg = __read_pkey_reg(); } int main(void) @@ -1495,7 +1507,7 @@ int main(void) } pkey_setup_shadow(); - printf("startup pkru: %x\n", rdpkru()); + printf("startup pkey_reg: %x\n", read_pkey_reg()); setup_hugetlbfs(); while (nr_iterations-- > 0) -- cgit v1.2.3 From 5461c6625f2961ec21541604c9043c688aa176e0 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:51:41 -0700 Subject: selftests/vm/pkeys: move generic definitions to header file Moved all the generic definition and helper functions to the header file. Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. 
Nunes do Rosario" Cc: Ingo Molnar Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/57177f99e92a51295956715d5f2d5688a4d13927.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-helpers.h | 35 ++++++++++++++++++++++++---- tools/testing/selftests/vm/protection_keys.c | 27 --------------------- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index d5779be4793f..6ad1bd54ef94 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -13,6 +13,14 @@ #include #include +/* Define some kernel-like types */ +#define u8 uint8_t +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + #define NR_PKEYS 16 #define PKEY_BITS_PER_PKEY 2 @@ -53,6 +61,18 @@ static inline void sigsafe_printf(const char *format, ...) #define dprintf3(args...) dprintf_level(3, args) #define dprintf4(args...) 
dprintf_level(4, args) +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + exit(__LINE__); \ + } \ +} while (0) + extern unsigned int shadow_pkey_reg; static inline unsigned int __read_pkey_reg(void) { @@ -137,11 +157,6 @@ static inline void __pkey_write_allow(int pkey, int do_allow_write) dprintf4("pkey_reg now: %08x\n", read_pkey_reg()); } -#define PROT_PKEY0 0x10 /* protection key value (bit 0) */ -#define PROT_PKEY1 0x20 /* protection key value (bit 1) */ -#define PROT_PKEY2 0x40 /* protection key value (bit 2) */ -#define PROT_PKEY3 0x80 /* protection key value (bit 3) */ - #define PAGE_SIZE 4096 #define MB (1<<20) @@ -219,4 +234,14 @@ int pkey_reg_xstate_offset(void) return xstate_offset; } +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) +#define ALIGN_PTR_UP(p, ptr_align_to) \ + ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) +#define ALIGN_PTR_DOWN(p, ptr_align_to) \ + ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + #endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 2f4ab81c570d..42ffb58810f2 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -51,31 +51,10 @@ int test_nr; unsigned int shadow_pkey_reg; #define HPAGE_SIZE (1UL<<21) -#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) -#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) -#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) -#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) -#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) -#define __stringify_1(x...) #x -#define __stringify(x...) __stringify_1(x) - -#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) int dprint_in_signal; char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -extern void abort_hooks(void); -#define pkey_assert(condition) do { \ - if (!(condition)) { \ - dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ - __FILE__, __LINE__, \ - test_nr, iteration_nr); \ - dprintf0("errno at assert: %d", errno); \ - abort_hooks(); \ - exit(__LINE__); \ - } \ -} while (0) - void cat_into_file(char *str, char *file) { int fd = open(file, O_RDWR); @@ -186,12 +165,6 @@ void lots_o_noops_around_write(int *write_to_me) dprintf3("%s() done\n", __func__); } -/* Define some kernel-like types */ -#define u8 uint8_t -#define u16 uint16_t -#define u32 uint32_t -#define u64 uint64_t - #ifdef __i386__ #ifndef SYS_mprotect_key -- cgit v1.2.3 From 53555e2b4d9a29ff93d9bedfe209328cc69806be Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 4 Jun 2020 16:51:44 -0700 Subject: selftests/vm/pkeys: move some definitions to arch-specific header In preparation for multi-arch support, move definitions which have arch-specific values to x86-specific header. 
Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/d58eba2930059c8b209eefd6d5b48fe922a5b010.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-helpers.h | 111 ++----------------- tools/testing/selftests/vm/pkey-x86.h | 156 +++++++++++++++++++++++++++ tools/testing/selftests/vm/protection_keys.c | 47 -------- 3 files changed, 162 insertions(+), 152 deletions(-) create mode 100644 tools/testing/selftests/vm/pkey-x86.h diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 6ad1bd54ef94..3ed2f021bf7a 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -21,9 +21,6 @@ #define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) -#define NR_PKEYS 16 -#define PKEY_BITS_PER_PKEY 2 - #ifndef DEBUG_LEVEL #define DEBUG_LEVEL 0 #endif @@ -73,19 +70,13 @@ extern void abort_hooks(void); } \ } while (0) +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#include "pkey-x86.h" +#else /* arch */ +#error Architecture not supported +#endif /* arch */ + extern unsigned int shadow_pkey_reg; -static inline unsigned int __read_pkey_reg(void) -{ - unsigned int eax, edx; - unsigned int ecx = 0; - unsigned int pkey_reg; - - asm volatile(".byte 0x0f,0x01,0xee\n\t" - : "=a" (eax), "=d" (edx) - : "c" (ecx)); - pkey_reg = eax; - return pkey_reg; -} static inline unsigned int _read_pkey_reg(int line) { @@ -100,19 +91,6 @@ static inline unsigned int _read_pkey_reg(int line) #define read_pkey_reg() _read_pkey_reg(__LINE__) -static inline void __write_pkey_reg(unsigned int pkey_reg) -{ - unsigned int eax = pkey_reg; - unsigned int ecx = 0; - unsigned int edx = 0; 
- - dprintf4("%s() changing %08x to %08x\n", __func__, - __read_pkey_reg(), pkey_reg); - asm volatile(".byte 0x0f,0x01,0xef\n\t" - : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkey_reg == __read_pkey_reg()); -} - static inline void write_pkey_reg(unsigned int pkey_reg) { dprintf4("%s() changing %08x to %08x\n", __func__, @@ -157,83 +135,6 @@ static inline void __pkey_write_allow(int pkey, int do_allow_write) dprintf4("pkey_reg now: %08x\n", read_pkey_reg()); } -#define PAGE_SIZE 4096 -#define MB (1<<20) - -static inline void __cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile( - "cpuid;" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); -} - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ -#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ - -static inline int cpu_has_pku(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - eax = 0x7; - ecx = 0x0; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (!(ecx & X86_FEATURE_PKU)) { - dprintf2("cpu does not have PKU\n"); - return 0; - } - if (!(ecx & X86_FEATURE_OSPKE)) { - dprintf2("cpu does not have OSPKE\n"); - return 0; - } - return 1; -} - -#define XSTATE_PKEY_BIT (9) -#define XSTATE_PKEY 0x200 - -int pkey_reg_xstate_offset(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int xstate_offset; - int xstate_size; - unsigned long XSTATE_CPUID = 0xd; - int leaf; - - /* assume that XSTATE_PKEY is set in XCR0 */ - leaf = XSTATE_PKEY_BIT; - { - eax = XSTATE_CPUID; - ecx = leaf; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (leaf == XSTATE_PKEY_BIT) { - xstate_offset = ebx; - xstate_size = eax; - } - } - - if (xstate_size == 0) { - printf("could not find size/offset of PKEY in xsave state\n"); - return 0; - } - - 
return xstate_offset; -} - #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) #define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) #define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h new file mode 100644 index 000000000000..2f04ade8ca9c --- /dev/null +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_X86_H +#define _PKEYS_X86_H + +#ifdef __i386__ + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif + +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x14 + +#else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif + +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 + +#endif + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#define NR_PKEYS 16 +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL<<21) +#define PAGE_SIZE 4096 +#define MB (1<<20) + +static inline void __page_o_noops(void) +{ + /* 8-bytes of instruction * 512 bytes = 1 page */ + asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); +} + +static inline unsigned int __read_pkey_reg(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned int pkey_reg; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; + return pkey_reg; +} + +static inline void __write_pkey_reg(unsigned int pkey_reg) +{ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %08x to %08x\n", __func__, + __read_pkey_reg(), pkey_reg); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkey_reg == 
__read_pkey_reg()); +} + +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile( + "cpuid;" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ +#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ + +static inline int cpu_has_pku(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + eax = 0x7; + ecx = 0x0; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (!(ecx & X86_FEATURE_PKU)) { + dprintf2("cpu does not have PKU\n"); + return 0; + } + if (!(ecx & X86_FEATURE_OSPKE)) { + dprintf2("cpu does not have OSPKE\n"); + return 0; + } + return 1; +} + +#define XSTATE_PKEY_BIT (9) +#define XSTATE_PKEY 0x200 + +int pkey_reg_xstate_offset(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + int xstate_offset; + int xstate_size; + unsigned long XSTATE_CPUID = 0xd; + int leaf; + + /* assume that XSTATE_PKEY is set in XCR0 */ + leaf = XSTATE_PKEY_BIT; + { + eax = XSTATE_CPUID; + ecx = leaf; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (leaf == XSTATE_PKEY_BIT) { + xstate_offset = ebx; + xstate_size = eax; + } + } + + if (xstate_size == 0) { + printf("could not find size/offset of PKEY in xsave state\n"); + return 0; + } + + return xstate_offset; +} + +#endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 42ffb58810f2..2d0e881f109d 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -49,9 +49,6 @@ int iteration_nr = 1; int test_nr; unsigned int shadow_pkey_reg; - -#define HPAGE_SIZE (1UL<<21) - int dprint_in_signal; char 
dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; @@ -137,12 +134,6 @@ void abort_hooks(void) #endif } -static inline void __page_o_noops(void) -{ - /* 8-bytes of instruction * 512 bytes = 1 page */ - asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); -} - /* * This attempts to have roughly a page of instructions followed by a few * instructions that do a write, and another page of instructions. That @@ -165,36 +156,6 @@ void lots_o_noops_around_write(int *write_to_me) dprintf3("%s() done\n", __func__); } -#ifdef __i386__ - -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 380 -#endif - -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 381 -# define SYS_pkey_free 382 -#endif - -#define REG_IP_IDX REG_EIP -#define si_pkey_offset 0x14 - -#else - -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 329 -#endif - -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 330 -# define SYS_pkey_free 331 -#endif - -#define REG_IP_IDX REG_RIP -#define si_pkey_offset 0x20 - -#endif - void dump_mem(void *dumpme, int len_bytes) { char *c = (void *)dumpme; @@ -367,14 +328,6 @@ pid_t fork_lazy_child(void) return forkret; } -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x1 -#endif - -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif - static u32 hw_pkey_get(int pkey, unsigned long flags) { u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); -- cgit v1.2.3 From a09160e694ccd822365da28f9e5f94d22a6784b0 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 4 Jun 2020 16:51:47 -0700 Subject: selftests/vm/pkeys: make gcc check arguments of sigsafe_printf() This will help us ensure we print pkey_reg_t values correctly in different architectures. Signed-off-by: Thiago Jung Bauermann Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: "Desnes A. 
Nunes do Rosario" Cc: Florian Weimer Cc: Ingo Molnar Cc: Ram Pai Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/b40b7a95fdd4045d62530a2a34452934caf3b0bc.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-helpers.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 3ed2f021bf7a..7f18a82e54fc 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -27,6 +27,10 @@ #define DPRINT_IN_SIGNAL_BUF_SIZE 4096 extern int dprint_in_signal; extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif static inline void sigsafe_printf(const char *format, ...) { va_list ap; -- cgit v1.2.3 From 4dbdd947cb7f5534bedfdd1dbf983d0c0d9def29 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 4 Jun 2020 16:51:51 -0700 Subject: selftests: vm: pkeys: Use sane types for pkey register The size of the pkey register can vary across architectures. This converts the data type of all its references to u64 in preparation for multi-arch support. To keep the definition of the u64 type consistent and remove format specifier related warnings, __SANE_USERSPACE_TYPES__ is defined as suggested by Michael Ellerman. Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: "Desnes A. 
Nunes do Rosario" Cc: Florian Weimer Cc: Ingo Molnar Cc: Ram Pai Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/d3e271798455d940e395e56e1ff1e82a31bcb7aa.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-helpers.h | 31 +++++----- tools/testing/selftests/vm/pkey-x86.h | 8 +-- tools/testing/selftests/vm/protection_keys.c | 86 +++++++++++++++++----------- 3 files changed, 72 insertions(+), 53 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 7f18a82e54fc..dfbce49269ce 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -14,10 +14,10 @@ #include /* Define some kernel-like types */ -#define u8 uint8_t -#define u16 uint16_t -#define u32 uint32_t -#define u64 uint64_t +#define u8 __u8 +#define u16 __u16 +#define u32 __u32 +#define u64 __u64 #define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) @@ -80,13 +80,14 @@ extern void abort_hooks(void); #error Architecture not supported #endif /* arch */ -extern unsigned int shadow_pkey_reg; +extern u64 shadow_pkey_reg; -static inline unsigned int _read_pkey_reg(int line) +static inline u64 _read_pkey_reg(int line) { - unsigned int pkey_reg = __read_pkey_reg(); + u64 pkey_reg = __read_pkey_reg(); - dprintf4("read_pkey_reg(line=%d) pkey_reg: %x shadow: %x\n", + dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" + " shadow: %016llx\n", line, pkey_reg, shadow_pkey_reg); assert(pkey_reg == shadow_pkey_reg); @@ -95,15 +96,15 @@ static inline unsigned int _read_pkey_reg(int line) #define read_pkey_reg() _read_pkey_reg(__LINE__) -static inline void write_pkey_reg(unsigned int pkey_reg) +static inline void write_pkey_reg(u64 pkey_reg) { - dprintf4("%s() changing %08x to %08x\n", __func__, + dprintf4("%s() changing %016llx to %016llx\n", __func__, __read_pkey_reg(), 
pkey_reg); /* will do the shadow check for us: */ read_pkey_reg(); __write_pkey_reg(pkey_reg); shadow_pkey_reg = pkey_reg; - dprintf4("%s(%08x) pkey_reg: %08x\n", __func__, + dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, pkey_reg, __read_pkey_reg()); } @@ -113,7 +114,7 @@ static inline void write_pkey_reg(unsigned int pkey_reg) */ static inline void __pkey_access_allow(int pkey, int do_allow) { - unsigned int pkey_reg = read_pkey_reg(); + u64 pkey_reg = read_pkey_reg(); int bit = pkey * 2; if (do_allow) @@ -121,13 +122,13 @@ static inline void __pkey_access_allow(int pkey, int do_allow) else pkey_reg |= (1< #include #include @@ -48,7 +49,7 @@ int iteration_nr = 1; int test_nr; -unsigned int shadow_pkey_reg; +u64 shadow_pkey_reg; int dprint_in_signal; char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; @@ -163,7 +164,7 @@ void dump_mem(void *dumpme, int len_bytes) for (i = 0; i < len_bytes; i += sizeof(u64)) { u64 *ptr = (u64 *)(c + i); - dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr); + dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); } } @@ -205,7 +206,8 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) dprint_in_signal = 1; dprintf1(">>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkey_reg: 0x%x shadow: %x\n", __func__, __LINE__, + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; @@ -213,8 +215,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) fpregset = uctxt->uc_mcontext.fpregs; fpregs = (void *)fpregset; - dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__, - trapno, ip, si_code_str(si->si_code), si->si_code); + dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", + __func__, trapno, ip, si_code_str(si->si_code), + si->si_code); #ifdef __i386__ /* * 32-bit has some extra padding so that userspace can tell whether @@ -256,8 
+259,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) * need __read_pkey_reg() version so we do not do shadow_pkey_reg * checking */ - dprintf1("signal pkey_reg from pkey_reg: %08x\n", __read_pkey_reg()); - dprintf1("pkey from siginfo: %jx\n", siginfo_pkey); + dprintf1("signal pkey_reg from pkey_reg: %016llx\n", + __read_pkey_reg()); + dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); *(u64 *)pkey_reg_ptr = 0x00000000; dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); pkey_faults++; @@ -331,16 +335,17 @@ pid_t fork_lazy_child(void) static u32 hw_pkey_get(int pkey, unsigned long flags) { u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 pkey_reg = __read_pkey_reg(); - u32 shifted_pkey_reg; + u64 pkey_reg = __read_pkey_reg(); + u64 shifted_pkey_reg; u32 masked_pkey_reg; dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkey_reg: %x\n", __func__, pkey_reg); + dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); shifted_pkey_reg = (pkey_reg >> (pkey * PKEY_BITS_PER_PKEY)); - dprintf2("%s() shifted_pkey_reg: %x\n", __func__, shifted_pkey_reg); + dprintf2("%s() shifted_pkey_reg: %016llx\n", __func__, + shifted_pkey_reg); masked_pkey_reg = shifted_pkey_reg & mask; dprintf2("%s() masked pkey_reg: %x\n", __func__, masked_pkey_reg); /* @@ -353,8 +358,8 @@ static u32 hw_pkey_get(int pkey, unsigned long flags) static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) { u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 old_pkey_reg = __read_pkey_reg(); - u32 new_pkey_reg; + u64 old_pkey_reg = __read_pkey_reg(); + u64 new_pkey_reg; /* make sure that 'rights' only contains the bits we expect: */ assert(!(rights & ~mask)); @@ -369,7 +374,7 @@ static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) __write_pkey_reg(new_pkey_reg); dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" - " pkey_reg now: %x old_pkey_reg: 
%x\n", + " pkey_reg now: %016llx old_pkey_reg: %016llx\n", __func__, pkey, rights, flags, 0, __read_pkey_reg(), old_pkey_reg); return 0; @@ -380,7 +385,7 @@ void pkey_disable_set(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights; - u32 orig_pkey_reg = read_pkey_reg(); + u64 orig_pkey_reg = read_pkey_reg(); dprintf1("START->%s(%d, 0x%x)\n", __func__, pkey, flags); @@ -390,6 +395,7 @@ void pkey_disable_set(int pkey, int flags) dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); pkey_rights |= flags; @@ -398,7 +404,8 @@ void pkey_disable_set(int pkey, int flags) assert(!ret); /* pkey_reg and flags have the same format */ shadow_pkey_reg |= flags << (pkey * 2); - dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkey_reg); + dprintf1("%s(%d) shadow: 0x%016llx\n", + __func__, pkey, shadow_pkey_reg); pkey_assert(ret >= 0); @@ -406,7 +413,8 @@ void pkey_disable_set(int pkey, int flags) dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, pkey, pkey, pkey_rights); - dprintf1("%s(%d) pkey_reg: 0x%x\n", __func__, pkey, read_pkey_reg()); + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", + __func__, pkey, read_pkey_reg()); if (flags) pkey_assert(read_pkey_reg() > orig_pkey_reg); dprintf1("END<---%s(%d, 0x%x)\n", __func__, @@ -418,7 +426,7 @@ void pkey_disable_clear(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u32 orig_pkey_reg = read_pkey_reg(); + u64 orig_pkey_reg = read_pkey_reg(); pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); @@ -436,7 +444,8 @@ void pkey_disable_clear(int pkey, int flags) dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, pkey, pkey, pkey_rights); - dprintf1("%s(%d) pkey_reg: 0x%x\n", __func__, pkey, read_pkey_reg()); + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, + pkey, read_pkey_reg()); if (flags) assert(read_pkey_reg() > orig_pkey_reg); } @@ -491,20 +500,22 @@ int 
alloc_pkey(void) int ret; unsigned long init_val = 0x0; - dprintf1("%s()::%d, pkey_reg: 0x%x shadow: %x\n", __func__, - __LINE__, __read_pkey_reg(), shadow_pkey_reg); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); ret = sys_pkey_alloc(0, init_val); /* * pkey_alloc() sets PKEY register, so we need to reflect it in * shadow_pkey_reg: */ - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); if (ret) { /* clear both the bits: */ shadow_pkey_reg &= ~(0x3 << (ret * 2)); - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); @@ -514,13 +525,15 @@ int alloc_pkey(void) */ shadow_pkey_reg |= (init_val << (ret * 2)); } - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); /* for shadow checking: */ read_pkey_reg(); - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; @@ -573,7 +586,8 @@ int alloc_random_pkey(void) free_ret = sys_pkey_free(alloced_pkeys[i]); pkey_assert(!free_ret); } - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", __func__, + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -592,11 +606,13 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, if (nr_iterations-- < 0) break; - dprintf1("%s()::%d, ret: %d 
pkey_reg: 0x%x shadow: 0x%x\n", + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); sys_pkey_free(rpkey); - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); } @@ -606,7 +622,8 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", ptr, size, orig_prot, pkey, ret); pkey_assert(!ret); - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", __func__, + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -885,7 +902,7 @@ void expected_pkey_fault(int pkey) pkey_assert(0); __write_pkey_reg(shadow_pkey_reg); - dprintf1("%s() set pkey_reg=%x to restore state after signal " + dprintf1("%s() set pkey_reg=%016llx to restore state after signal " "nuked it\n", __func__, shadow_pkey_reg); last_pkey_faults = pkey_faults; last_si_pkey = -1; @@ -1097,7 +1114,8 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) int new_pkey; dprintf1("%s() alloc loop: %d\n", __func__, i); new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkey_reg: 0x%x shadow: 0x%x\n", + dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, __LINE__, err, __read_pkey_reg(), shadow_pkey_reg); read_pkey_reg(); /* for shadow checking */ @@ -1285,7 +1303,7 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) pkey_assert(!ret); pkey_access_deny(pkey); - dprintf2("pkey_reg: %x\n", read_pkey_reg()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* * Make sure this is an *instruction* fault @@ -1316,7 +1334,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); pkey_assert(!ret); - 
dprintf2("pkru: %x\n", read_pkey_reg()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); @@ -1433,7 +1451,7 @@ int main(void) } pkey_setup_shadow(); - printf("startup pkey_reg: %x\n", read_pkey_reg()); + printf("startup pkey_reg: %016llx\n", read_pkey_reg()); setup_hugetlbfs(); while (nr_iterations-- > 0) -- cgit v1.2.3 From 0c416bcaef8dc9b8378b3e7fa7ce3e9ad5cedcd0 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 4 Jun 2020 16:51:54 -0700 Subject: selftests: vm: pkeys: add helpers for pkey bits This introduces some functions that help with setting or clearing bits of a particular pkey. This also adds an abstraction for getting a pkey's bit position in the pkey register as this may vary across architectures. Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: "Desnes A. Nunes do Rosario" Cc: Florian Weimer Cc: Ingo Molnar Cc: Ram Pai Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/2ad9705f4f68ca7e72155cc583415e5a979546f1.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-helpers.h | 22 +++++++++++++++++++ tools/testing/selftests/vm/pkey-x86.h | 5 +++++ tools/testing/selftests/vm/protection_keys.c | 32 ++++++++-------------------- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index dfbce49269ce..0e3da7c8d628 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -80,6 +80,28 @@ extern void abort_hooks(void); #error Architecture not supported #endif /* arch */ +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + /* mask 
out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other higher bits + */ + return ((reg >> shift) & PKEY_MASK); +} + extern u64 shadow_pkey_reg; static inline u64 _read_pkey_reg(int line) diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h index 6ffea27e2d2d..def2a1bcf6a5 100644 --- a/tools/testing/selftests/vm/pkey-x86.h +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -118,6 +118,11 @@ static inline int cpu_has_pku(void) return 1; } +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + #define XSTATE_PKEY_BIT (9) #define XSTATE_PKEY 0x200 diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index efa35cc6f6b9..bed9d4de12b4 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -334,25 +334,13 @@ pid_t fork_lazy_child(void) static u32 hw_pkey_get(int pkey, unsigned long flags) { - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); u64 pkey_reg = __read_pkey_reg(); - u64 shifted_pkey_reg; - u32 masked_pkey_reg; dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", __func__, pkey, flags, 0, 0); dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); - shifted_pkey_reg = (pkey_reg >> (pkey * PKEY_BITS_PER_PKEY)); - dprintf2("%s() shifted_pkey_reg: %016llx\n", __func__, - shifted_pkey_reg); - masked_pkey_reg = shifted_pkey_reg & mask; - dprintf2("%s() masked pkey_reg: %x\n", __func__, masked_pkey_reg); - /* - * shift down the relevant bits to the lowest two, then - * mask off all the other high bits. 
- */ - return masked_pkey_reg; + return (u32) get_pkey_bits(pkey_reg, pkey); } static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) @@ -364,12 +352,8 @@ static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) /* make sure that 'rights' only contains the bits we expect: */ assert(!(rights & ~mask)); - /* copy old pkey_reg */ - new_pkey_reg = old_pkey_reg; - /* mask out bits from pkey in old value: */ - new_pkey_reg &= ~(mask << (pkey * PKEY_BITS_PER_PKEY)); - /* OR in new bits for pkey: */ - new_pkey_reg |= (rights << (pkey * PKEY_BITS_PER_PKEY)); + /* modify bits accordingly in old pkey_reg and assign it */ + new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); __write_pkey_reg(new_pkey_reg); @@ -403,7 +387,7 @@ void pkey_disable_set(int pkey, int flags) ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); assert(!ret); /* pkey_reg and flags have the same format */ - shadow_pkey_reg |= flags << (pkey * 2); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); dprintf1("%s(%d) shadow: 0x%016llx\n", __func__, pkey, shadow_pkey_reg); @@ -437,7 +421,7 @@ void pkey_disable_clear(int pkey, int flags) pkey_rights |= flags; ret = hw_pkey_set(pkey, pkey_rights, 0); - shadow_pkey_reg &= ~(flags << (pkey * 2)); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); pkey_assert(ret >= 0); pkey_rights = hw_pkey_get(pkey, syscall_flags); @@ -513,7 +497,8 @@ int alloc_pkey(void) shadow_pkey_reg); if (ret) { /* clear both the bits: */ - shadow_pkey_reg &= ~(0x3 << (ret * 2)); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + ~PKEY_MASK); dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" " shadow: 0x%016llx\n", __func__, @@ -523,7 +508,8 @@ int alloc_pkey(void) * move the new state in from init_val * (remember, we cheated and init_val == pkey_reg format) */ - shadow_pkey_reg |= (init_val << (ret * 2)); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + init_val); } dprintf4("%s()::%d, 
ret: %d pkey_reg: 0x%016llx" " shadow: 0x%016llx\n", -- cgit v1.2.3 From 11551801a71c84b9713abf1b588933eea141e362 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:51:58 -0700 Subject: selftests/vm/pkeys: fix pkey_disable_clear() Currently, pkey_disable_clear() sets the specified bits instead of clearing them. This has been dead code up to now because its only callers i.e. pkey_access/write_allow() are also unused. Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/1f70bca60330a85dca42c3cd98212bb1cdf5a076.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/protection_keys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index bed9d4de12b4..4b1ddb526228 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -418,7 +418,7 @@ void pkey_disable_clear(int pkey, int flags) pkey, pkey, pkey_rights); pkey_assert(pkey_rights >= 0); - pkey_rights |= flags; + pkey_rights &= ~flags; ret = hw_pkey_set(pkey, pkey_rights, 0); shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); @@ -431,7 +431,7 @@ void pkey_disable_clear(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); if (flags) - assert(read_pkey_reg() > orig_pkey_reg); + assert(read_pkey_reg() < orig_pkey_reg); } void pkey_write_allow(int pkey) -- cgit v1.2.3 From ea5f95c3d6bb117abfe41fd2612f3213cf22b609 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:01 -0700 Subject: selftests/vm/pkeys: fix assertion in pkey_disable_set/clear() In some
cases, a pkey's bits need not necessarily change in a way that the value of the pkey register increases when performing a pkey_disable_set() or decreases when performing a pkey_disable_clear(). For example, on powerpc, if a pkey's current state is PKEY_DISABLE_ACCESS and we perform a pkey_write_disable() on it, the bits still remain the same. We will observe something similar when the pkey's current state is 0 and a pkey_access_enable() is performed on it. Either case would cause some assertions to fail. This fixes the problem. Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/8240665131e43fc93eed4eea8194676c1ea39a7f.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/protection_keys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 4b1ddb526228..7fd52d5c4bfd 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -400,7 +400,7 @@ void pkey_disable_set(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); if (flags) - pkey_assert(read_pkey_reg() > orig_pkey_reg); + pkey_assert(read_pkey_reg() >= orig_pkey_reg); dprintf1("END<---%s(%d, 0x%x)\n", __func__, pkey, flags); } @@ -431,7 +431,7 @@ void pkey_disable_clear(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); if (flags) - assert(read_pkey_reg() < orig_pkey_reg); + assert(read_pkey_reg() <= orig_pkey_reg); } void pkey_write_allow(int pkey) -- cgit v1.2.3 From 6e373263ce07eeaa6410843179535fbdf561fc31 Mon Sep 17 00:00:00 2001 
From: Ram Pai Date: Thu, 4 Jun 2020 16:52:05 -0700 Subject: selftests/vm/pkeys: fix alloc_random_pkey() to make it really random alloc_random_pkey() was allocating the same pkey every time. Not all pkeys were getting tested. This fixes it. Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/0162f55816d4e783a0d6e49e554d0ab9a3c9a23b.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/protection_keys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 7fd52d5c4bfd..9cc82b65f828 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -25,6 +25,7 @@ #define __SANE_USERSPACE_TYPES__ #include #include +#include #include #include #include @@ -546,10 +547,10 @@ int alloc_random_pkey(void) int nr_alloced = 0; int random_index; memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + srand((unsigned int)time(NULL)); /* allocate every possible key and make a note of which ones we got */ max_nr_pkey_allocs = NR_PKEYS; - max_nr_pkey_allocs = 1; for (i = 0; i < max_nr_pkey_allocs; i++) { int new_pkey = alloc_pkey(); if (new_pkey < 0) -- cgit v1.2.3 From 57bcb57da241a186e8174c58850cb0e8e21f77a9 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 4 Jun 2020 16:52:08 -0700 Subject: selftests: vm: pkeys: use the correct huge page size The huge page size can vary across architectures. This will ensure that the correct huge page size is used when accessing the hugetlb controls under sysfs. Instead of using a hardcoded page size (i.e.
2MB), this now uses the HPAGE_SIZE macro which is arch-specific. Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: "Desnes A. Nunes do Rosario" Cc: Florian Weimer Cc: Ingo Molnar Cc: Ram Pai Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/66882a5d6e45c73c3a52bc4aef9754e48afa4f88.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/protection_keys.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 9cc82b65f828..535e464e27e9 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -739,12 +739,15 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) } int hugetlb_setup_ok; +#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" #define GET_NR_HUGE_PAGES 10 void setup_hugetlbfs(void) { int err; int fd; - char buf[] = "123"; + char buf[256]; + long hpagesz_kb; + long hpagesz_mb; if (geteuid() != 0) { fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); @@ -755,11 +758,16 @@ void setup_hugetlbfs(void) /* * Now go make sure that we got the pages and that they - * are 2M pages. Someone might have made 1G the default. + * are PMD-level pages. Someone might have made PUD-level + * pages the default. 
*/ - fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); + hpagesz_kb = HPAGE_SIZE / 1024; + hpagesz_mb = hpagesz_kb / 1024; + sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); + fd = open(buf, O_RDONLY); if (fd < 0) { - perror("opening sysfs 2M hugetlb config"); + fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } @@ -767,13 +775,14 @@ void setup_hugetlbfs(void) err = read(fd, buf, sizeof(buf)-1); close(fd); if (err <= 0) { - perror("reading sysfs 2M hugetlb config"); + fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", - buf, GET_NR_HUGE_PAGES); + fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", + hpagesz_mb, buf, GET_NR_HUGE_PAGES); return; } -- cgit v1.2.3 From 604c496b227d300aac330aecd88ae5ffa28fbfc0 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:12 -0700 Subject: selftests/vm/pkeys: introduce generic pkey abstractions This introduces some generic abstractions and provides the corresponding architecture-specific implementations for these abstractions. Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A.
Nunes do Rosario" Cc: Ingo Molnar Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/1c977915e69fb7767fb0dbd55ac7656554b15b93.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-helpers.h | 12 ++++++++++++ tools/testing/selftests/vm/pkey-x86.h | 15 +++++++++++++++ tools/testing/selftests/vm/protection_keys.c | 8 ++------ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 0e3da7c8d628..621fb2a0a5ef 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -74,6 +74,9 @@ extern void abort_hooks(void); } \ } while (0) +__attribute__((noinline)) int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); + #if defined(__i386__) || defined(__x86_64__) /* arch */ #include "pkey-x86.h" #else /* arch */ @@ -172,4 +175,13 @@ static inline void __pkey_write_allow(int pkey, int do_allow_write) #define __stringify_1(x...) #x #define __stringify(x...) 
__stringify_1(x) +static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) +{ +#ifdef si_pkey + return &si->si_pkey; +#else + return (u32 *)(((u8 *)si) + si_pkey_offset); +#endif +} + #endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h index def2a1bcf6a5..a0c59d4f7af2 100644 --- a/tools/testing/selftests/vm/pkey-x86.h +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -42,6 +42,7 @@ #endif #define NR_PKEYS 16 +#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ #define PKEY_BITS_PER_PKEY 2 #define HPAGE_SIZE (1UL<<21) #define PAGE_SIZE 4096 @@ -158,4 +159,18 @@ int pkey_reg_xstate_offset(void) return xstate_offset; } +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + int ptr_contents; + + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pkey_fault(pkey); +} + #endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 535e464e27e9..57c71056c93d 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1307,9 +1307,7 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); do_not_expect_pkey_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pkey_fault(pkey); + expect_fault_on_read_execonly_key(p1, pkey); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1336,9 +1334,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); do_not_expect_pkey_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - 
dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pkey_fault(UNKNOWN_PKEY); + expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); /* * Put the memory back to non-PROT_EXEC. Should clear the -- cgit v1.2.3 From 589944b53b0f913b89f5b6bc50d53c38a252ba0f Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:15 -0700 Subject: selftests/vm/pkeys: introduce powerpc support This makes use of the abstractions added earlier and introduces support for powerpc. For powerpc, after receiving the SIGSEGV, the signal handler must explicitly restore access permissions for the faulting pkey to allow the test to continue. As this makes use of pkey_access_allow(), all of its dependencies and other similar functions have been moved ahead of the signal handler. [sandipan@linux.ibm.com: fix powerpc access right updates] Link: http://lkml.kernel.org/r/5f65cf37be993760de8112a88da194e3ccbb2bf8.1588959697.git.sandipan@linux.ibm.com Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. 
Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/b121e9fd33789ed9195276e32fe4e80bb6b88a31.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-helpers.h | 2 + tools/testing/selftests/vm/pkey-powerpc.h | 91 +++++++++ tools/testing/selftests/vm/protection_keys.c | 269 ++++++++++++++------------- 3 files changed, 234 insertions(+), 128 deletions(-) create mode 100644 tools/testing/selftests/vm/pkey-powerpc.h diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 621fb2a0a5ef..2f4b1eb3a680 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -79,6 +79,8 @@ void expected_pkey_fault(int pkey); #if defined(__i386__) || defined(__x86_64__) /* arch */ #include "pkey-x86.h" +#elif defined(__powerpc64__) /* arch */ +#include "pkey-powerpc.h" #else /* arch */ #error Architecture not supported #endif /* arch */ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h new file mode 100644 index 000000000000..3fded948856d --- /dev/null +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_POWERPC_H +#define _PKEYS_POWERPC_H + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 386 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 384 +# define SYS_pkey_free 385 +#endif +#define REG_IP_IDX PT_NIP +#define REG_TRAPNO PT_TRAP +#define gregs gp_regs +#define fpregs fp_regs +#define si_pkey_offset 0x20 + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#define NR_PKEYS 32 +#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey + and 24 other keys 
that cannot be + represented in the PTE */ +#define NR_RESERVED_PKEYS_64K 3 /* pkey-0, pkey-1 and exec-only-pkey */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL << 24) +#define PAGE_SIZE (1UL << 16) + +static inline u32 pkey_bit_position(int pkey) +{ + return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; +} + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg; + + asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 amr = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + asm volatile("isync; mtspr 0xd, %0; isync" + : : "r" ((unsigned long)(amr)) : "memory"); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pku(void) +{ + return 1; +} + +static inline int get_arch_reserved_keys(void) +{ + if (sysconf(_SC_PAGESIZE) == 4096) + return NR_RESERVED_PKEYS_4K; + else + return NR_RESERVED_PKEYS_64K; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + /* + * powerpc does not allow userspace to change permissions of exec-only + * keys since those keys are not allocated by userspace. The signal + * handler wont be able to reset the permissions, which means the code + * will infinitely continue to segfault here. 
+ */ + return; +} + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +#endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 57c71056c93d..e6de078a9196 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -169,6 +169,125 @@ void dump_mem(void *dumpme, int len_bytes) } } +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); + + return (u32) get_pkey_bits(pkey_reg, pkey); +} + +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u64 old_pkey_reg = __read_pkey_reg(); + u64 new_pkey_reg; + + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); + + /* modify bits accordingly in old pkey_reg and assign it */ + new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); + + __write_pkey_reg(new_pkey_reg); + + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" + " pkey_reg now: %016llx old_pkey_reg: %016llx\n", + __func__, pkey, rights, flags, 0, __read_pkey_reg(), + old_pkey_reg); + return 0; +} + +void pkey_disable_set(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u64 orig_pkey_reg = read_pkey_reg(); + + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /* pkey_reg and flags have the 
same format */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + dprintf1("%s(%d) shadow: 0x%016llx\n", + __func__, pkey, shadow_pkey_reg); + + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", + __func__, pkey, read_pkey_reg()); + if (flags) + pkey_assert(read_pkey_reg() >= orig_pkey_reg); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = hw_pkey_get(pkey, syscall_flags); + u64 orig_pkey_reg = read_pkey_reg(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights &= ~flags; + + ret = hw_pkey_set(pkey, pkey_rights, 0); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, + pkey, read_pkey_reg()); + if (flags) + assert(read_pkey_reg() <= orig_pkey_reg); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); +} + /* Failed address bound checks: */ #ifndef SEGV_BNDERR # define SEGV_BNDERR 3 @@ -199,11 +318,12 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) int trapno; unsigned long ip; char *fpregs; +#if defined(__i386__) || defined(__x86_64__) /* arch */ u32 *pkey_reg_ptr; + int 
pkey_reg_offset; +#endif /* arch */ u64 siginfo_pkey; u32 *si_pkey_ptr; - int pkey_reg_offset; - fpregset_t fpregset; dprint_in_signal = 1; dprintf1(">>>>===============SIGSEGV============================\n"); @@ -213,12 +333,13 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; - fpregset = uctxt->uc_mcontext.fpregs; - fpregs = (void *)fpregset; + fpregs = (char *) uctxt->uc_mcontext.fpregs; dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", __func__, trapno, ip, si_code_str(si->si_code), si->si_code); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ #ifdef __i386__ /* * 32-bit has some extra padding so that userspace can tell whether @@ -226,12 +347,10 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) * state. We just assume that it is here. */ fpregs += 0x70; -#endif +#endif /* i386 */ pkey_reg_offset = pkey_reg_xstate_offset(); pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); - dprintf1("siginfo: %p\n", si); - dprintf1(" fpregs: %p\n", fpregs); /* * If we got a PKEY fault, we *HAVE* to have at least one bit set in * here. 
@@ -240,6 +359,10 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) if (DEBUG_LEVEL > 4) dump_mem(pkey_reg_ptr - 128, 256); pkey_assert(*pkey_reg_ptr); +#endif /* arch */ + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || @@ -248,14 +371,13 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) exit(4); } - si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset); + si_pkey_ptr = siginfo_get_pkey_ptr(si); dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); dump_mem((u8 *)si_pkey_ptr - 8, 24); siginfo_pkey = *si_pkey_ptr; pkey_assert(siginfo_pkey < NR_PKEYS); last_si_pkey = siginfo_pkey; - dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); /* * need __read_pkey_reg() version so we do not do shadow_pkey_reg * checking @@ -263,8 +385,14 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) dprintf1("signal pkey_reg from pkey_reg: %016llx\n", __read_pkey_reg()); dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ + dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); *(u64 *)pkey_reg_ptr = 0x00000000; dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); +#elif defined(__powerpc64__) /* arch */ + /* restore access and let the faulting instruction continue */ + pkey_access_allow(siginfo_pkey); +#endif /* arch */ pkey_faults++; dprintf1("<<<<==================================================\n"); dprint_in_signal = 0; @@ -333,125 +461,6 @@ pid_t fork_lazy_child(void) return forkret; } -static u32 hw_pkey_get(int pkey, unsigned long flags) -{ - u64 pkey_reg = __read_pkey_reg(); - - dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", - __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); - - return (u32) get_pkey_bits(pkey_reg, pkey); -} - -static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) 
-{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u64 old_pkey_reg = __read_pkey_reg(); - u64 new_pkey_reg; - - /* make sure that 'rights' only contains the bits we expect: */ - assert(!(rights & ~mask)); - - /* modify bits accordingly in old pkey_reg and assign it */ - new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); - - __write_pkey_reg(new_pkey_reg); - - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" - " pkey_reg now: %016llx old_pkey_reg: %016llx\n", - __func__, pkey, rights, flags, 0, __read_pkey_reg(), - old_pkey_reg); - return 0; -} - -void pkey_disable_set(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights; - u64 orig_pkey_reg = read_pkey_reg(); - - dprintf1("START->%s(%d, 0x%x)\n", __func__, - pkey, flags); - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); - assert(!ret); - /* pkey_reg and flags have the same format */ - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); - dprintf1("%s(%d) shadow: 0x%016llx\n", - __func__, pkey, shadow_pkey_reg); - - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkey_reg: 0x%016llx\n", - __func__, pkey, read_pkey_reg()); - if (flags) - pkey_assert(read_pkey_reg() >= orig_pkey_reg); - dprintf1("END<---%s(%d, 0x%x)\n", __func__, - pkey, flags); -} - -void pkey_disable_clear(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u64 orig_pkey_reg = read_pkey_reg(); - - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 
- pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights &= ~flags; - - ret = hw_pkey_set(pkey, pkey_rights, 0); - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, - pkey, read_pkey_reg()); - if (flags) - assert(read_pkey_reg() <= orig_pkey_reg); -} - -void pkey_write_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); -} -void pkey_write_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_WRITE); -} -void pkey_access_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); -} -void pkey_access_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); -} - int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, unsigned long pkey) { @@ -890,11 +899,15 @@ void expected_pkey_fault(int pkey) if (pkey != UNKNOWN_PKEY) pkey_assert(last_si_pkey == pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ /* * The signal handler shold have cleared out PKEY register to let the * test program continue. We now have to restore it. */ if (__read_pkey_reg() != 0) +#else /* arch */ + if (__read_pkey_reg() != shadow_pkey_reg) +#endif /* arch */ pkey_assert(0); __write_pkey_reg(shadow_pkey_reg); -- cgit v1.2.3 From c63e5e7f9942ac18bc6cc7e8853ad709e72e9f8b Mon Sep 17 00:00:00 2001 From: "Desnes A. Nunes do Rosario" Date: Thu, 4 Jun 2020 16:52:19 -0700 Subject: selftests/vm/pkeys: fix number of reserved powerpc pkeys The number of reserved pkeys in a PowerNV environment is different from that on PowerVM or KVM. Tested on PowerVM and PowerNV environments. Signed-off-by: "Desnes A. 
Nunes do Rosario" Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Florian Weimer Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/0341a0ca961166814b44c9e724774672c18d54ca.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-powerpc.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h index 3fded948856d..7ad283d4524e 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -28,7 +28,10 @@ #define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey and 24 other keys that cannot be represented in the PTE */ -#define NR_RESERVED_PKEYS_64K 3 /* pkey-0, pkey-1 and exec-only-pkey */ +#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, + pkey-1 and exec-only key */ +#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, + pkey-31 and exec-only key */ #define PKEY_BITS_PER_PKEY 2 #define HPAGE_SIZE (1UL << 24) #define PAGE_SIZE (1UL << 16) @@ -66,12 +69,27 @@ static inline int cpu_has_pku(void) return 1; } +static inline bool arch_is_powervm() +{ + struct stat buf; + + if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) + return true; + + return false; +} + static inline int get_arch_reserved_keys(void) { if (sysconf(_SC_PAGESIZE) == 4096) return NR_RESERVED_PKEYS_4K; else - return NR_RESERVED_PKEYS_64K; + if (arch_is_powervm()) + return NR_RESERVED_PKEYS_64K_4KEYS; + else + return NR_RESERVED_PKEYS_64K_3KEYS; } void expect_fault_on_read_execonly_key(void *p1, int pkey) -- 
cgit v1.2.3 From b0acc5d6bf333583b535f5caf1539d90c78519c2 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:22 -0700 Subject: selftests/vm/pkeys: fix assertion in test_pkey_alloc_exhaust() Some pkeys which are valid on the hardware are reserved and not available for application use. These keys cannot be allocated. test_pkey_alloc_exhaust() tries to account for these and has an assertion which validates if all available pkeys have been exhaustively allocated. However, the expression that is currently used is only valid for x86. On powerpc, a pkey is additionally reserved as compared to x86. Hence, the assertion is made to use an arch-specific helper to get the correct count of reserved pkeys. Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/38b08d0318820ae46af3aa6048384fd8056c3df7.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/protection_keys.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index e6de078a9196..5fcbbc525364 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1153,6 +1153,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) dprintf3("%s()::%d\n", __func__, __LINE__); /* + * On x86: * There are 16 pkeys supported in hardware. Three are * allocated by the time we get here: * 1. The default key (0) @@ -1160,8 +1161,16 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) * 3. One allocated by the test code and passed in via * 'pkey' to this function. * Ensure that we can allocate at least another 13 (16-3).
+ * + * On powerpc: + * There are either 5, 28, 29 or 32 pkeys supported in + * hardware depending on the page size (4K or 64K) and + * platform (powernv or powervm). Four are allocated by + * the time we get here. These include pkey-0, pkey-1, + * exec-only pkey and the one allocated by the test code. + * Ensure that we can allocate the remaining. */ - pkey_assert(i >= NR_PKEYS-3); + pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); for (i = 0; i < nr_allocated_pkeys; i++) { err = sys_pkey_free(allocated_pkeys[i]); -- cgit v1.2.3 From 94c8a223ded59918536387a9c33fee29ca54fc7e Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:25 -0700 Subject: selftests/vm/pkeys: improve checks to determine pkey support For the pkeys subsystem to work, both the CPU and the kernel need to have support. So, additionally check if the kernel supports pkeys apart from the CPU feature checks. Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. 
Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/8fb76c63ebdadcf068ecd2d23731032e195cd364.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-helpers.h | 30 ++++++++++++++++++++++++++++ tools/testing/selftests/vm/pkey-powerpc.h | 3 ++- tools/testing/selftests/vm/pkey-x86.h | 2 +- tools/testing/selftests/vm/protection_keys.c | 7 ++++--- 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 2f4b1eb3a680..59ccdff18214 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -76,6 +76,8 @@ extern void abort_hooks(void); __attribute__((noinline)) int read_ptr(int *ptr); void expected_pkey_fault(int pkey); +int sys_pkey_alloc(unsigned long flags, unsigned long init_val); +int sys_pkey_free(unsigned long pkey); #if defined(__i386__) || defined(__x86_64__) /* arch */ #include "pkey-x86.h" @@ -186,4 +188,32 @@ static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) #endif } +static inline int kernel_has_pkeys(void) +{ + /* try allocating a key and see if it succeeds */ + int ret = sys_pkey_alloc(0, 0); + if (ret <= 0) { + return 0; + } + sys_pkey_free(ret); + return 1; +} + +static inline int is_pkeys_supported(void) +{ + /* check if the cpu supports pkeys */ + if (!cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return 0; + } + + /* check if the kernel supports pkeys */ + if (!kernel_has_pkeys()) { + dprintf1("SKIP: %s: no kernel support\n", __func__); + return 0; + } + + return 1; +} + #endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h index 7ad283d4524e..1f82caa6293b 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ 
b/tools/testing/selftests/vm/pkey-powerpc.h @@ -64,8 +64,9 @@ static inline void __write_pkey_reg(u64 pkey_reg) __func__, __read_pkey_reg(), pkey_reg); } -static inline int cpu_has_pku(void) +static inline int cpu_has_pkeys(void) { + /* No simple way to determine this */ return 1; } diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h index a0c59d4f7af2..6421b846aa16 100644 --- a/tools/testing/selftests/vm/pkey-x86.h +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -97,7 +97,7 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, #define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ -static inline int cpu_has_pku(void) +static inline int cpu_has_pkeys(void) { unsigned int eax; unsigned int ebx; diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 5fcbbc525364..95f173049f43 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1378,7 +1378,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) int size = PAGE_SIZE; int sret; - if (cpu_has_pku()) { + if (cpu_has_pkeys()) { dprintf1("SKIP: %s: no CPU support\n", __func__); return; } @@ -1447,12 +1447,13 @@ void pkey_setup_shadow(void) int main(void) { int nr_iterations = 22; + int pkeys_supported = is_pkeys_supported(); setup_handlers(); - printf("has pku: %d\n", cpu_has_pku()); + printf("has pkeys: %d\n", pkeys_supported); - if (!cpu_has_pku()) { + if (!pkeys_supported) { int size = PAGE_SIZE; int *ptr; -- cgit v1.2.3 From aef759db63fd4cfad5b585db96cf05f34d04cbd9 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:29 -0700 Subject: selftests/vm/pkeys: associate key on a mapped page and detect access violation Detect access-violation on a page to which access-disabled key is associated much after the page is mapped. 
Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/4a19cf9252c03dd883887e9002881599e6900d06.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/protection_keys.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 95f173049f43..f65d384ef6a0 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -984,6 +984,24 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey) dprintf1("*ptr: %d\n", ptr_contents); expected_pkey_fault(pkey); } + +void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", + pkey, ptr); + ptr_contents = read_ptr(ptr); + dprintf1("reading ptr before disabling the read : %d\n", + ptr_contents); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); @@ -1390,6 +1408,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_write_disabled_region, test_read_of_access_disabled_region, + test_read_of_access_disabled_region_with_page_already_mapped, test_write_of_write_disabled_region, test_write_of_access_disabled_region, test_kernel_write_of_access_disabled_region, -- cgit v1.2.3 From 39351c1326cf72d9b01c5faa3de23153727832f5 Mon Sep 17
00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:32 -0700 Subject: selftests/vm/pkeys: associate key on a mapped page and detect write violation Detect write-violation on a page to which write-disabled key is associated much after the page is mapped. Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/6bfe3b3832f8bcfb07d7f2cf116b45197f4587dd.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/protection_keys.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index f65d384ef6a0..cb31a5cdf6d9 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1002,6 +1002,17 @@ void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, expected_pkey_fault(pkey); } +void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling write access; after accessing the page, " + "to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); @@ -1410,6 +1421,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_access_disabled_region, test_read_of_access_disabled_region_with_page_already_mapped, test_write_of_write_disabled_region, + test_write_of_write_disabled_region_with_page_already_mapped, test_write_of_access_disabled_region, test_kernel_write_of_access_disabled_region, 
test_kernel_write_of_write_disabled_region, -- cgit v1.2.3 From 4e06e718afd71f99592a76a29167e9cd617b2b09 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:36 -0700 Subject: selftests/vm/pkeys: detect write violation on a mapped access-denied-key page Detect write-violation on a page to which access-disabled key is associated much after the page is mapped. Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/6a7dd4069ee18a2a51b207a55aa197f3f3c59753.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/protection_keys.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index cb31a5cdf6d9..8bb4de103874 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1027,6 +1027,18 @@ void test_write_of_access_disabled_region(int *ptr, u16 pkey) *ptr = __LINE__; expected_pkey_fault(pkey); } + +void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling access; after accessing the page, " + " to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) { int ret; @@ -1423,6 +1435,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_write_of_write_disabled_region, test_write_of_write_disabled_region_with_page_already_mapped, test_write_of_access_disabled_region, + test_write_of_access_disabled_region_with_page_already_mapped, test_kernel_write_of_access_disabled_region, 
test_kernel_write_of_write_disabled_region, test_kernel_gup_of_access_disabled_region, -- cgit v1.2.3 From 6e2c2d0fb7819a05765147286ed71d5bb96faa36 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:39 -0700 Subject: selftests/vm/pkeys: introduce a sub-page allocator This introduces a new allocator that allocates 4K hardware pages to back 64K linux pages. This allocator is available only on powerpc. Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/c4a82fa962ec71015b994fab1aaf83bdfd091553.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-helpers.h | 6 ++++++ tools/testing/selftests/vm/pkey-powerpc.h | 25 +++++++++++++++++++++++++ tools/testing/selftests/vm/pkey-x86.h | 5 +++++ tools/testing/selftests/vm/protection_keys.c | 1 + 4 files changed, 37 insertions(+) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 59ccdff18214..622a85848f61 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -28,6 +28,9 @@ extern int dprint_in_signal; extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; +extern int test_nr; +extern int iteration_nr; + #ifdef __GNUC__ __attribute__((format(printf, 1, 2))) #endif @@ -78,6 +81,9 @@ __attribute__((noinline)) int read_ptr(int *ptr); void expected_pkey_fault(int pkey); int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); +void record_pkey_malloc(void *ptr, long size, int prot); #if defined(__i386__) || defined(__x86_64__) 
/* arch */ #include "pkey-x86.h" diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h index 1f82caa6293b..2f7174ee90e5 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -107,4 +107,29 @@ void expect_fault_on_read_execonly_key(void *p1, int pkey) /* 4-byte instructions * 16384 = 64K page */ #define __page_o_noops() asm(".rept 16384 ; nop; .endr") +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + + ret = syscall(__NR_subpage_prot, ptr, size, NULL); + if (ret) { + perror("subpage_perm"); + return PTR_ERR_ENOTSUP; + } + + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + #endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h index 6421b846aa16..3be20f5d5275 100644 --- a/tools/testing/selftests/vm/pkey-x86.h +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -173,4 +173,9 @@ void expect_fault_on_read_execonly_key(void *p1, int pkey) expected_pkey_fault(pkey); } +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + #endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 8bb4de103874..d4952b57cc90 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -845,6 +845,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { malloc_pkey_with_mprotect, + 
malloc_pkey_with_mprotect_subpage, malloc_pkey_anon_huge, malloc_pkey_hugetlb /* can not do direct with the pkey_mprotect() API: -- cgit v1.2.3 From fa17437cb8405bd0904f0aff8fb80d5e8c854b04 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:43 -0700 Subject: selftests/vm/pkeys: test correct behaviour of pkey-0 Ensure that pkey-0 is allocated on start and that it can be attached dynamically in various modes, without failures. Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/9b7c54a9b4261894fe0c7e884c70b87214ff8fbb.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/protection_keys.c | 53 ++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index d4952b57cc90..a1cb9a71e77c 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -964,6 +964,58 @@ __attribute__((noinline)) int read_ptr(int *ptr) return *ptr; } +void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +{ + int i, err; + int max_nr_pkey_allocs; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + long size; + + pkey_assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. 
+ */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + + /* allocate every possible key and make sure key-0 never got allocated */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + pkey_assert(new_pkey != 0); + + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + /* free all the allocated keys */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + + /* attach key-0 in various modes */ + err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); + pkey_assert(!err); +} + void test_read_of_write_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1448,6 +1500,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_on_non_allocated_pkey, test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, + test_pkey_alloc_free_attach_pkey0, }; void run_tests_once(void) -- cgit v1.2.3 From e9506394a159a449bdf0427b71d5191b4f7fc618 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 4 Jun 2020 16:52:46 -0700 Subject: selftests/vm/pkeys: override access right definitions on powerpc Some platforms hardcode the x86 values for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE such as those in: /usr/include/bits/mman-shared.h. This overrides the definitions with correct values for powerpc. 
[sandipan@linux.ibm.com: fix powerpc access right definitions] Link: http://lkml.kernel.org/r/1ba86fd8a94f38131cfe2d9f277001dd1ad1d34e.1588959697.git.sandipan@linux.ibm.com Signed-off-by: Ram Pai Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: Dave Hansen Cc: Florian Weimer Cc: "Desnes A. Nunes do Rosario" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/f6eb38cb3a1e12eb2cdc9da6300bc5a5dfba0db9.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-powerpc.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h index 2f7174ee90e5..e4aa5977388b 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -16,13 +16,11 @@ #define fpregs fp_regs #define si_pkey_offset 0x20 -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ -#endif +#undef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif +#undef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 #define NR_PKEYS 32 #define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey -- cgit v1.2.3 From 473c3cc86c36026d22129660746f2f4447abb79d Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 4 Jun 2020 16:52:50 -0700 Subject: selftests: vm: pkeys: use the correct page size on powerpc Both 4K and 64K pages are supported on powerpc. Parts of the selftest code perform alignment computations based on the PAGE_SIZE macro which is currently hardcoded to 64K for powerpc. This causes some test failures on kernels configured with 4K page size. In some cases, we need to enforce function alignment on page size. 
Since this can only be done at build time, 64K is used as the alignment factor as that also ensures 4K alignment. Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Acked-by: Dave Hansen Cc: "Desnes A. Nunes do Rosario" Cc: Florian Weimer Cc: Ingo Molnar Cc: Ram Pai Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/5dcdfbf3353acdc90f315172e800b49f5ca21299.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/pkey-powerpc.h | 2 +- tools/testing/selftests/vm/protection_keys.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h index e4aa5977388b..1ebb586b2fbc 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -32,7 +32,7 @@ pkey-31 and exec-only key */ #define PKEY_BITS_PER_PKEY 2 #define HPAGE_SIZE (1UL << 24) -#define PAGE_SIZE (1UL << 16) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) static inline u32 pkey_bit_position(int pkey) { diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index a1cb9a71e77c..fc19addcb5c8 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -146,7 +146,12 @@ void abort_hooks(void) * will then fault, which makes sure that the fault code handles * execute-only memory properly. 
*/ +#ifdef __powerpc64__ +/* This way, both 4K and 64K alignment are maintained */ +__attribute__((__aligned__(65536))) +#else __attribute__((__aligned__(PAGE_SIZE))) +#endif void lots_o_noops_around_write(int *write_to_me) { dprintf3("running %s()\n", __func__); -- cgit v1.2.3 From f21fda8f64533787db3e3682fee5775eb0bbea5f Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 4 Jun 2020 16:52:54 -0700 Subject: selftests: vm: pkeys: fix multilib builds for x86 This ensures that both 32-bit and 64-bit binaries are generated when this is built on an x86_64 system. Most of the changes have been borrowed from tools/testing/selftests/x86/Makefile. Signed-off-by: Sandipan Das Signed-off-by: Andrew Morton Tested-by: Dave Hansen Acked-by: Dave Hansen Cc: "Desnes A. Nunes do Rosario" Cc: Florian Weimer Cc: Ingo Molnar Cc: Ram Pai Cc: Thiago Jung Bauermann Cc: "Aneesh Kumar K.V" Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Suchanek Cc: Shuah Khan Link: http://lkml.kernel.org/r/0326a442214d7a1b970d38296e63df3b217f5912.1585646528.git.sandipan@linux.ibm.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 74 ++++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index c3b559ea97c5..a9026706d597 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -15,7 +15,6 @@ TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_populate TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests -TEST_GEN_FILES += protection_keys TEST_GEN_FILES += mremap_dontunmap TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen @@ -23,6 +22,30 @@ TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd TEST_GEN_FILES += khugepaged +ifeq ($(ARCH),x86_64) +CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) +CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh
$(CC) ../x86/trivial_64bit_program.c) +CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_program.c -no-pie) + +TARGETS := protection_keys +BINARIES_32 := $(TARGETS:%=%_32) +BINARIES_64 := $(TARGETS:%=%_64) + +ifeq ($(CAN_BUILD_WITH_NOPIE),1) +CFLAGS += -no-pie +endif + +ifeq ($(CAN_BUILD_I386),1) +TEST_GEN_FILES += $(BINARIES_32) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +TEST_GEN_FILES += $(BINARIES_64) +endif +else +TEST_GEN_FILES += protection_keys +endif + ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64)) TEST_GEN_FILES += va_128TBswitch TEST_GEN_FILES += virtual_address_range @@ -38,6 +61,55 @@ include ../lib.mk $(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread +ifeq ($(ARCH),x86_64) +BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) +BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) + +define gen-target-rule-32 +$(1) $(1)_32: $(OUTPUT)/$(1)_32 +.PHONY: $(1) $(1)_32 +endef + +define gen-target-rule-64 +$(1) $(1)_64: $(OUTPUT)/$(1)_64 +.PHONY: $(1) $(1)_64 +endef + +ifeq ($(CAN_BUILD_I386),1) +$(BINARIES_32): CFLAGS += -m32 +$(BINARIES_32): LDLIBS += -lrt -ldl -lm +$(BINARIES_32): %_32: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t)))) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +$(BINARIES_64): CFLAGS += -m64 +$(BINARIES_64): LDLIBS += -lrt -ldl +$(BINARIES_64): %_64: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t)))) +endif + +# x86_64 users should be encouraged to install 32-bit libraries +ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) +all: warn_32bit_failure + +warn_32bit_failure: + @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \ + echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ + echo "kernels. 
If you are using a Debian-like distribution," 2>&1; \ + echo "try:"; 2>&1; \ + echo ""; \ + echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ + echo ""; \ + echo "If you are using a Fedora-like distribution, try:"; \ + echo ""; \ + echo " yum install glibc-devel.*i686"; \ + exit 0; +endif +endif + $(OUTPUT)/userfaultfd: LDLIBS += -lpthread $(OUTPUT)/mlock-random-test: LDLIBS += -lcap -- cgit v1.2.3 From 2792d488a2d36c80e8beb34d710d10d144b66c22 Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Thu, 4 Jun 2020 16:52:57 -0700 Subject: tools/testing/selftests/vm: remove duplicate headers Code cleanup: Remove duplicate headers which are included twice. Signed-off-by: Jagadeesh Pagadala Signed-off-by: Andrew Morton Cc: Shuah Khan Cc: Brian Geffon Link: http://lkml.kernel.org/r/1587278984-18847-1-git-send-email-jagdsh.linux@gmail.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/mremap_dontunmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c index ee06cb0b9efb..3a7b5ef0b0c6 100644 --- a/tools/testing/selftests/vm/mremap_dontunmap.c +++ b/tools/testing/selftests/vm/mremap_dontunmap.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include "../kselftest.h" -- cgit v1.2.3 From 469cbd016157d28c27fda8da6ddc76b856f4e1b9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Jun 2020 16:53:00 -0700 Subject: lib/ubsan.c: fix gcc-10 warnings The latest compiler expects slightly different function prototypes for the ubsan helpers: lib/ubsan.c:192:6: error: conflicting types for built-in function '__ubsan_handle_add_overflow'; expected 'void(void *, void *, void *)' [-Werror=builtin-declaration-mismatch] 192 | void __ubsan_handle_add_overflow(struct overflow_data *data, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ lib/ubsan.c:200:6: error: conflicting types for built-in function '__ubsan_handle_sub_overflow'; expected 'void(void *, void *, void 
*)' [-Werror=builtin-declaration-mismatch] 200 | void __ubsan_handle_sub_overflow(struct overflow_data *data, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ lib/ubsan.c:207:6: error: conflicting types for built-in function '__ubsan_handle_mul_overflow'; expected 'void(void *, void *, void *)' [-Werror=builtin-declaration-mismatch] 207 | void __ubsan_handle_mul_overflow(struct overflow_data *data, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ lib/ubsan.c:214:6: error: conflicting types for built-in function '__ubsan_handle_negate_overflow'; expected 'void(void *, void *)' [-Werror=builtin-declaration-mismatch] 214 | void __ubsan_handle_negate_overflow(struct overflow_data *data, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lib/ubsan.c:234:6: error: conflicting types for built-in function '__ubsan_handle_divrem_overflow'; expected 'void(void *, void *, void *)' [-Werror=builtin-declaration-mismatch] 234 | void __ubsan_handle_divrem_overflow(struct overflow_data *data, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Change the Linux implementation to match these, using a local typed pointer. 
Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Andrey Ryabinin Cc: Herbert Xu Cc: Julien Grall Link: http://lkml.kernel.org/r/20200429185948.4189600-1-arnd@arndb.de Signed-off-by: Linus Torvalds --- lib/ubsan.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/lib/ubsan.c b/lib/ubsan.c index f8c0ccf35f29..cb9af3f6b77e 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -189,7 +189,7 @@ static void handle_overflow(struct overflow_data *data, void *lhs, ubsan_epilogue(); } -void __ubsan_handle_add_overflow(struct overflow_data *data, +void __ubsan_handle_add_overflow(void *data, void *lhs, void *rhs) { @@ -197,23 +197,23 @@ void __ubsan_handle_add_overflow(struct overflow_data *data, } EXPORT_SYMBOL(__ubsan_handle_add_overflow); -void __ubsan_handle_sub_overflow(struct overflow_data *data, +void __ubsan_handle_sub_overflow(void *data, void *lhs, void *rhs) { handle_overflow(data, lhs, rhs, '-'); } EXPORT_SYMBOL(__ubsan_handle_sub_overflow); -void __ubsan_handle_mul_overflow(struct overflow_data *data, +void __ubsan_handle_mul_overflow(void *data, void *lhs, void *rhs) { handle_overflow(data, lhs, rhs, '*'); } EXPORT_SYMBOL(__ubsan_handle_mul_overflow); -void __ubsan_handle_negate_overflow(struct overflow_data *data, - void *old_val) +void __ubsan_handle_negate_overflow(void *_data, void *old_val) { + struct overflow_data *data = _data; char old_val_str[VALUE_LENGTH]; if (suppress_report(&data->location)) @@ -231,9 +231,9 @@ void __ubsan_handle_negate_overflow(struct overflow_data *data, EXPORT_SYMBOL(__ubsan_handle_negate_overflow); -void __ubsan_handle_divrem_overflow(struct overflow_data *data, - void *lhs, void *rhs) +void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) { + struct overflow_data *data = _data; char rhs_val_str[VALUE_LENGTH]; if (suppress_report(&data->location)) @@ -326,10 +326,9 @@ void __ubsan_handle_type_mismatch(struct type_mismatch_data *data, 
} EXPORT_SYMBOL(__ubsan_handle_type_mismatch); -void __ubsan_handle_type_mismatch_v1(struct type_mismatch_data_v1 *data, - void *ptr) +void __ubsan_handle_type_mismatch_v1(void *_data, void *ptr) { - + struct type_mismatch_data_v1 *data = _data; struct type_mismatch_data_common common_data = { .location = &data->location, .type = data->type, @@ -341,8 +340,9 @@ void __ubsan_handle_type_mismatch_v1(struct type_mismatch_data_v1 *data, } EXPORT_SYMBOL(__ubsan_handle_type_mismatch_v1); -void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index) +void __ubsan_handle_out_of_bounds(void *_data, void *index) { + struct out_of_bounds_data *data = _data; char index_str[VALUE_LENGTH]; if (suppress_report(&data->location)) @@ -357,9 +357,9 @@ void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index) } EXPORT_SYMBOL(__ubsan_handle_out_of_bounds); -void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data, - void *lhs, void *rhs) +void __ubsan_handle_shift_out_of_bounds(void *_data, void *lhs, void *rhs) { + struct shift_out_of_bounds_data *data = _data; struct type_descriptor *rhs_type = data->rhs_type; struct type_descriptor *lhs_type = data->lhs_type; char rhs_str[VALUE_LENGTH]; @@ -399,8 +399,9 @@ out: EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds); -void __ubsan_handle_builtin_unreachable(struct unreachable_data *data) +void __ubsan_handle_builtin_unreachable(void *_data) { + struct unreachable_data *data = _data; ubsan_prologue(&data->location, "unreachable"); pr_err("calling __builtin_unreachable()\n"); ubsan_epilogue(); @@ -408,9 +409,9 @@ void __ubsan_handle_builtin_unreachable(struct unreachable_data *data) } EXPORT_SYMBOL(__ubsan_handle_builtin_unreachable); -void __ubsan_handle_load_invalid_value(struct invalid_value_data *data, - void *val) +void __ubsan_handle_load_invalid_value(void *_data, void *val) { + struct invalid_value_data *data = _data; char val_str[VALUE_LENGTH]; if 
(suppress_report(&data->location)) -- cgit v1.2.3