From 329687a03d18143f491b535d22be1cccc291bb58 Mon Sep 17 00:00:00 2001 From: Jiajian Ye Date: Thu, 28 Apr 2022 23:15:56 -0700 Subject: tools/vm/page_owner_sort.c: use fprintf() to send error messages to stderr Error messages should be send to stderr using fprintf() instead of printf(). This work is coauthored by Yixuan Cao Shenghong Han Yinan Zhang Chongxi Zhao Yuhong Feng Yongqiang Liu Link: https://lkml.kernel.org/r/20220401024856.767-1-yejiajian2018@email.szu.edu.cn Signed-off-by: Jiajian Ye Cc: Shenghong Han Cc: Yixuan Cao Cc: Yinan Zhang Cc: Chongxi Zhao Cc: Yuhong Feng Cc: Yongqiang Liu Cc: Haowen Bai Cc: Sean Anderson Signed-off-by: Andrew Morton --- tools/vm/page_owner_sort.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 7d98e76c2291..6771003ed5f1 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -186,7 +186,7 @@ static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); if (err != 0 || pmatch[1].rm_so == -1) { - printf("no matching pattern in %s\n", buf); + fprintf(stderr, "no matching pattern in %s\n", buf); return -1; } val_len = pmatch[1].rm_eo - pmatch[1].rm_so; @@ -202,7 +202,7 @@ static void check_regcomp(regex_t *pattern, const char *regex) err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); if (err != 0 || pattern->re_nsub != 1) { - printf("Invalid pattern %s code %d\n", regex, err); + fprintf(stderr, "Invalid pattern %s code %d\n", regex, err); exit(1); } } @@ -251,7 +251,7 @@ static int get_page_num(char *buf) errno = 0; order_val = strtol(order_str, &endptr, 10); if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') { - printf("wrong order in follow buf:\n%s\n", buf); + fprintf(stderr, "wrong order in follow buf:\n%s\n", buf); return 0; } @@ -268,7 +268,7 @@ static pid_t get_pid(char *buf) errno = 0; pid = strtol(pid_str, &endptr, 10); if (errno != 0 || endptr == pid_str || *endptr != '\0') { - printf("wrong/invalid pid in follow buf:\n%s\n", buf); + fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf); return -1; } @@ -286,7 +286,7 @@ static pid_t get_tgid(char *buf) errno = 0; tgid = strtol(tgid_str, &endptr, 10); if (errno != 0 || endptr == tgid_str || *endptr != '\0') { - printf("wrong/invalid tgid in follow buf:\n%s\n", buf); + fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf); return -1; } @@ -304,7 +304,7 @@ static __u64 get_ts_nsec(char *buf) errno = 0; ts_nsec = strtoull(ts_nsec_str, &endptr, 10); if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') { - printf("wrong ts_nsec in follow buf:\n%s\n", buf); + fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf); return -1; } @@ -321,7 +321,7 @@ static __u64 get_free_ts_nsec(char *buf) errno = 0; free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { - printf("wrong free_ts_nsec in follow buf:\n%s\n", buf); + fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf); return -1; } @@ -337,7 +337,7 @@ static char *get_comm(char *buf) search_pattern(&comm_pattern, comm_str, buf); errno = 0; if (errno != 0) { - printf("wrong comm in follow buf:\n%s\n", buf); + fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf); return NULL; } @@ -373,7 +373,7 @@ static void add_list(char *buf, int len) return; } if (list_size == max_size) { - printf("max_size too small??\n"); + 
fprintf(stderr, "max_size too small??\n"); exit(1); } if (!is_need(buf)) @@ -383,7 +383,7 @@ static void add_list(char *buf, int len) list[list_size].comm = get_comm(buf); list[list_size].txt = malloc(len+1); if (!list[list_size].txt) { - printf("Out of memory\n"); + fprintf(stderr, "Out of memory\n"); exit(1); } memcpy(list[list_size].txt, buf, len); @@ -499,7 +499,8 @@ int main(int argc, char **argv) errno = 0; fc.pid = strtol(optarg, &endptr, 10); if (errno != 0 || endptr == optarg || *endptr != '\0') { - printf("wrong/invalid pid in from the command line:%s\n", optarg); + fprintf(stderr, "wrong/invalid pid in from the command line:%s\n", + optarg); exit(1); } break; @@ -508,7 +509,8 @@ int main(int argc, char **argv) errno = 0; fc.tgid = strtol(optarg, &endptr, 10); if (errno != 0 || endptr == optarg || *endptr != '\0') { - printf("wrong/invalid tgid in from the command line:%s\n", optarg); + fprintf(stderr, "wrong/invalid tgid in from the command line:%s\n", + optarg); exit(1); } break; @@ -519,7 +521,7 @@ int main(int argc, char **argv) break; case 4: if (!parse_cull_args(optarg)) { - printf("wrong argument after --cull in from the command line:%s\n", + fprintf(stderr, "wrong argument after --cull option:%s\n", optarg); exit(1); } @@ -554,7 +556,7 @@ int main(int argc, char **argv) list = malloc(max_size * sizeof(*list)); buf = malloc(BUF_SIZE); if (!list || !buf) { - printf("Out of memory\n"); + fprintf(stderr, "Out of memory\n"); exit(1); } -- cgit v1.2.3 From 75382a2dca0e9e9e57e88b479cf537549461a934 Mon Sep 17 00:00:00 2001 From: Jiajian Ye Date: Thu, 28 Apr 2022 23:15:56 -0700 Subject: tools/vm/page_owner_sort.c: support for multi-value selection in single argument When viewing page owner information, we may want to select blocks whose PID/TGID/TASK_COMM_NAME appears in a user-specified list for data analysis and aggregation. But currently page_owner_sort only supports selecting blocks associated with only one specified PID/TGID/TASK_COMM_NAME. Therefore, following adjustments are made to fix the problem: 1. Enhance selecting function to support the selection of multiple PIDs/TGIDs/TASK_COMM_NAMEs. The enhanced usages are as follows: --pid Select by pid. This selects the blocks whose PID numbers appear in . --tgid Select by tgid. This selects the blocks whose TGID numbers appear in . --name Select by task command name. This selects the blocks whose task command name appear in . Where , , are single arguments in the form of a comma-separated list,which offers a way to specify individual selecting rules. For example, if you want to select blocks whose tgids are 1, 2 or 3, you have to use 4 commands as follows: ./page_owner_sort --tgid=1 ./page_owner_sort --tgid=2 ./page_owner_sort --tgid=3 cat > With this patch, you can use only 1 command to obtain the same result as above: ./page_owner_sort --tgid=1,2,3 2. Update explanations of --pid, --tgid and --name in the function usage() and the document(Documents/vm/page_owner.rst). 
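A minimal standalone sketch of the comma-separated selection described above, for illustration only; the patch itself implements this with explode(), parse_nums_list() and match_num_list() in the diff below, and the helper pid_selected() here is made up for the example:

  #include <errno.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  /* Return true if 'pid' appears in a comma-separated list such as "1,2,3". */
  static bool pid_selected(const char *arg, long pid)
  {
          char *copy = strdup(arg);
          char *saveptr = NULL;
          bool found = false;

          if (!copy)
                  return false;
          for (char *tok = strtok_r(copy, ",", &saveptr); tok;
               tok = strtok_r(NULL, ",", &saveptr)) {
                  char *end = NULL;
                  long val;

                  errno = 0;
                  val = strtol(tok, &end, 10);
                  if (errno != 0 || end == tok || *end != '\0')
                          continue; /* this sketch just skips malformed entries */
                  if (val == pid) {
                          found = true;
                          break;
                  }
          }
          free(copy);
          return found;
  }

  int main(void)
  {
          printf("%d\n", pid_selected("1,2,3", 2)); /* prints 1 */
          printf("%d\n", pid_selected("1,2,3", 5)); /* prints 0 */
          return 0;
  }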
This work is coauthored by Yixuan Cao Shenghong Han Yinan Zhang Chongxi Zhao Yuhong Feng Yongqiang Liu Link: https://lkml.kernel.org/r/20220401024856.767-2-yejiajian2018@email.szu.edu.cn Signed-off-by: Jiajian Ye Cc: Chongxi Zhao Cc: Shenghong Han Cc: Yinan Zhang Cc: Yixuan Cao Cc: Yongqiang Liu Cc: Yuhong Feng Cc: Haowen Bai Cc: Sean Anderson Signed-off-by: Andrew Morton --- Documentation/vm/page_owner.rst | 20 ++++++++--- tools/vm/page_owner_sort.c | 78 ++++++++++++++++++++++++++++++----------- 2 files changed, 72 insertions(+), 26 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 7e0c3f574e78..3102f91d635c 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -129,7 +129,6 @@ Usage Specify culling rules.Culling syntax is key[,key[,...]].Choose a multi-letter key from the **STANDARD FORMAT SPECIFIERS** section. - is a single argument in the form of a comma-separated list, which offers a way to specify individual culling rules. The recognized keywords are described in the **STANDARD FORMAT SPECIFIERS** section below. @@ -137,7 +136,6 @@ Usage the STANDARD SORT KEYS section below. Mixed use of abbreviated and complete-form of keys is allowed. - Examples: ./page_owner_sort --cull=stacktrace ./page_owner_sort --cull=st,pid,name @@ -147,9 +145,21 @@ Usage -f Filter out the information of blocks whose memory has been released. Select: - --pid Select by pid. - --tgid Select by tgid. - --name Select by task command name. + --pid Select by pid. This selects the blocks whose process ID + numbers appear in . + --tgid Select by tgid. This selects the blocks whose thread + group ID numbers appear in . + --name Select by task command name. This selects the blocks whose + task command name appear in . + + , , are single arguments in the form of a comma-separated list, + which offers a way to specify individual selecting rules. + + + Examples: + ./page_owner_sort --pid=1 + ./page_owner_sort --tgid=1,2,3 + ./page_owner_sort --name name1,name2 STANDARD FORMAT SPECIFIERS ========================== diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 6771003ed5f1..16fb034c6a4e 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -54,9 +54,12 @@ enum CULL_BIT { CULL_STACKTRACE = 1<<5 }; struct filter_condition { - pid_t tgid; - pid_t pid; - char comm[TASK_COMM_LEN]; + pid_t *tgids; + int tgids_size; + pid_t *pids; + int pids_size; + char **comms; + int comms_size; }; static struct filter_condition fc; static regex_t order_pattern; @@ -149,7 +152,6 @@ static int compare_free_ts(const void *p1, const void *p2) return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; } - static int compare_release(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; @@ -161,7 +163,6 @@ static int compare_release(const void *p1, const void *p2) return l1->free_ts_nsec ? 
1 : -1; } - static int compare_cull_condition(const void *p1, const void *p2) { if (cull == 0) @@ -344,22 +345,40 @@ static char *get_comm(char *buf) return comm_str; } +static bool match_num_list(int num, int *list, int list_size) +{ + for (int i = 0; i < list_size; ++i) + if (list[i] == num) + return true; + return false; +} + +static bool match_str_list(const char *str, char **list, int list_size) +{ + for (int i = 0; i < list_size; ++i) + if (!strcmp(list[i], str)) + return true; + return false; +} + static bool is_need(char *buf) { if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0) return false; - if ((filter & FILTER_PID) && get_pid(buf) != fc.pid) + if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) return false; - if ((filter & FILTER_TGID) && get_tgid(buf) != fc.tgid) + if ((filter & FILTER_TGID) && + !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) return false; char *comm = get_comm(buf); if ((filter & FILTER_COMM) && - strncmp(comm, fc.comm, TASK_COMM_LEN) != 0) { + !match_str_list(comm, fc.comms, fc.comms_size)) { free(comm); return false; } + free(comm); return true; } @@ -428,6 +447,27 @@ static bool parse_cull_args(const char *arg_str) return true; } +static int *parse_nums_list(char *arg_str, int *list_size) +{ + int size = 0; + char **args = explode(',', arg_str, &size); + int *list = calloc(size, sizeof(int)); + + errno = 0; + for (int i = 0; i < size; ++i) { + char *endptr = NULL; + + list[i] = strtol(args[i], &endptr, 10); + if (errno != 0 || endptr == args[i] || *endptr != '\0') { + free(list); + return NULL; + } + } + *list_size = size; + free_explode(args, size); + return list; +} + #define BUF_SIZE (128 * 1024) static void usage(void) @@ -442,9 +482,9 @@ static void usage(void) "-a\t\tSort by memory allocate time.\n" "-r\t\tSort by memory release time.\n" "-f\t\tFilter out the information of blocks whose memory has been released.\n" - "--pid \tSelect by pid. This selects the information of blocks whose process ID number equals to .\n" - "--tgid \tSelect by tgid. This selects the information of blocks whose Thread Group ID number equals to .\n" - "--name \n\t\tSelect by command name. This selects the information of blocks whose command name identical to .\n" + "--pid \tSelect by pid. This selects the information of blocks whose process ID numbers appear in .\n" + "--tgid \tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in .\n" + "--name \n\t\tSelect by command name. This selects the information of blocks whose command name appears in .\n" "--cull \tCull by user-defined rules. 
is a single argument in the form of a comma-separated list with some common fields predefined\n" ); } @@ -453,7 +493,7 @@ int main(int argc, char **argv) { int (*cmp)(const void *, const void *) = compare_num; FILE *fin, *fout; - char *buf, *endptr; + char *buf; int ret, i, count; struct stat st; int opt; @@ -496,9 +536,8 @@ int main(int argc, char **argv) break; case 1: filter = filter | FILTER_PID; - errno = 0; - fc.pid = strtol(optarg, &endptr, 10); - if (errno != 0 || endptr == optarg || *endptr != '\0') { + fc.pids = parse_nums_list(optarg, &fc.pids_size); + if (fc.pids == NULL) { fprintf(stderr, "wrong/invalid pid in from the command line:%s\n", optarg); exit(1); @@ -506,9 +545,8 @@ int main(int argc, char **argv) break; case 2: filter = filter | FILTER_TGID; - errno = 0; - fc.tgid = strtol(optarg, &endptr, 10); - if (errno != 0 || endptr == optarg || *endptr != '\0') { + fc.tgids = parse_nums_list(optarg, &fc.tgids_size); + if (fc.tgids == NULL) { fprintf(stderr, "wrong/invalid tgid in from the command line:%s\n", optarg); exit(1); @@ -516,8 +554,7 @@ int main(int argc, char **argv) break; case 3: filter = filter | FILTER_COMM; - strncpy(fc.comm, optarg, TASK_COMM_LEN); - fc.comm[TASK_COMM_LEN-1] = '\0'; + fc.comms = explode(',', optarg, &fc.comms_size); break; case 4: if (!parse_cull_args(optarg)) { @@ -564,7 +601,6 @@ int main(int argc, char **argv) ret = read_block(buf, BUF_SIZE, fin); if (ret < 0) break; - add_list(buf, ret); } -- cgit v1.2.3 From ebbeae36387ccf1326c896167872c3acf6c3c956 Mon Sep 17 00:00:00 2001 From: Jiajian Ye Date: Thu, 28 Apr 2022 23:15:57 -0700 Subject: tools/vm/page_owner_sort.c: support sorting blocks by multiple keys When viewing page owner information, we may want to sort blocks of information by multiple keys, since one single key does not uniquely identify a block. Therefore, following adjustments are made: 1. Add a new --sort option to support sorting blocks of information by multiple keys. ./page_owner_sort --sort= ./page_owner_sort --sort is a single argument in the form of a comma-separated list, which offers a way to specify sorting order. Sorting syntax is [+|-]key[,[+|-]key[,...]]. The ascending or descending order can be specified by adding the + (ascending, default) or - (descend -ing) prefix to the key: ./page_owner_sort [option] --sort -key1,+key2,key3... For example, to sort the blocks first by task command name in lexicographic order and then by pid in ascending numerical order, use the following: ./page_owner_sort --sort=name,+pid To sort the blocks first by pid in ascending order and then by timestamp of the page when it is allocated in descending order, use the following: ./page_owner_sort --sort=pid,-alloc_ts 2. Add explanations of a newly added --sort option in the function usage() and the document(Documentation/vm/page_owner.rst). 
This work is coauthored by Yixuan Cao Shenghong Han Yinan Zhang Chongxi Zhao Yuhong Feng Yongqiang Liu Link: https://lkml.kernel.org/r/20220401024856.767-3-yejiajian2018@email.szu.edu.cn Signed-off-by: Jiajian Ye Cc: Chongxi Zhao Cc: Shenghong Han Cc: Yinan Zhang Cc: Yixuan Cao Cc: Yongqiang Liu Cc: Yuhong Feng Cc: Haowen Bai Cc: Sean Anderson Signed-off-by: Andrew Morton --- Documentation/vm/page_owner.rst | 24 +++++- tools/vm/page_owner_sort.c | 164 ++++++++++++++++++++++++++++++++++------ 2 files changed, 165 insertions(+), 23 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 3102f91d635c..523bf3419512 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -121,6 +121,14 @@ Usage -r Sort by memory release time. -s Sort by stack trace. -t Sort by times (default). + --sort Specify sorting order. Sorting syntax is [+|-]key[,[+|-]key[,...]]. + Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is + optional since default direction is increasing numerical or lexicographic + order. Mixed use of abbreviated and complete-form of keys is allowed. + + Examples: + ./page_owner_sort --sort=n,+pid,-tgid + ./page_owner_sort --sort=at additional function:: @@ -165,9 +173,23 @@ STANDARD FORMAT SPECIFIERS ========================== :: +For --sort option: + + KEY LONG DESCRIPTION + p pid process ID + tg tgid thread group ID + n name task command name + st stacktrace stack trace of the page allocation + T txt full text of block + ft free_ts timestamp of the page when it was released + at alloc_ts timestamp of the page when it was allocated + +For --curl option: + KEY LONG DESCRIPTION p pid process ID tg tgid thread group ID n name task command name f free whether the page has been released or not - st stacktrace stace trace of the page allocation + st stacktrace stack trace of the page allocation + diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 16fb034c6a4e..beca990707fb 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -53,15 +53,29 @@ enum CULL_BIT { CULL_COMM = 1<<4, CULL_STACKTRACE = 1<<5 }; +enum ARG_TYPE { + ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS, + ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE +}; +enum SORT_ORDER { + SORT_ASC = 1, + SORT_DESC = -1, +}; struct filter_condition { - pid_t *tgids; - int tgids_size; pid_t *pids; - int pids_size; + pid_t *tgids; char **comms; + int pids_size; + int tgids_size; int comms_size; }; +struct sort_condition { + int (**cmps)(const void *, const void *); + int *signs; + int size; +}; static struct filter_condition fc; +static struct sort_condition sc; static regex_t order_pattern; static regex_t pid_pattern; static regex_t tgid_pattern; @@ -107,14 +121,14 @@ static int compare_num(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; - return l2->num - l1->num; + return l1->num - l2->num; } static int compare_page_num(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; - return l2->page_num - l1->page_num; + return l1->page_num - l2->page_num; } static int compare_pid(const void *p1, const void *p2) @@ -180,6 +194,16 @@ static int compare_cull_condition(const void *p1, const void *p2) return 0; } +static int compare_sort_condition(const void *p1, const void *p2) +{ + int cmp = 0; + + for (int i = 0; i < sc.size; ++i) + if (cmp == 0) + cmp = sc.signs[i] * sc.cmps[i](p1, p2); + return cmp; +} + static int 
search_pattern(regex_t *pattern, char *pattern_str, char *buf) { int err, val_len; @@ -345,6 +369,29 @@ static char *get_comm(char *buf) return comm_str; } +static int get_arg_type(const char *arg) +{ + if (!strcmp(arg, "pid") || !strcmp(arg, "p")) + return ARG_PID; + else if (!strcmp(arg, "tgid") || !strcmp(arg, "tg")) + return ARG_TGID; + else if (!strcmp(arg, "name") || !strcmp(arg, "n")) + return ARG_COMM; + else if (!strcmp(arg, "stacktrace") || !strcmp(arg, "st")) + return ARG_STACKTRACE; + else if (!strcmp(arg, "free") || !strcmp(arg, "f")) + return ARG_FREE; + else if (!strcmp(arg, "txt") || !strcmp(arg, "T")) + return ARG_TXT; + else if (!strcmp(arg, "free_ts") || !strcmp(arg, "ft")) + return ARG_FREE_TS; + else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at")) + return ARG_ALLOC_TS; + else { + return ARG_UNKNOWN; + } +} + static bool match_num_list(int num, int *list, int list_size) { for (int i = 0; i < list_size; ++i) @@ -428,21 +475,86 @@ static bool parse_cull_args(const char *arg_str) int size = 0; char **args = explode(',', arg_str, &size); - for (int i = 0; i < size; ++i) - if (!strcmp(args[i], "pid") || !strcmp(args[i], "p")) + for (int i = 0; i < size; ++i) { + int arg_type = get_arg_type(args[i]); + + if (arg_type == ARG_PID) cull |= CULL_PID; - else if (!strcmp(args[i], "tgid") || !strcmp(args[i], "tg")) + else if (arg_type == ARG_TGID) cull |= CULL_TGID; - else if (!strcmp(args[i], "name") || !strcmp(args[i], "n")) + else if (arg_type == ARG_COMM) cull |= CULL_COMM; - else if (!strcmp(args[i], "stacktrace") || !strcmp(args[i], "st")) + else if (arg_type == ARG_STACKTRACE) cull |= CULL_STACKTRACE; - else if (!strcmp(args[i], "free") || !strcmp(args[i], "f")) + else if (arg_type == ARG_FREE) cull |= CULL_UNRELEASE; else { free_explode(args, size); return false; } + } + free_explode(args, size); + return true; +} + +static void set_single_cmp(int (*cmp)(const void *, const void *), int sign) +{ + if (sc.signs == NULL || sc.size < 1) + sc.signs = calloc(1, sizeof(int)); + sc.signs[0] = sign; + if (sc.cmps == NULL || sc.size < 1) + sc.cmps = calloc(1, sizeof(int *)); + sc.cmps[0] = cmp; + sc.size = 1; +} + +static bool parse_sort_args(const char *arg_str) +{ + int size = 0; + + if (sc.size != 0) { /* reset sort_condition */ + free(sc.signs); + free(sc.cmps); + size = 0; + } + + char **args = explode(',', arg_str, &size); + + sc.signs = calloc(size, sizeof(int)); + sc.cmps = calloc(size, sizeof(int *)); + for (int i = 0; i < size; ++i) { + int offset = 0; + + sc.signs[i] = SORT_ASC; + if (args[i][0] == '-' || args[i][0] == '+') { + if (args[i][0] == '-') + sc.signs[i] = SORT_DESC; + offset = 1; + } + + int arg_type = get_arg_type(args[i]+offset); + + if (arg_type == ARG_PID) + sc.cmps[i] = compare_pid; + else if (arg_type == ARG_TGID) + sc.cmps[i] = compare_tgid; + else if (arg_type == ARG_COMM) + sc.cmps[i] = compare_comm; + else if (arg_type == ARG_STACKTRACE) + sc.cmps[i] = compare_stacktrace; + else if (arg_type == ARG_ALLOC_TS) + sc.cmps[i] = compare_ts; + else if (arg_type == ARG_FREE_TS) + sc.cmps[i] = compare_free_ts; + else if (arg_type == ARG_TXT) + sc.cmps[i] = compare_txt; + else { + free_explode(args, size); + sc.size = 0; + return false; + } + } + sc.size = size; free_explode(args, size); return true; } @@ -485,13 +597,13 @@ static void usage(void) "--pid \tSelect by pid. This selects the information of blocks whose process ID numbers appear in .\n" "--tgid \tSelect by tgid. 
This selects the information of blocks whose Thread Group ID numbers appear in .\n" "--name \n\t\tSelect by command name. This selects the information of blocks whose command name appears in .\n" - "--cull \tCull by user-defined rules. is a single argument in the form of a comma-separated list with some common fields predefined\n" + "--cull \tCull by user-defined rules. is a single argument in the form of a comma-separated list with some common fields predefined\n" + "--sort \tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n" ); } int main(int argc, char **argv) { - int (*cmp)(const void *, const void *) = compare_num; FILE *fin, *fout; char *buf; int ret, i, count; @@ -502,37 +614,38 @@ int main(int argc, char **argv) { "tgid", required_argument, NULL, 2 }, { "name", required_argument, NULL, 3 }, { "cull", required_argument, NULL, 4 }, + { "sort", required_argument, NULL, 5 }, { 0, 0, 0, 0}, }; while ((opt = getopt_long(argc, argv, "afmnprstP", longopts, NULL)) != -1) switch (opt) { case 'a': - cmp = compare_ts; + set_single_cmp(compare_ts, SORT_ASC); break; case 'f': filter = filter | FILTER_UNRELEASE; break; case 'm': - cmp = compare_page_num; + set_single_cmp(compare_page_num, SORT_DESC); break; case 'p': - cmp = compare_pid; + set_single_cmp(compare_pid, SORT_ASC); break; case 'r': - cmp = compare_free_ts; + set_single_cmp(compare_free_ts, SORT_ASC); break; case 's': - cmp = compare_stacktrace; + set_single_cmp(compare_stacktrace, SORT_ASC); break; case 't': - cmp = compare_num; + set_single_cmp(compare_num, SORT_DESC); break; case 'P': - cmp = compare_tgid; + set_single_cmp(compare_tgid, SORT_ASC); break; case 'n': - cmp = compare_comm; + set_single_cmp(compare_comm, SORT_ASC); break; case 1: filter = filter | FILTER_PID; @@ -563,6 +676,13 @@ int main(int argc, char **argv) exit(1); } break; + case 5: + if (!parse_sort_args(optarg)) { + fprintf(stderr, "wrong argument after --sort option:%s\n", + optarg); + exit(1); + } + break; default: usage(); exit(1); @@ -622,7 +742,7 @@ int main(int argc, char **argv) } } - qsort(list, count, sizeof(list[0]), cmp); + qsort(list, count, sizeof(list[0]), compare_sort_condition); for (i = 0; i < count; i++) { if (cull == 0) -- cgit v1.2.3 From a72469aa593881c2a5ad3a38cfb3e7871c50f169 Mon Sep 17 00:00:00 2001 From: Haowen Bai Date: Thu, 28 Apr 2022 23:15:57 -0700 Subject: tools/vm/page_owner: support debug log to avoid huge log print As normal usage, tool will print huge parser log and spend a lot of time printing, so it would be preferable add "-d" debug control to avoid this problem. 
Link: https://lkml.kernel.org/r/1649672446-5685-1-git-send-email-baihaowen@meizu.com Signed-off-by: Haowen Bai Cc: Chongxi Zhao Cc: Jiajian Ye Cc: Shenghong Han Cc: Yinan Zhang Cc: Yixuan Cao Cc: Yongqiang Liu Cc: Yuhong Feng Cc: Sean Anderson Signed-off-by: Andrew Morton --- tools/vm/page_owner_sort.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index beca990707fb..a32e446e5bb2 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -87,6 +87,7 @@ static int list_size; static int max_size; static int cull; static int filter; +static bool debug_on; int read_block(char *buf, int buf_size, FILE *fin) { @@ -211,7 +212,8 @@ static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); if (err != 0 || pmatch[1].rm_so == -1) { - fprintf(stderr, "no matching pattern in %s\n", buf); + if (debug_on) + fprintf(stderr, "no matching pattern in %s\n", buf); return -1; } val_len = pmatch[1].rm_eo - pmatch[1].rm_so; @@ -276,7 +278,8 @@ static int get_page_num(char *buf) errno = 0; order_val = strtol(order_str, &endptr, 10); if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') { - fprintf(stderr, "wrong order in follow buf:\n%s\n", buf); + if (debug_on) + fprintf(stderr, "wrong order in follow buf:\n%s\n", buf); return 0; } @@ -293,7 +296,8 @@ static pid_t get_pid(char *buf) errno = 0; pid = strtol(pid_str, &endptr, 10); if (errno != 0 || endptr == pid_str || *endptr != '\0') { - fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf); + if (debug_on) + fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf); return -1; } @@ -311,7 +315,8 @@ static pid_t get_tgid(char *buf) errno = 0; tgid = strtol(tgid_str, &endptr, 10); if (errno != 0 || endptr == tgid_str || *endptr != '\0') { - fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf); + if (debug_on) + fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf); return -1; } @@ -329,7 +334,8 @@ static __u64 get_ts_nsec(char *buf) errno = 0; ts_nsec = strtoull(ts_nsec_str, &endptr, 10); if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') { - fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf); + if (debug_on) + fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf); return -1; } @@ -346,7 +352,8 @@ static __u64 get_free_ts_nsec(char *buf) errno = 0; free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { - fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf); + if (debug_on) + fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf); return -1; } @@ -362,7 +369,8 @@ static char *get_comm(char *buf) search_pattern(&comm_pattern, comm_str, buf); errno = 0; if (errno != 0) { - fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf); + if (debug_on) + fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf); return NULL; } @@ -594,6 +602,7 @@ static void usage(void) "-a\t\tSort by memory allocate time.\n" "-r\t\tSort by memory release time.\n" "-f\t\tFilter out the information of blocks whose memory has been released.\n" + "-d\t\tPrint debug information.\n" "--pid \tSelect by pid. This selects the information of blocks whose process ID numbers appear in .\n" "--tgid \tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in .\n" "--name \n\t\tSelect by command name. 
This selects the information of blocks whose command name appears in .\n" @@ -618,11 +627,14 @@ int main(int argc, char **argv) { 0, 0, 0, 0}, }; - while ((opt = getopt_long(argc, argv, "afmnprstP", longopts, NULL)) != -1) + while ((opt = getopt_long(argc, argv, "adfmnprstP", longopts, NULL)) != -1) switch (opt) { case 'a': set_single_cmp(compare_ts, SORT_ASC); break; + case 'd': + debug_on = true; + break; case 'f': filter = filter | FILTER_UNRELEASE; break; -- cgit v1.2.3 From f09654bb88127473b4baf3bc0b68d4d4695aca7b Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Thu, 28 Apr 2022 23:15:57 -0700 Subject: tools/vm/page_owner_sort.c: provide allocator labelling and update --cull and --sort options An application is suspected of having a memory leak when its memory consumption is high and keeps increasing. There are several commonly used memory allocators: slab, cma, vmalloc, etc. The memory leak identification can be sped up if the page information allocated by an allocator can be analyzed separately. This patch provides support for memory allocator labelling for slab, vmalloc, and cma. The pages allocated by slab and cma can be confirmed from the "PFN" line according to the kernel code, and the label of the vmalloc allocator can be obtained by analyzing the stack trace. Thanks to Vlastimil Babka for his constructive suggestions. Based on Yinan Zhang's study, the call chain of vmalloc() is vmalloc() -> ... -> __vmalloc_node_range() -> __vmalloc_area_node(). __vmalloc_area_node() requests memory through the interfaces of the buddy allocation system. In the current version, __vmalloc_area_node() uses four interfaces: alloc_pages_bulk_array_mempolicy(), alloc_pages_bulk_array_node(), alloc_pages() and alloc_pages_node(). By disassembling the code, we find that __vmalloc_area_node() is inlined into __vmalloc_node_range(), so __vmalloc_area_node() is not in the stack trace. On the test machine, the stack trace of pages allocated by vmalloc has the following four forms: __alloc_pages_bulk+0x230/0x6a0 __vmalloc_node_range+0x19c/0x598 alloc_pages_bulk_array_mempolicy+0xbc/0x278 __vmalloc_node_range+0x1e8/0x598 __alloc_pages+0x160/0x2b0 __vmalloc_node_range+0x234/0x598 alloc_pages+0xac/0x150 __vmalloc_node_range+0x44c/0x598 Therefore, in two consecutive lines of a stack trace, if the first line contains the word "alloc_pages" and the second line contains the word "__vmalloc_node_range", it can be determined that the page was allocated by vmalloc (see the standalone sketch below). The function offset and size are not the same on different machines, so there is no need to match them. At the same time, this patch updates the --cull and --sort options to support allocator-based merge statistics and sorting. The added functionality is fully compatible with the original work. You can use "allocator", or its abbreviation "ator". Relevant updates have also been made in the documentation (Documentation/vm/page_owner.rst). Example: ./page_owner_sort --cull=st,pid,name,allocator ./page_owner_sort --sort=ator,pid,name This work is coauthored by Jiajian Ye, Yinan Zhang, Shenghong Han, Chongxi Zhao, Yuhong Feng and Yongqiang Liu. 
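The stack-trace heuristic described above, shown in isolation as a small standalone program; the patch implements it inside get_allocator() in the diff below, and looks_like_vmalloc() with its sample trace is illustrative only:

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  /* A page came from vmalloc if a stack line containing "alloc_pages" is
   * immediately followed by one containing "__vmalloc_node_range"; the
   * offsets and sizes are ignored on purpose.
   */
  static bool looks_like_vmalloc(char *stacktrace)
  {
          char *saveptr = NULL;
          char *prev = NULL;

          for (char *line = strtok_r(stacktrace, "\n", &saveptr); line;
               line = strtok_r(NULL, "\n", &saveptr)) {
                  if (prev && strstr(prev, "alloc_pages") &&
                      strstr(line, "__vmalloc_node_range"))
                          return true;
                  prev = line;
          }
          return false;
  }

  int main(void)
  {
          char trace[] =
                  " __alloc_pages_bulk+0x230/0x6a0\n"
                  " __vmalloc_node_range+0x19c/0x598\n"
                  " vmalloc+0x2c/0x38\n";

          printf("%s\n", looks_like_vmalloc(trace) ? "vmalloc" : "other"); /* vmalloc */
          return 0;
  }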
Link: https://lkml.kernel.org/r/20220410132932.9402-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Cc: Chongxi Zhao Cc: Haowen Bai Cc: Jiajian Ye Cc: Sean Anderson Cc: Shenghong Han Cc: Yinan Zhang Cc: Yongqiang Liu Cc: Yuhong Feng Signed-off-by: Andrew Morton --- Documentation/vm/page_owner.rst | 3 +- tools/vm/page_owner_sort.c | 112 ++++++++++++++++++++++++++++++++++------ 2 files changed, 99 insertions(+), 16 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 523bf3419512..25622c715823 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -183,6 +183,7 @@ For --sort option: T txt full text of block ft free_ts timestamp of the page when it was released at alloc_ts timestamp of the page when it was allocated + ator allocator memory allocator for pages For --curl option: @@ -192,4 +193,4 @@ For --curl option: n name task command name f free whether the page has been released or not st stacktrace stack trace of the page allocation - + ator allocator memory allocator for pages diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index a32e446e5bb2..fa2e4d2a9d68 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -39,6 +39,7 @@ struct block_list { int page_num; pid_t pid; pid_t tgid; + int allocator; }; enum FILTER_BIT { FILTER_UNRELEASE = 1<<1, @@ -51,11 +52,19 @@ enum CULL_BIT { CULL_PID = 1<<2, CULL_TGID = 1<<3, CULL_COMM = 1<<4, - CULL_STACKTRACE = 1<<5 + CULL_STACKTRACE = 1<<5, + CULL_ALLOCATOR = 1<<6 +}; +enum ALLOCATOR_BIT { + ALLOCATOR_CMA = 1<<1, + ALLOCATOR_SLAB = 1<<2, + ALLOCATOR_VMALLOC = 1<<3, + ALLOCATOR_OTHERS = 1<<4 }; enum ARG_TYPE { ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS, - ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE + ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE, + ARG_ALLOCATOR }; enum SORT_ORDER { SORT_ASC = 1, @@ -89,15 +98,20 @@ static int cull; static int filter; static bool debug_on; -int read_block(char *buf, int buf_size, FILE *fin) +static void set_single_cmp(int (*cmp)(const void *, const void *), int sign); + +int read_block(char *buf, char *ext_buf, int buf_size, FILE *fin) { char *curr = buf, *const buf_end = buf + buf_size; while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) { - if (*curr == '\n') /* empty line */ + if (*curr == '\n') { /* empty line */ return curr - buf; - if (!strncmp(curr, "PFN", 3)) + } + if (!strncmp(curr, "PFN", 3)) { + strcpy(ext_buf, curr); continue; + } curr += strlen(curr); } @@ -146,6 +160,13 @@ static int compare_tgid(const void *p1, const void *p2) return l1->tgid - l2->tgid; } +static int compare_allocator(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->allocator - l2->allocator; +} + static int compare_comm(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; @@ -192,6 +213,8 @@ static int compare_cull_condition(const void *p1, const void *p2) return compare_comm(p1, p2); if ((cull & CULL_UNRELEASE) && compare_release(p1, p2)) return compare_release(p1, p2); + if ((cull & CULL_ALLOCATOR) && compare_allocator(p1, p2)) + return compare_allocator(p1, p2); return 0; } @@ -395,11 +418,42 @@ static int get_arg_type(const char *arg) return ARG_FREE_TS; else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at")) return ARG_ALLOC_TS; + else if (!strcmp(arg, "allocator") || !strcmp(arg, "ator")) + return ARG_ALLOCATOR; else { return ARG_UNKNOWN; } } +static 
int get_allocator(const char *buf, const char *migrate_info) +{ + char *tmp, *first_line, *second_line; + int allocator = 0; + + if (strstr(migrate_info, "CMA")) + allocator |= ALLOCATOR_CMA; + if (strstr(migrate_info, "slab")) + allocator |= ALLOCATOR_SLAB; + tmp = strstr(buf, "__vmalloc_node_range"); + if (tmp) { + second_line = tmp; + while (*tmp != '\n') + tmp--; + tmp--; + while (*tmp != '\n') + tmp--; + first_line = ++tmp; + tmp = strstr(tmp, "alloc_pages"); + if (tmp) { + if (tmp && first_line <= tmp && tmp < second_line) + allocator |= ALLOCATOR_VMALLOC; + } + } + if (allocator == 0) + allocator = ALLOCATOR_OTHERS; + return allocator; +} + static bool match_num_list(int num, int *list, int list_size) { for (int i = 0; i < list_size; ++i) @@ -437,7 +491,7 @@ static bool is_need(char *buf) return true; } -static void add_list(char *buf, int len) +static void add_list(char *buf, int len, char *ext_buf) { if (list_size != 0 && len == list[list_size-1].len && @@ -471,6 +525,7 @@ static void add_list(char *buf, int len) list[list_size].stacktrace++; list[list_size].ts_nsec = get_ts_nsec(buf); list[list_size].free_ts_nsec = get_free_ts_nsec(buf); + list[list_size].allocator = get_allocator(buf, ext_buf); list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); @@ -496,12 +551,16 @@ static bool parse_cull_args(const char *arg_str) cull |= CULL_STACKTRACE; else if (arg_type == ARG_FREE) cull |= CULL_UNRELEASE; + else if (arg_type == ARG_ALLOCATOR) + cull |= CULL_ALLOCATOR; else { free_explode(args, size); return false; } } free_explode(args, size); + if (sc.size == 0) + set_single_cmp(compare_num, SORT_DESC); return true; } @@ -556,6 +615,8 @@ static bool parse_sort_args(const char *arg_str) sc.cmps[i] = compare_free_ts; else if (arg_type == ARG_TXT) sc.cmps[i] = compare_txt; + else if (arg_type == ARG_ALLOCATOR) + sc.cmps[i] = compare_allocator; else { free_explode(args, size); sc.size = 0; @@ -588,6 +649,19 @@ static int *parse_nums_list(char *arg_str, int *list_size) return list; } +static void print_allocator(FILE *out, int allocator) +{ + fprintf(out, "allocated by "); + if (allocator & ALLOCATOR_CMA) + fprintf(out, "CMA "); + if (allocator & ALLOCATOR_SLAB) + fprintf(out, "SLAB "); + if (allocator & ALLOCATOR_VMALLOC) + fprintf(out, "VMALLOC "); + if (allocator & ALLOCATOR_OTHERS) + fprintf(out, "OTHERS "); +} + #define BUF_SIZE (128 * 1024) static void usage(void) @@ -614,8 +688,8 @@ static void usage(void) int main(int argc, char **argv) { FILE *fin, *fout; - char *buf; - int ret, i, count; + char *buf, *ext_buf; + int i, count; struct stat st; int opt; struct option longopts[] = { @@ -724,16 +798,18 @@ int main(int argc, char **argv) list = malloc(max_size * sizeof(*list)); buf = malloc(BUF_SIZE); - if (!list || !buf) { + ext_buf = malloc(BUF_SIZE); + if (!list || !buf || !ext_buf) { fprintf(stderr, "Out of memory\n"); exit(1); } for ( ; ; ) { - ret = read_block(buf, BUF_SIZE, fin); - if (ret < 0) + int buf_len = read_block(buf, ext_buf, BUF_SIZE, fin); + + if (buf_len < 0) break; - add_list(buf, ret); + add_list(buf, buf_len, ext_buf); } printf("loaded %d\n", list_size); @@ -757,9 +833,11 @@ int main(int argc, char **argv) qsort(list, count, sizeof(list[0]), compare_sort_condition); for (i = 0; i < count; i++) { - if (cull == 0) - fprintf(fout, "%d times, %d pages:\n%s\n", - list[i].num, list[i].page_num, list[i].txt); + if (cull == 0) { + fprintf(fout, "%d times, %d pages, ", list[i].num, list[i].page_num); + print_allocator(fout, list[i].allocator); + 
fprintf(fout, ":\n%s\n", list[i].txt); + } else { fprintf(fout, "%d times, %d pages", list[i].num, list[i].page_num); @@ -769,6 +847,10 @@ int main(int argc, char **argv) fprintf(fout, ", TGID %d", list[i].pid); if (cull & CULL_COMM || filter & FILTER_COMM) fprintf(fout, ", task_comm_name: %s", list[i].comm); + if (cull & CULL_ALLOCATOR) { + fprintf(fout, ", "); + print_allocator(fout, list[i].allocator); + } if (cull & CULL_UNRELEASE) fprintf(fout, " (%s)", list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED"); -- cgit v1.2.3 From c7c4ab859642830a14c45785ca7866659b65fc44 Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Thu, 28 Apr 2022 23:15:57 -0700 Subject: tools/vm/page_owner_sort.c: avoid repeated judgments When judging whether a page is allocated by vmalloc, the value of the variable "tmp" was checked twice in a row, so the redundant check is removed. This work is coauthored by Yinan Zhang, Jiajian Ye, Shenghong Han, Chongxi Zhao, Yuhong Feng and Yongqiang Liu. Link: https://lkml.kernel.org/r/20220414042744.13896-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Cc: Chongxi Zhao Cc: Haowen Bai Cc: Jiajian Ye Cc: Sean Anderson Cc: Shenghong Han Cc: Yinan Zhang Cc: Yongqiang Liu Cc: Yuhong Feng Signed-off-by: Andrew Morton --- tools/vm/page_owner_sort.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index fa2e4d2a9d68..c149427eb1c9 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -444,10 +444,8 @@ static int get_allocator(const char *buf, const char *migrate_info) tmp--; first_line = ++tmp; tmp = strstr(tmp, "alloc_pages"); - if (tmp) { - if (tmp && first_line <= tmp && tmp < second_line) - allocator |= ALLOCATOR_VMALLOC; - } + if (tmp && first_line <= tmp && tmp < second_line) + allocator |= ALLOCATOR_VMALLOC; } if (allocator == 0) allocator = ALLOCATOR_OTHERS; -- cgit v1.2.3 From 21f0dd88f23dc9dc46b781f8ec9acf975dca4e6e Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Thu, 28 Apr 2022 23:15:57 -0700 Subject: mm: rework calculation of bdi_min_ratio in bdi_set_min_ratio In bdi_set_min_ratio(), min_ratio is an unsigned int, so the subtraction underflows when min_ratio is set below bdi->min_ratio, which is confusing. Rework the calculation; no functional change. 
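For illustration only (not part of the patch), a tiny standalone program showing why the old open-coded subtraction on an unsigned int was confusing:

  #include <stdio.h>

  int main(void)
  {
          unsigned int current_min = 10; /* stands in for bdi->min_ratio */
          unsigned int min_ratio = 5;    /* new, smaller value being requested */

          /* The old code subtracted unconditionally before range-checking. */
          min_ratio -= current_min;

          /* 5 - 10 wraps around: prints 4294967291 with a 32-bit unsigned int. */
          printf("%u\n", min_ratio);
          return 0;
  }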
Link: https://lkml.kernel.org/r/20220422095159.2858305-1-chenwandun@huawei.com Signed-off-by: Chen Wandun Cc: Peter Zijlstra Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/page-writeback.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e2da284e427..9f459f7f8f83 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -650,18 +650,25 @@ static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { + unsigned int delta; int ret = 0; spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { - min_ratio -= bdi->min_ratio; - if (bdi_min_ratio + min_ratio < 100) { - bdi_min_ratio += min_ratio; - bdi->min_ratio += min_ratio; + if (min_ratio < bdi->min_ratio) { + delta = bdi->min_ratio - min_ratio; + bdi_min_ratio -= delta; + bdi->min_ratio = min_ratio; } else { - ret = -EINVAL; + delta = min_ratio - bdi->min_ratio; + if (bdi_min_ratio + delta < 100) { + bdi_min_ratio += delta; + bdi->min_ratio = min_ratio; + } else { + ret = -EINVAL; + } } } spin_unlock_bh(&bdi_lock); -- cgit v1.2.3 From 9096bbe951ddd3f9dc813e73b5bde6d5715d1cdb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:15:58 -0700 Subject: mm: shmem: make shmem_init return void The return value of shmem_init is never used. So we can make it return void now. [akpm@linux-foundation.org: remove `return;' from void-returning function, per Muchun Song] Link: https://lkml.kernel.org/r/20220328112707.22217-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Muchun Song Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 2 +- mm/shmem.c | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ab51d3cd39bd..3e915cc550bc 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -56,7 +56,7 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) * Functions in mm/shmem.c called directly from elsewhere: */ extern const struct fs_parameter_spec shmem_fs_parameters[]; -extern int shmem_init(void); +extern void shmem_init(void); extern int shmem_init_fs_context(struct fs_context *fc); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); diff --git a/mm/shmem.c b/mm/shmem.c index 4b2fea33158e..e504d734177b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3888,7 +3888,7 @@ static struct file_system_type shmem_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -int __init shmem_init(void) +void __init shmem_init(void) { int error; @@ -3913,14 +3913,13 @@ int __init shmem_init(void) else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ #endif - return 0; + return; out1: unregister_filesystem(&shmem_fs_type); out2: shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); - return error; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) @@ -3998,14 +3997,12 @@ static struct file_system_type shmem_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -int __init shmem_init(void) +void __init shmem_init(void) { BUG_ON(register_filesystem(&shmem_fs_type) != 0); shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); - - return 0; } int shmem_unuse(unsigned int type) -- cgit v1.2.3 From d8f653386cb571b46e9ce7d9681d6dc4eae441d6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:15:58 -0700 Subject: mm/memcg: remove unneeded nr_scanned The local variable nr_scanned is unneeded as 
mem_cgroup_soft_reclaim always does *total_scanned += nr_scanned. So we can pass total_scanned directly to the mem_cgroup_soft_reclaim to simplify the code and save some cpu cycles of adding nr_scanned to total_scanned. Link: https://lkml.kernel.org/r/20220328114144.53389-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Reviewed-by: Roman Gushchin Reviewed-by: Wei Yang Signed-off-by: Andrew Morton --- mm/memcontrol.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 598fece89e2b..09df048c5aee 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3387,7 +3387,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, int loop = 0; struct mem_cgroup_tree_per_node *mctz; unsigned long excess; - unsigned long nr_scanned; if (order > 0) return 0; @@ -3415,11 +3414,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, if (!mz) break; - nr_scanned = 0; reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, - gfp_mask, &nr_scanned); + gfp_mask, total_scanned); nr_reclaimed += reclaimed; - *total_scanned += nr_scanned; spin_lock_irq(&mctz->lock); __mem_cgroup_remove_exceeded(mz, mctz); -- cgit v1.2.3 From 391e0efc15e9919cd74a5c29255441e1f643dcae Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:15:58 -0700 Subject: mm/memcg: mz already removed from rb_tree if not NULL When mz is not NULL, it means mz can either come from mem_cgroup_largest_soft_limit_node or __mem_cgroup_largest_soft_limit_node. And both of them have removed this node by __mem_cgroup_remove_exceeded(). Not necessary to call __mem_cgroup_remove_exceeded() again. [mhocko@suse.com: refine changelog] Link: https://lkml.kernel.org/r/20220314233030.12334-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09df048c5aee..5c3e2e3d27ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3418,7 +3418,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_mask, total_scanned); nr_reclaimed += reclaimed; spin_lock_irq(&mctz->lock); - __mem_cgroup_remove_exceeded(mz, mctz); /* * If we failed to reclaim anything from this memory cgroup -- cgit v1.2.3 From 41555dadbff8d2558d868110e47c24b747aad224 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:15:58 -0700 Subject: mm/memcg: set memcg after css verified and got reference Patch series "mm/memcg: some cleanup for mem_cgroup_iter()", v2. No functional change, try to make it more readable. This patch (of 3): Instead of resetting memcg when css is either not verified or not got reference, we can set it after these process. No functional change, just simplified the code a little. 
Link: https://lkml.kernel.org/r/20220330234719.18340-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20220330234719.18340-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c3e2e3d27ac..f06b5c249ecb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1065,15 +1065,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, * is provided by the caller, so we know it's alive * and kicking, and don't take an extra reference. */ - memcg = mem_cgroup_from_css(css); - - if (css == &root->css) - break; - - if (css_tryget(css)) + if (css == &root->css || css_tryget(css)) { + memcg = mem_cgroup_from_css(css); break; - - memcg = NULL; + } } if (reclaim) { -- cgit v1.2.3 From 89d8330ccf2ad4c0744e1d1b77ffe2deb1641e54 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:15:58 -0700 Subject: mm/memcg: set pos explicitly for reclaim and !reclaim During mem_cgroup_iter, there are two ways to get iteration position: reclaim vs non-reclaim mode. Let's do it explicitly for reclaim vs non-reclaim mode. Link: https://lkml.kernel.org/r/20220330234719.18340-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f06b5c249ecb..1b76d5698900 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1013,9 +1013,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (!root) root = root_mem_cgroup; - if (prev && !reclaim) - pos = prev; - rcu_read_lock(); if (reclaim) { @@ -1041,6 +1038,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, */ (void)cmpxchg(&iter->position, pos, NULL); } + } else if (prev) { + pos = prev; } if (pos) -- cgit v1.2.3 From a9320aae68a1cd3f41b9846e24504b09ffc3311e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:15:59 -0700 Subject: mm/memcg: move generation assignment and comparison together For each round-trip, we assign generation on first invocation and compare it on subsequent invocations. Let's move them together to make it more self-explaining. Also this reduce a check on prev. [hannes@cmpxchg.org: better comment to explain reclaim model] Link: https://lkml.kernel.org/r/20220330234719.18340-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b76d5698900..1ae0a2afe476 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1021,7 +1021,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = root->nodeinfo[reclaim->pgdat->node_id]; iter = &mz->iter; - if (prev && reclaim->generation != iter->generation) + /* + * On start, join the current reclaim iteration cycle. + * Exit when a concurrent walker completes it. 
+ */ + if (!prev) + reclaim->generation = iter->generation; + else if (reclaim->generation != iter->generation) goto out_unlock; while (1) { @@ -1083,8 +1089,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (!memcg) iter->generation++; - else if (!prev) - reclaim->generation = iter->generation; } out_unlock: -- cgit v1.2.3 From c449d5599287742f11b0f2ae34b921dc0837d1c6 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:15:59 -0700 Subject: mm/memcg: non-hierarchical mode is deprecated After commit bef8620cd8e0 ("mm: memcg: deprecate the non-hierarchical mode"), we won't have a NULL parent except root_mem_cgroup. And this case is handled when (memcg == root). Link: https://lkml.kernel.org/r/20220403020833.26164-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1ae0a2afe476..b2d5c6618c8a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6587,9 +6587,6 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, return; parent = parent_mem_cgroup(memcg); - /* No parent means a non-hierarchical mode on v1 memcg */ - if (!parent) - return; if (parent == root) { memcg->memory.emin = READ_ONCE(memcg->memory.min); -- cgit v1.2.3 From c85bcc912f4f404bf6eaf4b6bdb8480ef2c2faa1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 28 Apr 2022 23:15:59 -0700 Subject: kselftests: memcg: update the oom group leaf events test Patch series "mm: memcg kselftests fixes". This patch (of 4): Commit 9852ae3fe529 ("mm, memcg: consider subtrees in memory.events") made memory.events recursive: all events are propagated upwards by the tree. It was a change in semantics. It broke the oom group leaf events test: it assumes that after an OOM the oom_kill counter is zero on parent's level. Let's adjust the test: it should have similar expectations for the child and parent levels. The test passes after this fix. Link: https://lkml.kernel.org/r/20220415000133.3955987-2-roman.gushchin@linux.dev Link: https://lkml.kernel.org/r/20220415000133.3955987-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Reviewed-by: David Vernet Cc: Chris Down Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 36ccf2322e21..00b430e7f2a2 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -1079,7 +1079,8 @@ cleanup: /* * This test disables swapping and tries to allocate anonymous memory * up to OOM with memory.group.oom set. Then it checks that all - * processes in the leaf (but not the parent) were killed. + * processes in the leaf were killed. It also checks that oom_events + * were propagated to the parent level. 
*/ static int test_memcg_oom_group_leaf_events(const char *root) { @@ -1122,7 +1123,7 @@ static int test_memcg_oom_group_leaf_events(const char *root) if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0) goto cleanup; - if (cg_read_key_long(parent, "memory.events", "oom_kill ") != 0) + if (cg_read_key_long(parent, "memory.events", "oom_kill ") <= 0) goto cleanup; ret = KSFT_PASS; -- cgit v1.2.3 From be74553f250fb2154375b8e14e9f9b58aafd23b0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 28 Apr 2022 23:15:59 -0700 Subject: kselftests: memcg: speed up the memory.high test After commit 0e4b01df8659 ("mm, memcg: throttle allocators when failing reclaim over memory.high") allocating memory over memory.high became very time consuming. But it's exactly what the memory.high test from cgroup kselftests is doing: it tries to allocate 100M with 30M memory.high value. It takes forever to complete. In order to keep it passing (or failing) in a reasonable amount of time let's try to allocate only a little over 30M: 31M to be precise. With this change test_memcontrol finishes in a reasonable amount of time: $ time ./test_memcontrol ok 1 test_memcg_subtree_control ok 2 test_memcg_current ok 3 test_memcg_min ok 4 test_memcg_low ok 5 test_memcg_high ok 6 test_memcg_max ok 7 test_memcg_oom_events ok 8 test_memcg_swap_max ok 9 test_memcg_sock ok 10 test_memcg_oom_group_leaf_events ok 11 test_memcg_oom_group_parent_events ok 12 test_memcg_oom_group_score_events real 0m2.273s user 0m0.064s sys 0m0.739s Link: https://lkml.kernel.org/r/20220415000133.3955987-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Reviewed-by: David Vernet Cc: Chris Down Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 00b430e7f2a2..9c1f19fe2e37 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -607,7 +607,7 @@ static int test_memcg_high(const char *root) if (cg_write(memcg, "memory.high", "30M")) goto cleanup; - if (cg_run(memcg, alloc_anon, (void *)MB(100))) + if (cg_run(memcg, alloc_anon, (void *)MB(31))) goto cleanup; if (!cg_run(memcg, alloc_pagecache_50M_check, NULL)) -- cgit v1.2.3 From 1bd1a4dd3e8ce8a23234a15c9dad4dcd257033fa Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 28 Apr 2022 23:15:59 -0700 Subject: MAINTAINERS: add corresponding kselftests to cgroup entry List cgroup kselftests in the cgroup MAINTAINERS entry. These are tests covering core, freezer and cgroup.kill functionality. 
Link: https://lkml.kernel.org/r/20220415000133.3955987-4-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: Chris Down Cc: David Vernet Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2647adf30569..66a7e5c661a5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4995,6 +4995,7 @@ F: Documentation/admin-guide/cgroup-v1/ F: Documentation/admin-guide/cgroup-v2.rst F: include/linux/cgroup* F: kernel/cgroup/ +F: tools/testing/selftests/cgroup/ CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO) M: Tejun Heo -- cgit v1.2.3 From 9c946e3e7f579081a2dd6912a8a23baedef8c3f9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 28 Apr 2022 23:16:00 -0700 Subject: MAINTAINERS: add corresponding kselftests to memcg entry List memory control and kernel memory control kselftests in the memory resource controller entry. Link: https://lkml.kernel.org/r/20220415000133.3955987-5-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Chris Down Cc: David Vernet Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 66a7e5c661a5..78c57046fa93 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5029,6 +5029,8 @@ L: linux-mm@kvack.org S: Maintained F: mm/memcontrol.c F: mm/swap_cgroup.c +F: tools/testing/selftests/cgroup/test_kmem.c +F: tools/testing/selftests/cgroup/test_memcontrol.c CORETEMP HARDWARE MONITORING DRIVER M: Fenghua Yu -- cgit v1.2.3 From ef7a4ffc4c7f969d61e297c53cb2de1d6f012a11 Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 28 Apr 2022 23:16:00 -0700 Subject: mm/memcontrol.c: make cgroup_memory_noswap static cgroup_memory_noswap is only used in mm/memcontrol.c, therefore just make it static, and remove export in include/linux/memcontrol.h Link: https://lkml.kernel.org/r/20220421124736.62180-1-lujialin4@huawei.com Signed-off-by: Lu Jialin Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Muchun Song Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ---- mm/memcontrol.c | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 89b14729d59f..fe580cb96683 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -937,10 +937,6 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); -#ifdef CONFIG_MEMCG_SWAP -extern bool cgroup_memory_noswap; -#endif - void folio_memcg_lock(struct folio *folio); void folio_memcg_unlock(struct folio *folio); void lock_page_memcg(struct page *page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2d5c6618c8a..5cf5e32de313 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -89,7 +89,7 @@ static bool cgroup_memory_nokmem __ro_after_init; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -bool cgroup_memory_noswap __ro_after_init; +static bool cgroup_memory_noswap __ro_after_init; #else #define cgroup_memory_noswap 1 #endif -- cgit v1.2.3 From 9707aff701e325e7c4660409c43392ff2fb36fad Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 28 Apr 2022 23:16:00 -0700 Subject: mm/memcontrol.c: remove unused private flag of memory.oom_control There is no use for the private 
value, __OOM_TYPE and OOM notifier OOM_CONTROL. Therefore remove them to make the code clean. Link: https://lkml.kernel.org/r/20220421122755.40899-1-lujialin4@huawei.com Signed-off-by: Lu Jialin Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5cf5e32de313..c1f888313462 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -209,7 +209,6 @@ static struct move_charge_struct { enum res_type { _MEM, _MEMSWAP, - _OOM_TYPE, _KMEM, _TCP, }; @@ -217,8 +216,6 @@ enum res_type { #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -/* Used for OOM notifier */ -#define OOM_CONTROL (0) /* * Iteration constructs for visiting all cgroups (under a tree). If @@ -4887,7 +4884,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .name = "oom_control", .seq_show = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, - .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, { .name = "pressure_level", -- cgit v1.2.3 From 98af39d52e336b2d7d7be67ac405f978d81f65b8 Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Thu, 28 Apr 2022 23:16:00 -0700 Subject: mm/vmalloc: fix a comment The sentence "but the mempolcy want to alloc memory by interleaving" should be rephrased as "but the mempolicy wants to alloc memory by interleaving", where "mempolicy" is a struct name. This work is coauthored by Yinan Zhang Jiajian Ye Shenghong Han Chongxi Zhao Yuhong Feng Yongqiang Liu Link: https://lkml.kernel.org/r/20220401064543.4447-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cadfbb5155ea..9d68ac51a19d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2895,7 +2895,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, /* memory allocation should consider mempolicy, we can't * wrongly use nearest node when nid == NUMA_NO_NODE, * otherwise memory may be allocated in only one node, - * but mempolcy want to alloc memory by interleaving. + * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, -- cgit v1.2.3 From 4fcdcc12915c70761ae6adf25e3a295a75a7431d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 28 Apr 2022 23:16:00 -0700 Subject: vmap(): don't allow invalid pages vmap() takes struct page *pages as one of its arguments, and a user may provide an invalid pointer, which may lead to a corrupted translation table. An example of such behaviour is erroneous usage of virt_to_page(): vaddr1 = dma_alloc_coherent() page = virt_to_page() // Wrong here ... vaddr2 = vmap(page) memset(vaddr2) // Faulting here virt_to_page() returns a wrong pointer if vaddr1 is not a linear kernel address. The problem is that vmap() successfully populates the pte with a bad pfn, and it is much harder to debug at memory access time. This case should be caught when DEBUG_VIRTUAL is enabled, but it is not enabled in popular distros. The kernel already checks the pages against NULL.
In the case mentioned above, however, the address is not NULL, and it's big enough so that the hardware generated Address Size Abort on arm64: [ 665.484101] Unhandled fault at 0xffff8000252cd000 [ 665.488807] Mem abort info: [ 665.491617] ESR = 0x96000043 [ 665.494675] EC = 0x25: DABT (current EL), IL = 32 bits [ 665.499985] SET = 0, FnV = 0 [ 665.503039] EA = 0, S1PTW = 0 [ 665.506167] Data abort info: [ 665.509047] ISV = 0, ISS = 0x00000043 [ 665.512882] CM = 0, WnR = 1 [ 665.515851] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000818cb000 [ 665.522550] [ffff8000252cd000] pgd=000000affcfff003, pud=000000affcffe003, pmd=0000008fad8c3003, pte=00688000a5217713 [ 665.533160] Internal error: level 3 address size fault: 96000043 [#1] SMP [ 665.539936] Modules linked in: [...] [ 665.616212] CPU: 178 PID: 13199 Comm: test Tainted: P OE 5.4.0-84-generic #94~18.04.1-Ubuntu [ 665.626806] Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.0.6 07/10/2018 [ 665.636618] pstate: 80400009 (Nzcv daif +PAN -UAO) [ 665.641407] pc : __memset+0x38/0x188 [ 665.645146] lr : test+0xcc/0x3f8 [ 665.650184] sp : ffff8000359bb840 [ 665.653486] x29: ffff8000359bb840 x28: 0000000000000000 [ 665.658785] x27: 0000000000000000 x26: 0000000000231000 [ 665.664083] x25: ffff00ae660f6110 x24: ffff00ae668cb800 [ 665.669382] x23: 0000000000000001 x22: ffff00af533e5000 [ 665.674680] x21: 0000000000001000 x20: 0000000000000000 [ 665.679978] x19: ffff00ae66950000 x18: ffffffffffffffff [ 665.685276] x17: 00000000588636a5 x16: 0000000000000013 [ 665.690574] x15: ffffffffffffffff x14: 000000000007ffff [ 665.695872] x13: 0000000080000000 x12: 0140000000000000 [ 665.701170] x11: 0000000000000041 x10: ffff8000652cd000 [ 665.706468] x9 : ffff8000252cf000 x8 : ffff8000252cd000 [ 665.711767] x7 : 0303030303030303 x6 : 0000000000001000 [ 665.717065] x5 : ffff8000252cd000 x4 : 0000000000000000 [ 665.722363] x3 : ffff8000252cdfff x2 : 0000000000000001 [ 665.727661] x1 : 0000000000000003 x0 : ffff8000252cd000 [ 665.732960] Call trace: [ 665.735395] __memset+0x38/0x188 [...] Interestingly, this abort happens even if copy_from_kernel_nofault() is used, which is quite inconvenient for debugging purposes. This patch adds a pfn_valid() check into vmap() path, so that invalid mapping will not be created; WARN_ON() is used to let client code know that something goes wrong, and it's not a regular EINVAL situation. 
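A caller-side guard can catch this class of bug even earlier than the pfn_valid() check added below. The sketch that follows is a hypothetical helper (the name and surrounding context are invented); virt_addr_valid(), virt_to_page() and vmap() are the interfaces the changelog refers to, and virt_addr_valid() is what distinguishes a genuine linear-map address from, say, a dma_alloc_coherent() remapping:

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/*
 * Hypothetical helper: remap a physically contiguous, linear-map buffer
 * through vmap().  virt_to_page() is only meaningful for linear-map
 * addresses, so the address is checked with virt_addr_valid() up front
 * rather than relying on the late WARN_ON(!pfn_valid()) to catch it.
 */
static void *remap_linear_buffer(void *vaddr, unsigned int nr_pages)
{
	struct page **pages;
	void *remapped;
	unsigned int i;

	if (!virt_addr_valid(vaddr))
		return NULL;		/* e.g. a dma_alloc_coherent() vaddr */

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	for (i = 0; i < nr_pages; i++)
		pages[i] = virt_to_page(vaddr + i * PAGE_SIZE);

	remapped = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
	kfree(pages);			/* the array is not needed once mapped */
	return remapped;
}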
Link: https://lkml.kernel.org/r/20220422220410.1308706-1-yury.norov@gmail.com Signed-off-by: Yury Norov (NVIDIA) Suggested-by: Matthew Wilcox (Oracle) Cc: Alexey Klimov Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Ding Tianhong Cc: Mark Rutland Cc: Nicholas Piggin Cc: Robin Murphy Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/vmalloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9d68ac51a19d..c3af9b28e54f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -478,6 +478,9 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; + if (WARN_ON(!pfn_valid(page_to_pfn(page)))) + return -EINVAL; + set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); -- cgit v1.2.3 From 8d98e42fb20c25e8efdab4cc1ac46d52ba964aca Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Thu, 28 Apr 2022 23:16:01 -0700 Subject: Documentation/sysctl: document page_lock_unfairness commit 5ef64cc8987a ("mm: allow a controlled amount of unfairness in the page lock") introduced a new sysctl but no accompanying documentation. Add a simple entry to the documentation. Link: https://lkml.kernel.org/r/20220325164437.120246-1-jsavitz@redhat.com Signed-off-by: Joel Savitz Cc: Jonathan Corbet Cc: Vlastimil Babka Cc: Mel Gorman Cc: Dave Hansen Cc: Suren Baghdasaryan Cc: Mike Rapoport Cc: "zhangyi (F)" Cc: Charan Teja Reddy Cc: Linus Torvalds Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/vm.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index f4804ce37c58..747e325ebcd0 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -62,6 +62,7 @@ Currently, these files are in /proc/sys/vm: - overcommit_memory - overcommit_ratio - page-cluster +- page_lock_unfairness - panic_on_oom - percpu_pagelist_high_fraction - stat_interval @@ -754,6 +755,14 @@ extra faults and I/O delays for following faults if they would have been part of that consecutive pages readahead would have brought in. +page_lock_unfairness +==================== + +This value determines the number of times that the page lock can be +stolen from under a waiter. After the lock is stolen the number of times +specified in this file (default is 5), the "fair lock handoff" semantics +will apply, and the waiter will only be awakened if the lock can be taken. + panic_on_oom ============ -- cgit v1.2.3 From 379313241e77abc18258da1afd49d111c72c5a3d Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:16:01 -0700 Subject: mm/page_alloc: adding same penalty is enough to get round-robin order To make the node order round-robin within the same distance group, we add a penalty to the first node we get in each round. To get a round-robin order within the same distance group, we don't need to decrease the penalty since: * find_next_best_node() always iterates nodes in the same order * distance matters more than penalty in find_next_best_node() * among nodes with the same distance, the first one is picked So it is fine to add the same penalty when we get the first node in the same distance group. Since we just add a constant of 1 to the node penalty, it is not necessary to multiply by MAX_NODE_LOAD for preference.
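Why a constant penalty of 1 is enough can be seen with a toy model of the selection rule val = distance * MAX_NUMNODES + node_load[n] (smallest value wins, ties decided by iteration order). This is a userspace sketch, not kernel code, and the node count and distance table are made up for illustration:

#include <limits.h>
#include <stdio.h>

#define NR_NODES	4
#define MAX_NUMNODES	NR_NODES

/* Node 0 is the local node; nodes 1-3 form one equal-distance group. */
static const int distance[NR_NODES] = { 10, 20, 20, 20 };
static int node_load[NR_NODES];

/* Selection rule modelled on find_next_best_node(): smallest val wins,
 * ties go to the node visited first. */
static int pick_best_remote_node(void)
{
	int n, best = -1, min_val = INT_MAX;

	for (n = 1; n < NR_NODES; n++) {
		int val = distance[n] * MAX_NUMNODES + node_load[n];

		if (val < min_val) {
			min_val = val;
			best = n;
		}
	}
	return best;
}

int main(void)
{
	for (int build = 0; build < 6; build++) {
		int best = pick_best_remote_node();

		node_load[best] += 1;	/* the constant penalty */
		printf("build %d picks node %d first\n", build, best);
	}
	return 0;
}

Successive builds pick nodes 1, 2, 3, 1, 2, 3: because distance is multiplied by MAX_NUMNODES it always dominates, and the load increment of 1 only breaks ties within the equal-distance group, which is exactly the round-robin behaviour described above.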
[richard.weiyang@gmail.com: remove remove MAX_NODE_LOAD, per Vlastimil] Link: https://lkml.kernel.org/r/20220412001319.7462-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20220123013537.20491-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Signed-off-by: Wei Yang Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Acked-by: Oscar Salvador Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Krupa Ramakrishnan Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_alloc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0e42038382c1..66a5198157ad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6171,7 +6171,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, } -#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@ -6218,7 +6217,7 @@ int find_next_best_node(int node, nodemask_t *used_node_mask) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ - val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val *= MAX_NUMNODES; val += node_load[n]; if (val < min_val) { @@ -6284,13 +6283,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; - int node, load, nr_nodes = 0; + int node, nr_nodes = 0; nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = nr_online_nodes; prev_node = local_node; memset(node_order, 0, sizeof(node_order)); @@ -6302,11 +6300,10 @@ static void build_zonelists(pg_data_t *pgdat) */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] += load; + node_load[node] += 1; node_order[nr_nodes++] = node; prev_node = node; - load--; } build_zonelists_in_node_order(pgdat, node_order, nr_nodes); -- cgit v1.2.3 From bb0e28eb5bc2b3a22e47861ca59bccca566023e8 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 28 Apr 2022 23:16:01 -0700 Subject: mm: page_alloc: simplify pageblock migratetype check in __free_one_page() Move pageblock migratetype check code in the while loop to simplify the logic. It also saves redundant buddy page checking code. 
Link: https://lkml.kernel.org/r/20220401230804.1658207-1-zi.yan@sent.com Link: https://lore.kernel.org/linux-mm/27ff69f9-60c5-9e59-feb2-295250077551@suse.cz/ Signed-off-by: Zi Yan Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Rapoport Cc: Oscar Salvador Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 46 +++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 66a5198157ad..26dea6cd3f53 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1053,7 +1053,6 @@ static inline void __free_one_page(struct page *page, int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); - unsigned int max_order = pageblock_order; unsigned long buddy_pfn; unsigned long combined_pfn; struct page *buddy; @@ -1069,8 +1068,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); -continue_merging: - while (order < max_order) { + while (order < MAX_ORDER - 1) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -1081,6 +1079,22 @@ continue_merging: if (!page_is_buddy(page, buddy, order)) goto done_merging; + + if (unlikely(order >= pageblock_order)) { + /* + * We want to prevent merge between freepages on pageblock + * without fallbacks and normal pageblock. Without this, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting. + */ + int buddy_mt = get_pageblock_migratetype(buddy); + + if (migratetype != buddy_mt + && (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) + goto done_merging; + } + /* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. @@ -1094,32 +1108,6 @@ continue_merging: pfn = combined_pfn; order++; } - if (order < MAX_ORDER - 1) { - /* If we are here, it means order is >= pageblock_order. - * We want to prevent merge between freepages on pageblock - * without fallbacks and normal pageblock. Without this, - * pageblock isolation could cause incorrect freepage or CMA - * accounting or HIGHATOMIC accounting. - * - * We don't want to hit this code for the more frequent - * low-order merging. - */ - int buddy_mt; - - buddy_pfn = __find_buddy_pfn(pfn, order); - buddy = page + (buddy_pfn - pfn); - - if (!page_is_buddy(page, buddy, order)) - goto done_merging; - buddy_mt = get_pageblock_migratetype(buddy); - - if (migratetype != buddy_mt - && (!migratetype_is_mergeable(migratetype) || - !migratetype_is_mergeable(buddy_mt))) - goto done_merging; - max_order = order + 1; - goto continue_merging; - } done_merging: set_buddy_order(page, order); -- cgit v1.2.3 From 8170ac4700d26f65a9a4ebc8ae488539158dc5f7 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 28 Apr 2022 23:16:01 -0700 Subject: mm: wrap __find_buddy_pfn() with a necessary buddy page validation Whenever the buddy of a page is found from __find_buddy_pfn(), page_is_buddy() should be used to check its validity. Add a helper function find_buddy_page_pfn() to find the buddy page and do the check together. 
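The pfn arithmetic that find_buddy_page_pfn() wraps is the XOR trick used by __find_buddy_pfn() in the diff below: the buddy of a 2^order block is found by flipping bit 'order' of the pfn, and the merged block starts at the lower of the two pfns. A standalone worked example (the pfn values are arbitrary):

#include <stdio.h>

/* Worked example of the buddy pfn arithmetic: flip bit 'order' to find the
 * buddy, AND the two pfns to find the start of the combined block. */
int main(void)
{
	unsigned long pfn = 0x1008;
	unsigned int order = 3;
	unsigned long buddy_pfn = pfn ^ (1UL << order);	/* 0x1000 */
	unsigned long combined_pfn = buddy_pfn & pfn;	/* 0x1000 */

	printf("pfn=%#lx buddy=%#lx combined=%#lx new order=%u\n",
	       pfn, buddy_pfn, combined_pfn, order + 1);
	return 0;
}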
[ziy@nvidia.com: updates per David] Link: https://lkml.kernel.org/r/20220401230804.1658207-2-zi.yan@sent.com Link: https://lore.kernel.org/linux-mm/CAHk-=wji_AmYygZMTsPMdJ7XksMt7kOur8oDfDdniBRMjm4VkQ@mail.gmail.com/ Link: https://lkml.kernel.org/r/7236E7CA-B5F1-4C04-AB85-E86FA3E9A54B@nvidia.com Suggested-by: Linus Torvalds Signed-off-by: Zi Yan Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Cc: Steven Rostedt (Google) Cc: Mel Gorman Cc: Mike Rapoport Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/internal.h | 117 ++++++++++++++++++++++++++++++++++++++++------------ mm/page_alloc.c | 52 ++++------------------- mm/page_isolation.c | 9 ++-- 3 files changed, 101 insertions(+), 77 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index cf16280ce132..6293468fd7b7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -211,6 +211,67 @@ struct alloc_context { bool spread_dirty_pages; }; +/* + * This function returns the order of a free page in the buddy system. In + * general, page_zone(page)->lock must be held by the caller to prevent the + * page from being allocated in parallel and returning garbage as the order. + * If a caller does not hold page_zone(page)->lock, it must guarantee that the + * page cannot be allocated or merged in parallel. Alternatively, it must + * handle invalid values gracefully, and use buddy_order_unsafe() below. + */ +static inline unsigned int buddy_order(struct page *page) +{ + /* PageBuddy() must be checked by the caller */ + return page_private(page); +} + +/* + * Like buddy_order(), but for callers who cannot afford to hold the zone lock. + * PageBuddy() should be checked first by the caller to minimize race window, + * and invalid values must be handled gracefully. + * + * READ_ONCE is used so that if the caller assigns the result into a local + * variable and e.g. tests it for valid range before using, the compiler cannot + * decide to remove the variable and inline the page_private(page) multiple + * times, potentially observing different values in the tests and the actual + * use of the result. + */ +#define buddy_order_unsafe(page) READ_ONCE(page_private(page)) + +/* + * This function checks whether a page is free && is the buddy + * we can coalesce a page and its buddy if + * (a) the buddy is not in a hole (check before calling!) && + * (b) the buddy is in the buddy system && + * (c) a page and its buddy have the same order && + * (d) a page and its buddy are in the same zone. + * + * For recording whether a page is in the buddy system, we set PageBuddy. + * Setting, clearing, and testing PageBuddy is serialized by zone->lock. + * + * For recording page's order, we use page_private(page). + */ +static inline bool page_is_buddy(struct page *page, struct page *buddy, + unsigned int order) +{ + if (!page_is_guard(buddy) && !PageBuddy(buddy)) + return false; + + if (buddy_order(buddy) != order) + return false; + + /* + * zone check is done late to avoid uselessly calculating + * zone/node ids for pages that could never merge. + */ + if (page_zone_id(page) != page_zone_id(buddy)) + return false; + + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + return true; +} + /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). @@ -234,6 +295,35 @@ __find_buddy_pfn(unsigned long page_pfn, unsigned int order) return page_pfn ^ (1 << order); } +/* + * Find the buddy of @page and validate it. 
+ * @page: The input page + * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the + * function is used in the performance-critical __free_one_page(). + * @order: The order of the page + * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to + * page_to_pfn(). + * + * The found buddy can be a non PageBuddy, out of @page's zone, or its order is + * not the same as @page. The validation is necessary before use it. + * + * Return: the found buddy page or NULL if not found. + */ +static inline struct page *find_buddy_page_pfn(struct page *page, + unsigned long pfn, unsigned int order, unsigned long *buddy_pfn) +{ + unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order); + struct page *buddy; + + buddy = page + (__buddy_pfn - pfn); + if (buddy_pfn) + *buddy_pfn = __buddy_pfn; + + if (page_is_buddy(page, buddy, order)) + return buddy; + return NULL; +} + extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone); @@ -336,33 +426,6 @@ isolate_migratepages_range(struct compact_control *cc, int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool only_stealable, bool *can_steal); -/* - * This function returns the order of a free page in the buddy system. In - * general, page_zone(page)->lock must be held by the caller to prevent the - * page from being allocated in parallel and returning garbage as the order. - * If a caller does not hold page_zone(page)->lock, it must guarantee that the - * page cannot be allocated or merged in parallel. Alternatively, it must - * handle invalid values gracefully, and use buddy_order_unsafe() below. - */ -static inline unsigned int buddy_order(struct page *page) -{ - /* PageBuddy() must be checked by the caller */ - return page_private(page); -} - -/* - * Like buddy_order(), but for callers who cannot afford to hold the zone lock. - * PageBuddy() should be checked first by the caller to minimize race window, - * and invalid values must be handled gracefully. - * - * READ_ONCE is used so that if the caller assigns the result into a local - * variable and e.g. tests it for valid range before using, the compiler cannot - * decide to remove the variable and inline the page_private(page) multiple - * times, potentially observing different values in the tests and the actual - * use of the result. - */ -#define buddy_order_unsafe(page) READ_ONCE(page_private(page)) - /* * These three helpers classifies VMAs for virtual memory accounting. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 26dea6cd3f53..8de8d7a5c3e3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -867,40 +867,6 @@ static inline void set_buddy_order(struct page *page, unsigned int order) __SetPageBuddy(page); } -/* - * This function checks whether a page is free && is the buddy - * we can coalesce a page and its buddy if - * (a) the buddy is not in a hole (check before calling!) && - * (b) the buddy is in the buddy system && - * (c) a page and its buddy have the same order && - * (d) a page and its buddy are in the same zone. - * - * For recording whether a page is in the buddy system, we set PageBuddy. - * Setting, clearing, and testing PageBuddy is serialized by zone->lock. - * - * For recording page's order, we use page_private(page). 
- */ -static inline bool page_is_buddy(struct page *page, struct page *buddy, - unsigned int order) -{ - if (!page_is_guard(buddy) && !PageBuddy(buddy)) - return false; - - if (buddy_order(buddy) != order) - return false; - - /* - * zone check is done late to avoid uselessly calculating - * zone/node ids for pages that could never merge. - */ - if (page_zone_id(page) != page_zone_id(buddy)) - return false; - - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - - return true; -} - #ifdef CONFIG_COMPACTION static inline struct capture_control *task_capc(struct zone *zone) { @@ -1009,18 +975,17 @@ static inline bool buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, struct page *page, unsigned int order) { - struct page *higher_page, *higher_buddy; - unsigned long combined_pfn; + unsigned long higher_page_pfn; + struct page *higher_page; if (order >= MAX_ORDER - 2) return false; - combined_pfn = buddy_pfn & pfn; - higher_page = page + (combined_pfn - pfn); - buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); - higher_buddy = higher_page + (buddy_pfn - combined_pfn); + higher_page_pfn = buddy_pfn & pfn; + higher_page = page + (higher_page_pfn - pfn); - return page_is_buddy(higher_page, higher_buddy, order + 1); + return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1, + NULL) != NULL; } /* @@ -1074,10 +1039,9 @@ static inline void __free_one_page(struct page *page, migratetype); return; } - buddy_pfn = __find_buddy_pfn(pfn, order); - buddy = page + (buddy_pfn - pfn); - if (!page_is_buddy(page, buddy, order)) + buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); + if (!buddy) goto done_merging; if (unlikely(order >= pageblock_order)) { diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f67c4c70f17f..ff0ea6308299 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -70,7 +70,6 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) unsigned long flags, nr_pages; bool isolated_page = false; unsigned int order; - unsigned long pfn, buddy_pfn; struct page *buddy; zone = page_zone(page); @@ -89,11 +88,9 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (PageBuddy(page)) { order = buddy_order(page); if (order >= pageblock_order && order < MAX_ORDER - 1) { - pfn = page_to_pfn(page); - buddy_pfn = __find_buddy_pfn(pfn, order); - buddy = page + (buddy_pfn - pfn); - - if (!is_migrate_isolate_page(buddy)) { + buddy = find_buddy_page_pfn(page, page_to_pfn(page), + order, NULL); + if (buddy && !is_migrate_isolate_page(buddy)) { isolated_page = !!__isolate_free_page(page, order); /* * Isolating a free page in an isolated pageblock -- cgit v1.2.3 From f142e70750a1ea36ba60fb4f24bc37713e921f73 Mon Sep 17 00:00:00 2001 From: liqiong Date: Thu, 28 Apr 2022 23:16:01 -0700 Subject: mm/memory-failure.c: remove unnecessary (void*) conversions No need cast (void*) to (struct hwp_walk*). 
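The casts being removed are redundant because, in C (unlike C++), a void * converts implicitly to any object pointer type. A minimal standalone illustration, with invented structure names standing in for the real ones:

#include <stdio.h>

/* Invented types: assignment from void * to a concrete pointer type needs
 * no cast in C, which is why 'hwp = walk->private' is equivalent to the
 * casted form being removed. */
struct toy_walk {
	void *private;
};

struct toy_state {
	int value;
};

int main(void)
{
	struct toy_state state = { .value = 42 };
	struct toy_walk walk = { .private = &state };
	struct toy_state *s = walk.private;	/* no cast needed in C */

	printf("%d\n", s->value);
	return 0;
}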
Link: https://lkml.kernel.org/r/20220322142826.25939-1-liqiong@nfschina.com Signed-off-by: liqiong Acked-by: Naoya Horiguchi Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 27760c19bad7..837348a2d669 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -622,7 +622,7 @@ static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + struct hwp_walk *hwp = walk->private; int ret = 0; pte_t *ptep, *mapped_pte; spinlock_t *ptl; @@ -656,7 +656,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + struct hwp_walk *hwp = walk->private; pte_t pte = huge_ptep_get(ptep); struct hstate *h = hstate_vma(walk->vma); -- cgit v1.2.3 From f361e2462e8cccdd9231aa3274690705a2ea35a2 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 28 Apr 2022 23:16:02 -0700 Subject: mm/hwpoison: put page in already hwpoisoned case with MF_COUNT_INCREASED In the already-hwpoisoned case, memory_failure() is supposed to release the page refcount taken for error handling before returning. But currently the refcount is not released when called with MF_COUNT_INCREASED, which leaves the page refcount inconsistent. This should be rare and non-critical, but it might be inconvenient in testing (unpoison doesn't work). Link: https://lkml.kernel.org/r/20220408135323.1559401-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Suggested-by: Miaohe Lin Reviewed-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Dan Carpenter Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 837348a2d669..44510efd0829 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1811,6 +1811,8 @@ try_again: res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) res = kill_accessing_process(current, pfn, flags); + if (flags & MF_COUNT_INCREASED) + put_page(p); goto unlock_mutex; } -- cgit v1.2.3 From 2ba2b008a8bf5fd268a43d03ba79e0ad464d6836 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 28 Apr 2022 23:16:02 -0700 Subject: Revert "mm/memory-failure.c: fix race with changing page compound again" Reverts commit 888af2701db7 ("mm/memory-failure.c: fix race with changing page compound again") because now we fetch the page refcount under hugetlb_lock in try_memory_failure_hugetlb() so that the race check is no longer necessary.
Link: https://lkml.kernel.org/r/20220408135323.1559401-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Suggested-by: Miaohe Lin Reviewed-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Miaohe Lin Cc: Yang Shi Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - include/ras/ras_event.h | 1 - mm/memory-failure.c | 11 ----------- 3 files changed, 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9f44254af8ce..d446e834a3e5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3251,7 +3251,6 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, - MF_MSG_DIFFERENT_PAGE_SIZE, MF_MSG_UNKNOWN, }; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 1e694fd239b9..d0337a41141c 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -374,7 +374,6 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_BUDDY, "free buddy page" ) \ EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ - EM ( MF_MSG_DIFFERENT_PAGE_SIZE, "different page size" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44510efd0829..8af37f76e7ba 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -733,7 +733,6 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", - [MF_MSG_DIFFERENT_PAGE_SIZE] = "different page size", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1605,16 +1604,6 @@ retry: return res == MF_RECOVERED ? 0 : -EBUSY; } - /* - * The page could have changed compound pages due to race window. - * If this happens just bail out. - */ - if (!PageHuge(p) || compound_head(p) != head) { - action_result(pfn, MF_MSG_DIFFERENT_PAGE_SIZE, MF_IGNORED); - res = -EBUSY; - goto out; - } - page_flags = head->flags; /* -- cgit v1.2.3 From 3f871370686ddf3c72207321eef8f6672ae957e4 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:02 -0700 Subject: mm/memory-failure.c: minor cleanup for HWPoisonHandlable Patch series "A few fixup and cleanup patches for memory failure", v2. This series contains a patch to clean up the HWPoisonHandlable and another one to dissolve truncated hugetlb page. More details can be found in the respective changelogs. This patch (of 2): The local variable movable can be removed by returning true directly. Also fix typo 'mirgate'. No functional change intended. 
Link: https://lkml.kernel.org/r/20220414114941.11223-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220414114941.11223-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8af37f76e7ba..48815e1af645 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1178,13 +1178,11 @@ void ClearPageHWPoisonTakenOff(struct page *page) */ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) { - bool movable = false; - - /* Soft offline could mirgate non-LRU movable pages */ + /* Soft offline could migrate non-LRU movable pages */ if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) - movable = true; + return true; - return movable || PageLRU(page) || is_free_buddy_page(page); + return PageLRU(page) || is_free_buddy_page(page); } static int __get_hwpoison_page(struct page *page, unsigned long flags) -- cgit v1.2.3 From ef526b17bc3399b8df25d574aa11fc36f89da80a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:02 -0700 Subject: mm/memory-failure.c: dissolve truncated hugetlb page If me_huge_page meets a truncated but not yet freed hugepage, it won't be dissolved even if we hold the last refcnt. This is because the hugepage has a NULL page_mapping but is not an anonymous hugepage either. Thus we lose the last chance to dissolve it into the buddy allocator to save healthy subpages. Remove the PageAnon check to handle these hugepages too. Link: https://lkml.kernel.org/r/20220414114941.11223-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Cc: David Hildenbrand Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 48815e1af645..e01a2da803e2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1040,12 +1040,11 @@ static int me_huge_page(struct page_state *ps, struct page *p) res = MF_FAILED; unlock_page(hpage); /* - * migration entry prevents later access on error anonymous - * hugepage, so we can free and dissolve it into buddy to - * save healthy subpages. + * migration entry prevents later access on error hugepage, + * so we can free and dissolve it into buddy to save healthy + * subpages. */ - if (PageAnon(hpage)) - put_page(hpage); + put_page(hpage); if (__page_handle_poison(p)) { page_ref_inc(p); res = MF_RECOVERED; -- cgit v1.2.3 From b283d983a7a6ffe3939ff26f06d151331a7c1071 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 28 Apr 2022 23:16:02 -0700 Subject: mm, hugetlb, hwpoison: separate branch for free and in-use hugepage We know that HPageFreed pages should have page refcount 0, so get_page_unless_zero() always fails and returns 0. So explicitly separate the branch based on page state for minor optimization and better readability.
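The reasoning above relies on the semantics of get_page_unless_zero(): a reference is taken only when the refcount is not already zero. A toy userspace model of that behaviour (not the kernel implementation, which operates on struct page's atomic refcount):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy model of the semantic relied on above: a reference is taken only if
 * the count is currently non-zero, so for a free hugepage (refcount == 0)
 * the call always fails.  Userspace sketch, not the kernel code. */
static bool toy_get_unless_zero(atomic_int *refcount)
{
	int old = atomic_load(refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(refcount, &old, old + 1))
			return true;	/* reference taken */
	}
	return false;			/* was zero: do not touch the page */
}

int main(void)
{
	atomic_int free_page = 0, in_use_page = 2;

	printf("free page:   %d\n", toy_get_unless_zero(&free_page));
	printf("in-use page: %d\n", toy_get_unless_zero(&in_use_page));
	return 0;
}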
Link: https://lkml.kernel.org/r/20220415041848.GA3034499@ik1-406-35019.vs.sakura.ne.jp Signed-off-by: Naoya Horiguchi Suggested-by: Mike Kravetz Suggested-by: Miaohe Lin Reviewed-by: Mike Kravetz Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 +++- mm/memory-failure.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3fc721789743..f8e048b939c7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6776,7 +6776,9 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) spin_lock_irq(&hugetlb_lock); if (PageHeadHuge(page)) { *hugetlb = true; - if (HPageFreed(page) || HPageMigratable(page)) + if (HPageFreed(page)) + ret = 0; + else if (HPageMigratable(page)) ret = get_page_unless_zero(page); else ret = -EBUSY; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e01a2da803e2..9711c0eb70b0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1517,7 +1517,9 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) if (flags & MF_COUNT_INCREASED) { ret = 1; count_increased = true; - } else if (HPageFreed(head) || HPageMigratable(head)) { + } else if (HPageFreed(head)) { + ret = 0; + } else if (HPageMigratable(head)) { ret = get_page_unless_zero(head); if (ret) count_increased = true; -- cgit v1.2.3 From 84448c8ecd9a130e8cddef5c585446c5520e774b Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Thu, 28 Apr 2022 23:16:03 -0700 Subject: hugetlb: remove use of list iterator variable after loop In preparation to limit the scope of the list iterator to the list traversal loop, use a dedicated pointer to iterate through the list [1]. Before hugetlb_resv_map_add() was expecting a file_region struct, but in case the list iterator in add_reservation_in_range() did not exit early, the variable passed in, is not actually a valid structure. In such a case 'rg' is computed on the head element of the list and represents an out-of-bounds pointer. This still remains safe *iff* you only use the link member (as it is done in hugetlb_resv_map_add()). To avoid the type-confusion altogether and limit the list iterator to the loop, only a list_head pointer is kept to pass to hugetlb_resv_map_add(). Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20220331224323.903842-1-jakobkoschel@gmail.com Signed-off-by: Jakob Koschel Cc: Mike Kravetz Cc: Mike Rapoport Cc: "Brian Johannesmeyer" Cc: Cristiano Giuffrida Cc: "Bos, H.J." 
Cc: Jakob Koschel Signed-off-by: Andrew Morton --- mm/hugetlb.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f8e048b939c7..34efd507a68d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -370,7 +370,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) } static inline long -hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from, +hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, long to, struct hstate *h, struct hugetlb_cgroup *cg, long *regions_needed) { @@ -379,7 +379,7 @@ hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from, if (!regions_needed) { nrg = get_file_region_entry_from_cache(map, from, to); record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); - list_add(&nrg->link, rg->link.prev); + list_add(&nrg->link, rg); coalesce_file_region(map, nrg); } else *regions_needed += 1; @@ -402,47 +402,52 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, long add = 0; struct list_head *head = &resv->regions; long last_accounted_offset = f; - struct file_region *rg = NULL, *trg = NULL; + struct file_region *iter, *trg = NULL; + struct list_head *rg = NULL; if (regions_needed) *regions_needed = 0; /* In this loop, we essentially handle an entry for the range - * [last_accounted_offset, rg->from), at every iteration, with some + * [last_accounted_offset, iter->from), at every iteration, with some * bounds checking. */ - list_for_each_entry_safe(rg, trg, head, link) { + list_for_each_entry_safe(iter, trg, head, link) { /* Skip irrelevant regions that start before our range. */ - if (rg->from < f) { + if (iter->from < f) { /* If this region ends after the last accounted offset, * then we need to update last_accounted_offset. */ - if (rg->to > last_accounted_offset) - last_accounted_offset = rg->to; + if (iter->to > last_accounted_offset) + last_accounted_offset = iter->to; continue; } /* When we find a region that starts beyond our range, we've * finished. */ - if (rg->from >= t) + if (iter->from >= t) { + rg = iter->link.prev; break; + } - /* Add an entry for last_accounted_offset -> rg->from, and + /* Add an entry for last_accounted_offset -> iter->from, and * update last_accounted_offset. */ - if (rg->from > last_accounted_offset) - add += hugetlb_resv_map_add(resv, rg, + if (iter->from > last_accounted_offset) + add += hugetlb_resv_map_add(resv, iter->link.prev, last_accounted_offset, - rg->from, h, h_cg, + iter->from, h, h_cg, regions_needed); - last_accounted_offset = rg->to; + last_accounted_offset = iter->to; } /* Handle the case where our range extends beyond * last_accounted_offset. */ + if (!rg) + rg = head->prev; if (last_accounted_offset < t) add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, t, h, h_cg, regions_needed); -- cgit v1.2.3 From 2e4ec02bbcc05b8905d65c763ebde6bc85508e90 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:03 -0700 Subject: mm: hugetlb_vmemmap: introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP The feature of minimizing overhead of struct page associated with each HugeTLB page is implemented on x86_64, however, the infrastructure of this feature is already there, we could easily enable it for other architectures. Introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP for other architectures to be easily enabled. Just select this config if they want to enable this feature. 
Link: https://lkml.kernel.org/r/20220331065640.5777-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Suggested-by: Andrew Morton Reviewed-by: Barry Song Tested-by: Barry Song Reviewed-by: Anshuman Khandual Cc: Bodeddula Balasubramaniam Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Fam Zheng Cc: James Morse Cc: Mark Rutland Cc: Mike Kravetz Cc: Oscar Salvador Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + fs/Kconfig | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0142e01002e..ae4e1a79a1e2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -121,6 +121,7 @@ config X86 select ARCH_WANTS_NO_INSTR select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH diff --git a/fs/Kconfig b/fs/Kconfig index 30b751c7f11a..8f6ab1cf5115 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -245,9 +245,17 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +# +# Select this config option from the architecture Kconfig, if it is preferred +# to enable the feature of minimizing overhead of struct page associated with +# each HugeTLB page. +# +config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + bool + config HUGETLB_PAGE_FREE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 + depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON -- cgit v1.2.3 From 1e63ac088f20f7a4425c430c31ecd3cf167fb3f2 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:03 -0700 Subject: arm64: mm: hugetlb: enable HUGETLB_PAGE_FREE_VMEMMAP for arm64 The feature of minimizing the overhead of struct page associated with each HugeTLB page aims to free its vmemmap pages (used as struct page) to save memory, which amounts to ~14GB/16GB per 1TB of HugeTLB pages (2MB/1GB type). In short, when a HugeTLB page is allocated or freed, the vmemmap array representing the range associated with the page needs to be remapped. When a page is allocated, vmemmap pages are freed after remapping. When a page is freed, previously discarded vmemmap pages must be allocated before remapping. More implementation details can be found in [1]. The infrastructure for freeing vmemmap pages associated with each HugeTLB page is already there, so we can easily enable HUGETLB_PAGE_FREE_VMEMMAP for arm64; the only thing to be fixed is flush_dcache_page(). flush_dcache_page() needs to be adapted to operate on the head page's flags since the tail vmemmap pages are mapped read-only after the feature is enabled (the clear operation is not permitted). There were some discussions about this in the thread [2], but no conclusion was reached in the end. I copied the concerns raised by Anshuman here and explain why they are superfluous. It is safe to enable it for x86_64 as well as arm64. 1st concern: ''' But what happens when a hot remove section's vmemmap area (which is being teared down) is nearby another vmemmap area which is either created or being destroyed for HugeTLB alloc/free purpose. As you mentioned HugeTLB pages inside the hot remove section might be safe. But what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ?
Massive HugeTLB alloc /use/free test cycle using memory just adjacent to a memory hotplug area, which is always added and removed periodically, should be able to expose this problem. ''' Answer: At the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is no race between memory hot remove and free_huge_page_vmemmap(). Therefore, HugeTLB pages inside the hot remove section are safe. As for the question "what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ?", the premise does not hold. The minimal granularity of hotplug memory is 128MB (on arm64 with a 4k base page), so any HugeTLB page smaller than 128MB is within a single section; there are then no shared PTE page tables between HugeTLB pages in this section and those in other sections, and a HugeTLB page cannot cross two sections. In this case, the section cannot be freed. Any HugeTLB page bigger than 128MB (the section size) has vmemmap whose size is an integer multiple of 2MB (PMD-mapped). As long as: 1) HugeTLBs are naturally aligned, power-of-two sizes 2) The HugeTLB size >= the section size 3) The HugeTLB size >= the vmemmap leaf mapping size Then a HugeTLB will not share any leaf page table entries with *anything else*, but will share intermediate entries. In this case, at the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is also no race between memory hot remove and free_huge_page_vmemmap(). 2nd concern: ''' differently, not sure if ptdump would require any synchronization. Dumping an wrong value is probably okay but crashing because a page table entry is being freed after ptdump acquired the pointer is bad. On arm64, ptdump() is protected against hotremove via [get|put]_online_mems(). ''' Answer: The ptdump should be fine since vmemmap_remap_free() only exchanges PTEs or splits the PMD entry (which means allocating a PTE page table). Neither operation frees any page tables (PTEs), so ptdump cannot run into a UAF on any page tables. The worst case is just dumping a wrong value.
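The ~14GB/16GB per 1TB figure quoted at the top of this changelog can be sanity-checked with simple arithmetic. The sketch below assumes 4K base pages, a 64-byte struct page, and that 7 of the 8 vmemmap pages of each 2MB HugeTLB page (and essentially all 16MB of a 1GB HugeTLB page's vmemmap) can be freed; these are assumptions describing the typical configuration, not any particular build:

#include <stdio.h>

/* Back-of-the-envelope check of the "~14GB/16GB per 1TB" savings quoted
 * above, under the assumptions stated in the lead-in. */
int main(void)
{
	const unsigned long long TB = 1ULL << 40;
	const unsigned long long base_page = 4096, struct_page = 64;

	/* 2MB HugeTLB: 512 struct pages = 32KB of vmemmap = 8 base pages,
	 * of which 7 are assumed freed. */
	unsigned long long huge_2m = 2ULL << 20;
	unsigned long long saved_2m = (TB / huge_2m) * 7 * base_page;

	/* 1GB HugeTLB: 262144 struct pages = 16MB of vmemmap, nearly all freed. */
	unsigned long long huge_1g = 1ULL << 30;
	unsigned long long saved_1g = (TB / huge_1g) *
				      (huge_1g / base_page * struct_page);

	printf("2MB HugeTLB: ~%lluGB saved per 1TB\n", saved_2m >> 30);	/* ~14 */
	printf("1GB HugeTLB: ~%lluGB saved per 1TB\n", saved_1g >> 30);	/* ~16 */
	return 0;
}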
[1] https://lore.kernel.org/all/20210510030027.56044-1-songmuchun@bytedance.com/ [2] https://lore.kernel.org/all/20210518091826.36937-1-songmuchun@bytedance.com/ [songmuchun@bytedance.com: restructure the code comment inside flush_dcache_page()] Link: https://lkml.kernel.org/r/20220414072646.21910-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220331065640.5777-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Tested-by: Barry Song Cc: Will Deacon Cc: David Hildenbrand Cc: Bodeddula Balasubramaniam Cc: Oscar Salvador Cc: Mike Kravetz Cc: David Rientjes Cc: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Xiongchun Duan Cc: Fam Zheng Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 1 + arch/arm64/mm/flush.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 20ea89d9ac2f..2d7399ab0491 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -96,6 +96,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_NO_INSTR select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index a06c6ac770d4..840d998879ad 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -75,6 +75,20 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_page(struct page *page) { + /* + * Only the head page's flags of HugeTLB can be cleared since the tail + * vmemmap pages associated with each HugeTLB page are mapped with + * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * details can refer to vmemmap_remap_pte()). Although + * __sync_icache_dcache() only set PG_dcache_clean flag on the head + * page struct, there is more than one page struct with PG_dcache_clean + * associated with the HugeTLB page since the head vmemmap page frame + * is reused (more details can refer to the comments above + * page_fixed_fake_head()). + */ + if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + page = compound_head(page); + if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); } -- cgit v1.2.3 From 36c26128b8983d9af58266c36f23209064e22108 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:16:03 -0700 Subject: mm/vmscan: reclaim only affects managed_zones As mentioned in commit 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator") , reclaim only affects managed_zones. Let's adjust the code and comment accordingly. 
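The distinction the patch relies on is between zones that merely have present pages and zones that have pages actually managed by the buddy allocator. The two predicates are paraphrased below (see include/linux/mmzone.h for the real definitions; the helpers here are renamed sketches, not copies):

#include <linux/mmzone.h>

/* Paraphrased sketches of the two predicates: present_pages counts the
 * physical pages spanned by the zone, while managed pages exclude memory
 * reserved at boot, so a zone can be populated yet have nothing for the
 * buddy allocator -- and thus for reclaim -- to work with. */
static inline bool populated_zone_sketch(struct zone *zone)
{
	return zone->present_pages != 0;
}

static inline bool managed_zone_sketch(struct zone *zone)
{
	return zone_managed_pages(zone) != 0;	/* present minus reserved */
}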
Link: https://lkml.kernel.org/r/20220327024101.10378-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: "Huang, Ying" Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1678802e03e7..cd9f222a3e78 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1031,7 +1031,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; reclaimable += zone_reclaimable_pages(zone); @@ -3912,7 +3912,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) } /* - * If a node has no populated zone within highest_zoneidx, it does not + * If a node has no managed zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ -- cgit v1.2.3 From bc53008eea55330f485c956338d3c59f96c70c08 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:16:03 -0700 Subject: mm/vmscan: make sure wakeup_kswapd with managed zone wakeup_kswapd() only wakes up kswapd when the zone is managed. The two callers of wakeup_kswapd() operate from a node perspective: * wake_all_kswapds * numamigrate_isolate_page If we pick up a !managed zone, this is not what we expect. This patch makes sure we pick up a managed zone for wakeup_kswapd(). It also uses managed_zone() in migrate_balanced_pgdat() to get the proper zone. [richard.weiyang@gmail.com: adjust the usage in migrate_balanced_pgdat()] Link: https://lkml.kernel.org/r/20220329010901.1654-2-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20220327024101.10378-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: "Huang, Ying" Reviewed-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Mel Gorman Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/migrate.c | 6 +++--- mm/page_alloc.c | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 6c31ee1e1c9b..d298544940b8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1969,7 +1969,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA - * pages. Currently it only checks the watermarks which crude + * pages. Currently it only checks the watermarks which is crude. */ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, unsigned long nr_migrate_pages) @@ -1979,7 +1979,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; /* Avoid waking kswapd by allocating pages_to_migrate pages.
*/ @@ -2032,7 +2032,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)) return 0; for (z = pgdat->nr_zones - 1; z >= 0; z--) { - if (populated_zone(pgdat->node_zones + z)) + if (managed_zone(pgdat->node_zones + z)) break; } wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8de8d7a5c3e3..d2588d6226e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4629,6 +4629,8 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { + if (!managed_zone(zone)) + continue; if (last_pgdat != zone->zone_pgdat) wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); last_pgdat = zone->zone_pgdat; -- cgit v1.2.3 From 8b3a899abe15e68b8b82ff96c2f4e8c9a37874a0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:16:04 -0700 Subject: mm/vmscan: sc->reclaim_idx must be a valid zone index lruvec_lru_size() is only used in get_scan_count(), so the only possible zone_idx is sc->reclaim_idx. Since sc->reclaim_idx is ensured to be a valid zone idex, we can remove the extra check for zone iteration. Link: https://lkml.kernel.org/r/20220317234624.23358-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cd9f222a3e78..f6570f334a05 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -587,7 +587,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector * @lru: lru to use - * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) + * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -595,7 +595,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, unsigned long size = 0; int zid; - for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { + for (zid = 0; zid <= zone_idx; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; if (!managed_zone(zone)) -- cgit v1.2.3 From 02e458d8d04ed952a0451c094e6498c6829c28e8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:04 -0700 Subject: mm/vmscan: remove obsolete comment in get_scan_count Since commit 1431d4d11abb ("mm: base LRU balancing on an explicit cost model"), the relative value of each set of LRU lists is based on cost model instead of rotated/scanned ratio. Cleanup the relevant comment. Link: https://lkml.kernel.org/r/20220409030245.61211-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index f6570f334a05..0e2f253949fe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2646,9 +2646,7 @@ enum scan_balance { /* * Determine how aggressively the anon and file LRU lists should be - * scanned. The relative value of each set of LRU lists is determined - * by looking at the fraction of the pages scanned we did rotate back - * onto the active list instead of evict. + * scanned. 
* * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan -- cgit v1.2.3 From 5829f7dbae4158f9c946fac42760de848a8f1695 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:04 -0700 Subject: mm/vmscan: fix comment for current_may_throttle Since commit 6d6435811c19 ("remove bdi_congested() and wb_congested() and related functions"), there is no congested backing device check anymore. Correct the comment accordingly. [akpm@linux-foundation.org: tweak grammar] Link: https://lkml.kernel.org/r/20220414120202.30082-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/vmscan.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0e2f253949fe..c5bfbb74fde1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2297,10 +2297,9 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, } /* - * If a kernel thread (such as nfsd for loop-back mounts) services - * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE. - * In that case we should only throttle if the backing device it is - * writing to is congested. In other cases it is safe to throttle. + * If a kernel thread (such as nfsd for loop-back mounts) services a backing + * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case + * we should not throttle. Otherwise it is safe to do so. */ static int current_may_throttle(void) { -- cgit v1.2.3 From b2cb6826b6df2bdf91ae4406fd2ef97da7a9cd35 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:04 -0700 Subject: mm/vmscan: fix comment for isolate_lru_pages Since commit 791b48b64232 ("mm: vmscan: scan until it finds eligible pages"), splicing any skipped pages to the tail of the LRU list won't put the system at risk of premature OOM but will waste lots of cpu cycles. Correct the comment accordingly. Link: https://lkml.kernel.org/r/20220416025231.8082-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c5bfbb74fde1..6e255e2867d2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2117,8 +2117,8 @@ move: * Splice any skipped pages to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX - * scanning would soon rescan the same pages to skip and put the - * system at risk of premature OOM. + * scanning would soon rescan the same pages to skip and waste lots + * of cpu cycles. */ if (!list_empty(&pages_skipped)) { int zid; -- cgit v1.2.3 From c310e06cc4e44b4ec4bc02f0494a7f55cc36c1be Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Thu, 28 Apr 2022 23:16:04 -0700 Subject: fs/proc/task_mmu.c: remove redundant page validation of pte_page pte_page() always returns a valid page, so remove the redundant page validation, as we did in many other places. 
Link: https://lkml.kernel.org/r/20220316025947.328276-1-xianting.tian@linux.alibaba.com Signed-off-by: Xianting Tian Reviewed-by: Chaitanya Kulkarni Cc: Alexey Dobriyan Cc: Yang Shi Cc: Sasha Levin Cc: Miaohe Lin Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f46060eb91b5..a843d13e2e1a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1873,8 +1873,6 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, return 0; page = pte_page(huge_pte); - if (!page) - return 0; md = walk->private; gather_stats(page, md, pte_dirty(huge_pte), 1); -- cgit v1.2.3 From dc3a1f3024b3222e21176249eea38ccb5d562218 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:05 -0700 Subject: mm/z3fold: declare z3fold_mount with __init Patch series "A few cleanup patches for z3fold", v2. This series contains a few patches to simplify the code, remove unneeded code, fix obsolete comment and so on. More details can be found in the respective changelogs. This patch (of 8): z3fold_mount is only called during init. So we should declare it with __init. Link: https://lkml.kernel.org/r/20220308134311.59086-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220308134311.59086-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index b3c0577b8095..e86aafea6599 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -345,7 +345,7 @@ static struct file_system_type z3fold_fs = { }; static struct vfsmount *z3fold_mnt; -static int z3fold_mount(void) +static int __init z3fold_mount(void) { int ret = 0; -- cgit v1.2.3 From 78da57d401f8ebe3c15e8e80047eefa87df69ca3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:05 -0700 Subject: mm/z3fold: remove obsolete comment in z3fold_alloc The highmem pages are supported since commit f1549cb5ab2b ("mm/z3fold.c: allow __GFP_HIGHMEM in z3fold_alloc"). Remove the residual comment. Link: https://lkml.kernel.org/r/20220308134311.59086-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index e86aafea6599..87689f50f709 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1064,9 +1064,6 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) * performed first. If no suitable free region is found, then a new page is * allocated and added to the pool to satisfy the request. * - * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used - * as z3fold pool pages. - * * Return: 0 if success and handle is set, otherwise -EINVAL if the size or * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. -- cgit v1.2.3 From ed0e5dcab3a76782f20189e844e6aeb6110c0706 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:05 -0700 Subject: mm/z3fold: minor clean up for z3fold_free Use put_z3fold_header() to pair with get_z3fold_header. Also fix the wrong comments. Minor readability improvement. 
Link: https://lkml.kernel.org/r/20220308134311.59086-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 87689f50f709..eb89271aea83 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1187,9 +1187,9 @@ headless: * @handle: handle associated with the allocation returned by z3fold_alloc() * * In the case that the z3fold page in which the allocation resides is under - * reclaim, as indicated by the PG_reclaim flag being set, this function - * only sets the first|last_chunks to 0. The page is actually freed - * once both buddies are evicted (see z3fold_reclaim_page() below). + * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function + * only sets the first|middle|last_chunks to 0. The page is actually freed + * once all buddies are evicted (see z3fold_reclaim_page() below). */ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) { @@ -1247,7 +1247,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) } if (page_claimed) { /* the page has not been claimed by us */ - z3fold_page_unlock(zhdr); + put_z3fold_header(zhdr); return; } if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { -- cgit v1.2.3 From 8ea2f86cea6ea9fad665c96431ce634e3284b7d0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:05 -0700 Subject: mm/z3fold: remove unneeded page_mapcount_reset and ClearPagePrivate Page->page_type and PagePrivate are not used in z3fold. We should remove these confusing unneeded operations. The z3fold do these here is due to referring to zsmalloc's migration code which does need these operations. Link: https://lkml.kernel.org/r/20220308134311.59086-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index eb89271aea83..2f848ea45b4d 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -420,7 +420,6 @@ static void free_z3fold_page(struct page *page, bool headless) __ClearPageMovable(page); unlock_page(page); } - ClearPagePrivate(page); __free_page(page); } @@ -1635,7 +1634,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa INIT_LIST_HEAD(&new_zhdr->buddy); new_mapping = page_mapping(page); __ClearPageMovable(page); - ClearPagePrivate(page); get_page(newpage); z3fold_page_lock(new_zhdr); @@ -1655,7 +1653,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); - page_mapcount_reset(page); clear_bit(PAGE_CLAIMED, &page->private); put_page(page); return 0; -- cgit v1.2.3 From a3148b5fea52d86b8d1c2445b09ca1033f29fdf0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:05 -0700 Subject: mm/z3fold: remove confusing local variable l reassignment The local variable l holds the address of unbuddied[i] which won't change after we take the pool lock. Remove it to avoid confusion. Link: https://lkml.kernel.org/r/20220308134311.59086-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 2f848ea45b4d..adc0b3fa4906 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -876,7 +876,6 @@ lookup: /* Re-check under lock. 
*/ spin_lock(&pool->lock); - l = &unbuddied[i]; if (unlikely(zhdr != list_first_entry(READ_ONCE(l), struct z3fold_header, buddy)) || !z3fold_page_trylock(zhdr)) { -- cgit v1.2.3 From 5e36c25b2c1aeee16df21d5e4fef22d7ff2b80cf Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:05 -0700 Subject: mm/z3fold: move decrement of pool->pages_nr into __release_z3fold_page() The z3fold will always do atomic64_dec(&pool->pages_nr) when the __release_z3fold_page() is called. Thus we can move decrement of pool->pages_nr into __release_z3fold_page() to simplify the code. Also we can reduce the size of z3fold.o ~1k. Without this patch: text data bss dec hex filename 15444 1376 8 16828 41bc mm/z3fold.o With this patch: text data bss dec hex filename 15044 1248 8 16300 3fac mm/z3fold.o Link: https://lkml.kernel.org/r/20220308134311.59086-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index adc0b3fa4906..18a697f6fe32 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -520,6 +520,8 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) list_add(&zhdr->buddy, &pool->stale); queue_work(pool->release_wq, &pool->work); spin_unlock(&pool->stale_lock); + + atomic64_dec(&pool->pages_nr); } static void release_z3fold_page(struct kref *ref) @@ -737,13 +739,9 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr) return new_zhdr; out_fail: - if (new_zhdr) { - if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) - atomic64_dec(&pool->pages_nr); - else { - add_to_unbuddied(pool, new_zhdr); - z3fold_page_unlock(new_zhdr); - } + if (new_zhdr && !kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) { + add_to_unbuddied(pool, new_zhdr); + z3fold_page_unlock(new_zhdr); } return NULL; @@ -816,10 +814,8 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); - if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { - atomic64_dec(&pool->pages_nr); + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) return; - } if (test_bit(PAGE_STALE, &page->private) || test_and_set_bit(PAGE_CLAIMED, &page->private)) { @@ -829,9 +825,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) if (!zhdr->foreign_handles && buddy_single(zhdr) && zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { - if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) - atomic64_dec(&pool->pages_nr); - else { + if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } @@ -1089,10 +1083,8 @@ retry: if (zhdr) { bud = get_free_buddy(zhdr, chunks); if (bud == HEADLESS) { - if (kref_put(&zhdr->refcount, + if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) - atomic64_dec(&pool->pages_nr); - else z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); @@ -1239,10 +1231,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) if (!page_claimed) free_handle(handle, zhdr); - if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { - atomic64_dec(&pool->pages_nr); + if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) return; - } if (page_claimed) { /* the page has not been claimed by us */ put_z3fold_header(zhdr); @@ -1353,9 +1343,7 @@ static int 
z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) break; } if (!z3fold_page_trylock(zhdr)) { - if (kref_put(&zhdr->refcount, - release_z3fold_page)) - atomic64_dec(&pool->pages_nr); + kref_put(&zhdr->refcount, release_z3fold_page); zhdr = NULL; continue; /* can't evict at this point */ } @@ -1366,10 +1354,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) */ if (zhdr->foreign_handles || test_and_set_bit(PAGE_CLAIMED, &page->private)) { - if (kref_put(&zhdr->refcount, + if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) - atomic64_dec(&pool->pages_nr); - else z3fold_page_unlock(zhdr); zhdr = NULL; continue; /* can't evict such page */ @@ -1447,7 +1433,6 @@ next: if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { kmem_cache_free(pool->c_handle, slots); - atomic64_dec(&pool->pages_nr); return 0; } /* @@ -1669,10 +1654,8 @@ static void z3fold_page_putback(struct page *page) if (!list_empty(&zhdr->buddy)) list_del_init(&zhdr->buddy); INIT_LIST_HEAD(&page->lru); - if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { - atomic64_dec(&pool->pages_nr); + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) return; - } spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); -- cgit v1.2.3 From 52fb90cc19197f831e5e76e994ae82a29532d89f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:06 -0700 Subject: mm/z3fold: remove redundant list_del_init of zhdr->buddy in z3fold_free do_compact_page() will do list_del_init(&zhdr->buddy) for us. Remove this extra one to save some possible cpu cycles. Link: https://lkml.kernel.org/r/20220308134311.59086-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 18a697f6fe32..867c590df027 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1244,9 +1244,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) return; } if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { - spin_lock(&pool->lock); - list_del_init(&zhdr->buddy); - spin_unlock(&pool->lock); zhdr->cpu = -1; kref_get(&zhdr->refcount); clear_bit(PAGE_CLAIMED, &page->private); -- cgit v1.2.3 From daf79bd8ee1c8187c133c77a82943ae50994bb66 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:06 -0700 Subject: mm/z3fold: remove unneeded PAGE_HEADLESS check in free_handle() The only caller z3fold_free() never calls free_handle() in PAGE_HEADLESS case. Remove this unneeded check. Link: https://lkml.kernel.org/r/20220308134311.59086-9-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 867c590df027..83b5a3514427 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -297,9 +297,6 @@ static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr) int i; bool is_free; - if (handle & (1 << PAGE_HEADLESS)) - return; - if (WARN_ON(*(unsigned long *)handle == 0)) return; -- cgit v1.2.3 From 4af12d04e71c2d0d961c4921602e577d6cc6010e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:06 -0700 Subject: mm: compaction: use helper isolation_suitable() Use helper isolation_suitable() to check whether page is suitable to isolate to simplify the code. Minor readability improvement. 
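The underlying pattern is simply wrapping an open-coded compound condition in a named predicate so the call site reads as intent. A small userspace sketch, loosely modelled on isolation_suitable() with invented types, is shown here; it is illustrative only, not the kernel code.

#include <stdbool.h>
#include <stdio.h>

struct compact_ctl {
        bool ignore_skip_hint;
};

struct pageblock {
        bool skip;
};

/* Named predicate replacing "!cc->ignore_skip_hint && block->skip". */
static bool isolation_suitable(const struct compact_ctl *cc,
                               const struct pageblock *block)
{
        if (cc->ignore_skip_hint)
                return true;
        return !block->skip;
}

int main(void)
{
        struct compact_ctl cc = { .ignore_skip_hint = false };
        struct pageblock block = { .skip = true };

        if (!isolation_suitable(&cc, &block))
                printf("skip this block\n");
        return 0;
}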
Link: https://lkml.kernel.org/r/20220322110750.60311-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Andrew Morton Reviewed-by: Wei Yang Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index fe915db6149b..baf654b5e073 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -899,7 +899,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * not falsely conclude that the block should be skipped. */ if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { - if (!cc->ignore_skip_hint && get_pageblock_skip(page)) { + if (!isolation_suitable(cc, page)) { low_pfn = end_pfn; page = NULL; goto isolate_abort; -- cgit v1.2.3 From da63dc84befaa9e6079a0bc363ff0eaa975f9073 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:06 -0700 Subject: drivers/base/node.c: fix compaction sysfs file leak Compaction sysfs file is created via compaction_register_node in register_node. But we forgot to remove it in unregister_node. Thus compaction sysfs file is leaked. Using compaction_unregister_node to fix this issue. Link: https://lkml.kernel.org/r/20220401070905.43679-1-linmiaohe@huawei.com Fixes: ed4a6d7f0676 ("mm: compaction: add /sys trigger for per-node memory compaction") Signed-off-by: Miaohe Lin Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton --- drivers/base/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index ec8bb24a5a22..0ac6376ef7a1 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -682,6 +682,7 @@ static int register_node(struct node *node, int num) */ void unregister_node(struct node *node) { + compaction_unregister_node(node); hugetlb_unregister_node(node); /* no-op, if memoryless node */ node_remove_accesses(node); node_remove_caches(node); -- cgit v1.2.3 From bc78b5ed9ff2359f8af1454ee98670b1dc79f06e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:06 -0700 Subject: mm/mempolicy: clean up the code logic in queue_pages_pte_range Since commit e5947d23edd8 ("mm: mempolicy: don't have to split pmd for huge zero page"), THP is never splited in queue_pages_pmd. Thus 2 is never returned now. We can remove such unnecessary ret != 2 check and clean up the relevant comment. Minor improvements in readability. Link: https://lkml.kernel.org/r/20220419122234.45083-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Yang Shi Acked-by: David Rientjes Cc: Zi Yan Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/mempolicy.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8c74107a2b15..0288ffaea064 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -440,12 +440,11 @@ static inline bool queue_pages_required(struct page *page, } /* - * queue_pages_pmd() has four possible return values: + * queue_pages_pmd() has three possible return values: * 0 - pages are placed on the right node or queued successfully, or * special page is met, i.e. huge zero page. * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. - * 2 - THP was split. * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an * existing page was already on a node that does not follow the * policy. 
@@ -507,18 +506,13 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - int ret; bool has_unmovable = false; pte_t *pte, *mapped_pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); - if (ptl) { - ret = queue_pages_pmd(pmd, ptl, addr, end, walk); - if (ret != 2) - return ret; - } - /* THP was split, fall through to pte walk */ + if (ptl) + return queue_pages_pmd(pmd, ptl, addr, end, walk); if (pmd_trans_unstable(pmd)) return 0; -- cgit v1.2.3 From 0c2d08728470b93a3f05416f53222f38a89868e2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 28 Apr 2022 23:16:07 -0700 Subject: mm: add selftests for migration entries Add some basic migration tests and in particular tests that will stress both the pte and pmd migration entry wait paths. Link: https://lkml.kernel.org/r/20220324014349.229253-1-apopple@nvidia.com Signed-off-by: Alistair Popple Cc: Hugh Dickins Cc: Jan Kara Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox (Oracle) Cc: Ralph Campbell Cc: Muchun Song Cc: John Hubbard Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 3 + tools/testing/selftests/vm/migration.c | 193 +++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 tools/testing/selftests/vm/migration.c diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 04a49e876a46..ff0c7a87785b 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -41,6 +41,7 @@ TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate TEST_GEN_FILES += memfd_secret +TEST_GEN_FILES += migration TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += mremap_dontunmap @@ -149,6 +150,8 @@ $(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS) $(OUTPUT)/ksm_tests: LDLIBS += -lnuma +$(OUTPUT)/migration: LDLIBS += -lnuma + local_config.mk local_config.h: check_config.sh /bin/sh ./check_config.sh $(CC) diff --git a/tools/testing/selftests/vm/migration.c b/tools/testing/selftests/vm/migration.c new file mode 100644 index 000000000000..1cec8425e3ca --- /dev/null +++ b/tools/testing/selftests/vm/migration.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The main purpose of the tests here is to exercise the migration entry code + * paths in the kernel. 
+ */ + +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#define TWOMEG (2<<20) +#define RUNTIME (60) + +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) + +FIXTURE(migration) +{ + pthread_t *threads; + pid_t *pids; + int nthreads; + int n1; + int n2; +}; + +FIXTURE_SETUP(migration) +{ + int n; + + ASSERT_EQ(numa_available(), 0); + self->nthreads = numa_num_task_cpus() - 1; + self->n1 = -1; + self->n2 = -1; + + for (n = 0; n < numa_max_possible_node(); n++) + if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) { + if (self->n1 == -1) { + self->n1 = n; + } else { + self->n2 = n; + break; + } + } + + self->threads = malloc(self->nthreads * sizeof(*self->threads)); + ASSERT_NE(self->threads, NULL); + self->pids = malloc(self->nthreads * sizeof(*self->pids)); + ASSERT_NE(self->pids, NULL); +}; + +FIXTURE_TEARDOWN(migration) +{ + free(self->threads); + free(self->pids); +} + +int migrate(uint64_t *ptr, int n1, int n2) +{ + int ret, tmp; + int status = 0; + struct timespec ts1, ts2; + + if (clock_gettime(CLOCK_MONOTONIC, &ts1)) + return -1; + + while (1) { + if (clock_gettime(CLOCK_MONOTONIC, &ts2)) + return -1; + + if (ts2.tv_sec - ts1.tv_sec >= RUNTIME) + return 0; + + ret = move_pages(0, 1, (void **) &ptr, &n2, &status, + MPOL_MF_MOVE_ALL); + if (ret) { + if (ret > 0) + printf("Didn't migrate %d pages\n", ret); + else + perror("Couldn't migrate pages"); + return -2; + } + + tmp = n2; + n2 = n1; + n1 = tmp; + } + + return 0; +} + +void *access_mem(void *ptr) +{ + uint64_t y = 0; + volatile uint64_t *x = ptr; + + while (1) { + pthread_testcancel(); + y += *x; + } + + return NULL; +} + +/* + * Basic migration entry testing. One thread will move pages back and forth + * between nodes whilst other threads try and access them triggering the + * migration entry wait paths in the kernel. + */ +TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +/* + * Same as the previous test but with shared memory. + */ +TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) +{ + pid_t pid; + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) { + pid = fork(); + if (!pid) + access_mem(ptr); + else + self->pids[i] = pid; + } + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); +} + +/* + * Tests the pmd migration entry paths. 
+ */ +TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG); + ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0); + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From 5202978b48786703a6cec94596067b3fafd1f734 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:07 -0700 Subject: mm/migration: remove unneeded local variable mapping_locked Patch series "A few cleanup and fixup patches for migration", v2. This series contains a few patches to remove unneeded variables, jump label and use helper to simplify the code. Also we fix some bugs such as page refcounts leak , invalid node access and so on. More details can be found in the respective changelogs. This patch (of 11): When mapping_locked is true, TTU_RMAP_LOCKED is always set to ttu. We can check ttu instead so mapping_locked can be removed. And ttu is either 0 or TTU_RMAP_LOCKED now. Change '|=' to '=' to reflect this. Link: https://lkml.kernel.org/r/20220318111709.60311-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220318111709.60311-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Zi Yan Cc: Baolin Wang Cc: "Huang, Ying" Cc: Muchun Song Cc: Alistair Popple Signed-off-by: Andrew Morton --- mm/migrate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d298544940b8..7d698caf333f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1230,7 +1230,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, goto put_anon; if (page_mapped(hpage)) { - bool mapping_locked = false; enum ttu_flags ttu = 0; if (!PageAnon(hpage)) { @@ -1244,14 +1243,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (unlikely(!mapping)) goto unlock_put_anon; - mapping_locked = true; - ttu |= TTU_RMAP_LOCKED; + ttu = TTU_RMAP_LOCKED; } try_to_migrate(src, ttu); page_was_mapped = 1; - if (mapping_locked) + if (ttu & TTU_RMAP_LOCKED) i_mmap_unlock_write(mapping); } -- cgit v1.2.3 From b75454e10101af3a11b24ca7e14917b6c8874688 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:07 -0700 Subject: mm/migration: remove unneeded local variable page_lru We can use page_is_file_lru() directly to help account the isolated pages to simplify the code a bit. 
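The cleanup relies on the predicate returning 0 or 1, so it can be used directly as a counter offset without a temporary. A userspace sketch of that idiom follows; the enum, struct and helper names are invented for illustration and only mimic the page_is_file_lru() usage.

#include <stdbool.h>
#include <stdio.h>

enum { NR_ISOLATED_ANON, NR_ISOLATED_FILE, NR_COUNTERS };

struct item { bool file_backed; };

/* 0/1 predicate used directly as an index offset. */
static int is_file(const struct item *it)
{
        return it->file_backed ? 1 : 0;
}

static void account_isolated(long counters[NR_COUNTERS],
                             const struct item *it, long nr_pages)
{
        /* No "int lru = is_file(it);" temporary is needed. */
        counters[NR_ISOLATED_ANON + is_file(it)] += nr_pages;
}

int main(void)
{
        long counters[NR_COUNTERS] = { 0, 0 };
        struct item a = { .file_backed = true }, b = { .file_backed = false };

        account_isolated(counters, &a, 512);
        account_isolated(counters, &b, 1);
        printf("anon %ld, file %ld\n", counters[NR_ISOLATED_ANON],
               counters[NR_ISOLATED_FILE]);
        return 0;
}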
Link: https://lkml.kernel.org/r/20220318111709.60311-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alistair Popple Cc: Baolin Wang Cc: "Huang, Ying" Cc: Muchun Song Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 7d698caf333f..a993053cd620 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2013,7 +2013,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page, static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { - int page_lru; int nr_pages = thp_nr_pages(page); int order = compound_order(page); @@ -2040,8 +2039,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) if (isolate_lru_page(page)) return 0; - page_lru = page_is_file_lru(page); - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), nr_pages); /* -- cgit v1.2.3 From cb1c37b1c65d9e5450af2ea6ec8916c5cd23a2e7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:07 -0700 Subject: mm/migration: use helper function vma_lookup() in add_page_for_migration We could use helper function vma_lookup() to lookup the needed vma to simplify the code. Link: https://lkml.kernel.org/r/20220318111709.60311-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alistair Popple Cc: Baolin Wang Cc: "Huang, Ying" Cc: Muchun Song Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index a993053cd620..e73e1bfa874b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1604,8 +1604,8 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, mmap_read_lock(mm); err = -EFAULT; - vma = find_vma(mm, addr); - if (!vma || addr < vma->vm_start || !vma_migratable(vma)) + vma = vma_lookup(mm, addr); + if (!vma || !vma_migratable(vma)) goto out; /* FOLL_DUMP to ignore special (like zero) pages */ -- cgit v1.2.3 From 3eefb826c5a627084eccec788f0236a070988dae Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:07 -0700 Subject: mm/migration: use helper macro min in do_pages_stat We could use helper macro min to help set the chunk_nr to simplify the code. 
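The pattern is ordinary chunked processing: each pass handles min(remaining, chunk size). The kernel's min() rejects arguments of different types, which is why the constant becomes 16UL in the patch; the sketch below is plain userspace C with a local min() macro as a stand-in, not the kernel's macro.

#include <stdio.h>

#define CHUNK_NR 16UL   /* UL so both min() arguments share a type */

#define min(a, b) ((a) < (b) ? (a) : (b))

static void process_in_chunks(unsigned long nr_items)
{
        while (nr_items) {
                unsigned long chunk = min(nr_items, CHUNK_NR);

                printf("processing %lu items\n", chunk);
                nr_items -= chunk;
        }
}

int main(void)
{
        process_in_chunks(37);
        return 0;
}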
Link: https://lkml.kernel.org/r/20220318111709.60311-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Alistair Popple Cc: Baolin Wang Cc: "Huang, Ying" Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index e73e1bfa874b..02448b618c6b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1842,16 +1842,12 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, const void __user * __user *pages, int __user *status) { -#define DO_PAGES_STAT_CHUNK_NR 16 +#define DO_PAGES_STAT_CHUNK_NR 16UL const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; int chunk_status[DO_PAGES_STAT_CHUNK_NR]; while (nr_pages) { - unsigned long chunk_nr; - - chunk_nr = nr_pages; - if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) - chunk_nr = DO_PAGES_STAT_CHUNK_NR; + unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR); if (in_compat_syscall()) { if (get_compat_pages_array(chunk_pages, pages, -- cgit v1.2.3 From 91925ab8cc2a05ab0e524830247e1d66ba4e4e19 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:08 -0700 Subject: mm/migration: avoid unneeded nodemask_t initialization Avoid unneeded next_pass and this_pass initialization as they're always set before using to save possible cpu cycles when there are plenty of nodes in the system. Link: https://lkml.kernel.org/r/20220318111709.60311-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: Baolin Wang Cc: Alistair Popple Cc: "Huang, Ying" Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 02448b618c6b..d9f378b623ea 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2345,8 +2345,8 @@ out_clear: */ static void __set_migration_target_nodes(void) { - nodemask_t next_pass = NODE_MASK_NONE; - nodemask_t this_pass = NODE_MASK_NONE; + nodemask_t next_pass; + nodemask_t this_pass; nodemask_t used_targets = NODE_MASK_NONE; int node, best_distance; -- cgit v1.2.3 From f430893b01e78e0b2e21f9bd1633a778c063993e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:08 -0700 Subject: mm/migration: remove some duplicated codes in migrate_pages Remove the duplicated codes in migrate_pages to simplify the code. Minor readability improvement. No functional change intended. 
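The spirit of the cleanup is to keep a single copy of the shared accounting instead of repeating it in both the THP and non-THP branches. Below is a compact userspace sketch of the consolidated shape; the function is invented for illustration, with counter names borrowed from migrate_pages() only to make the correspondence obvious.

#include <stdbool.h>
#include <stdio.h>

/* One copy of the shared accounting:
 *     if (is_thp) nr_thp_failed++; else if (!no_subpage_counting) nr_failed++;
 *     nr_failed_pages += nr_subpages;                                        */
static void account_failure(bool is_thp, bool no_subpage_counting,
                            unsigned long nr_subpages,
                            unsigned long *nr_thp_failed,
                            unsigned long *nr_failed,
                            unsigned long *nr_failed_pages)
{
        if (is_thp)
                (*nr_thp_failed)++;
        else if (!no_subpage_counting)
                (*nr_failed)++;
        *nr_failed_pages += nr_subpages;
}

int main(void)
{
        unsigned long thp_failed = 0, failed = 0, failed_pages = 0;

        account_failure(true, false, 512, &thp_failed, &failed, &failed_pages);
        account_failure(false, false, 1, &thp_failed, &failed, &failed_pages);
        printf("thp_failed=%lu failed=%lu failed_pages=%lu\n",
               thp_failed, failed, failed_pages);
        return 0;
}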
Link: https://lkml.kernel.org/r/20220318111709.60311-9-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Reviewed-by: Baolin Wang Cc: Alistair Popple Cc: "Huang, Ying" Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/migrate.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d9f378b623ea..161a21314210 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1410,14 +1410,11 @@ retry: nr_thp_split++; goto retry; } - - nr_failed_pages += nr_subpages; - break; - } - /* Hugetlb migration is unsupported */ - if (!no_subpage_counting) + } else if (!no_subpage_counting) { nr_failed++; + } + nr_failed_pages += nr_subpages; break; case -ENOMEM: @@ -1432,28 +1429,22 @@ retry: nr_thp_split++; goto retry; } - - nr_failed_pages += nr_subpages; - goto out; + } else if (!no_subpage_counting) { + nr_failed++; } - if (!no_subpage_counting) - nr_failed++; nr_failed_pages += nr_subpages; goto out; case -EAGAIN: - if (is_thp) { + if (is_thp) thp_retry++; - break; - } - retry++; + else + retry++; break; case MIGRATEPAGE_SUCCESS: nr_succeeded += nr_subpages; - if (is_thp) { + if (is_thp) nr_thp_succeeded++; - break; - } break; default: /* @@ -1462,14 +1453,11 @@ retry: * removed from migration page list and not * retried in the next outer loop. */ - if (is_thp) { + if (is_thp) nr_thp_failed++; - nr_failed_pages += nr_subpages; - break; - } - - if (!no_subpage_counting) + else if (!no_subpage_counting) nr_failed++; + nr_failed_pages += nr_subpages; break; } -- cgit v1.2.3 From 69a041ff505806c95b24b8d5cab43e66aacd91e6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:08 -0700 Subject: mm/migration: fix potential page refcounts leak in migrate_pages In -ENOMEM case, there might be some subpages of fail-to-migrate THPs left in thp_split_pages list. We should move them back to migration list so that they could be put back to the right list by the caller otherwise the page refcnt will be leaked here. Also adjust nr_failed and nr_thp_failed accordingly to make vm events account more accurate. Link: https://lkml.kernel.org/r/20220318111709.60311-10-linmiaohe@huawei.com Fixes: b5bade978e9b ("mm: migrate: fix the return value of migrate_pages()") Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Reviewed-by: "Huang, Ying" Reviewed-by: Baolin Wang Reviewed-by: Muchun Song Cc: Alistair Popple Signed-off-by: Andrew Morton --- mm/migrate.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index 161a21314210..173ba0507049 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1434,6 +1434,14 @@ retry: } nr_failed_pages += nr_subpages; + /* + * There might be some subpages of fail-to-migrate THPs + * left in thp_split_pages list. Move them back to migration + * list so that they could be put back to the right list by + * the caller otherwise the page refcnt will be leaked. + */ + list_splice_init(&thp_split_pages, from); + nr_thp_failed += thp_retry; goto out; case -EAGAIN: if (is_thp) -- cgit v1.2.3 From 3f26c88bd66cd8ab1731763c68df7fe23a7671c0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:08 -0700 Subject: mm/migration: fix potential invalid node access for reclaim-based migration If we failed to setup hotplug state callbacks for mm/demotion:online in some corner cases, node_demotion will be left uninitialized. Invalid node might be returned from the next_demotion_node() when doing reclaim-based migration. 
Use kcalloc to allocate node_demotion to fix the issue. Link: https://lkml.kernel.org/r/20220318111709.60311-11-linmiaohe@huawei.com Fixes: ac16ec835314 ("mm: migrate: support multiple target nodes demotion") Signed-off-by: Miaohe Lin Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: Baolin Wang Cc: Muchun Song Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 173ba0507049..cb3fa7c24081 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2479,9 +2479,9 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, void __init migrate_on_reclaim_init(void) { - node_demotion = kmalloc_array(nr_node_ids, - sizeof(struct demotion_nodes), - GFP_KERNEL); + node_demotion = kcalloc(nr_node_ids, + sizeof(struct demotion_nodes), + GFP_KERNEL); WARN_ON(!node_demotion); hotplug_memory_notifier(migrate_on_reclaim_callback, 100); -- cgit v1.2.3 From 4cd614841c06338a087769ee3cfa96718784d1f5 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:08 -0700 Subject: mm/migration: fix possible do_pages_stat_array racing with memory offline When follow_page peeks a page, the page could be migrated and then be offlined while it's still being used by the do_pages_stat_array(). Use FOLL_GET to hold the page refcnt to fix this potential race. Link: https://lkml.kernel.org/r/20220318111709.60311-12-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: "Huang, Ying" Reviewed-by: Muchun Song Cc: Alistair Popple Cc: Baolin Wang Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index cb3fa7c24081..35f6416d2953 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1796,13 +1796,18 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, goto set_status; /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_DUMP); + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) goto set_status; - err = page ? page_to_nid(page) : -ENOENT; + if (page) { + err = page_to_nid(page); + put_page(page); + } else { + err = -ENOENT; + } set_status: *status = err; -- cgit v1.2.3 From 9c42fe4e30a9b934b1de66c2edca196563221392 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 28 Apr 2022 23:16:09 -0700 Subject: mm: migrate: simplify the refcount validation when migrating hugetlb mapping There is no need to validate the hugetlb page's refcount before trying to freeze the hugetlb page's expected refcount, instead we can just rely on the page_ref_freeze() to simplify the validation. Moreover we are always under the page lock when migrating the hugetlb page mapping, which means nowhere else can remove it from the page cache, so we can remove the xas_load() validation under the i_pages lock. 
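The simplification rests on the fact that a refcount freeze is itself an atomic compare-and-exchange from the expected count to zero, so a separate read-and-compare beforehand adds no safety. A minimal userspace model using C11 atomics is below; ref_freeze() is an invented stand-in for page_ref_freeze(), not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Minimal model of a refcount freeze: atomically replace the expected
 * reference count with 0, failing if anyone else holds a reference. */
static bool ref_freeze(atomic_int *refcount, int expected)
{
        return atomic_compare_exchange_strong(refcount, &expected, 0);
}

int main(void)
{
        atomic_int refs = 2;    /* e.g. the mapping's reference plus ours */

        /* No prior "if (atomic_load(&refs) != 2) fail;" is needed:
         * the compare-exchange performs that check atomically. */
        if (ref_freeze(&refs, 2))
                printf("frozen, safe to migrate\n");
        else
                printf("raced with another user, retry (-EAGAIN)\n");
        return 0;
}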
Link: https://lkml.kernel.org/r/eb2fbbeaef2b1714097b9dec457426d682ee0635.1649676424.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Mike Kravetz Cc: Matthew Wilcox Cc: "Huang, Ying" Cc: Muchun Song Cc: Alistair Popple Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 35f6416d2953..f97d724b8fc0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -471,11 +471,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, xas_lock_irq(&xas); expected_count = 2 + page_has_private(page); - if (page_count(page) != expected_count || xas_load(&xas) != page) { - xas_unlock_irq(&xas); - return -EAGAIN; - } - if (!page_ref_freeze(page, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; -- cgit v1.2.3 From 7d6e2d96384556a4f30547803be1f606eb805a62 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 28 Apr 2022 23:16:09 -0700 Subject: mm: untangle config dependencies for demote-on-reclaim At the time demote-on-reclaim was introduced, it was tied to CONFIG_HOTPLUG_CPU + CONFIG_MIGRATE, but that is not really accurate. The only two things we need to depend on are CONFIG_NUMA + CONFIG_MIGRATE, so clean this up. Furthermore, we only register the hotplug memory notifier when the system has CONFIG_MEMORY_HOTPLUG. Link: https://lkml.kernel.org/r/20220322224016.4574-1-osalvador@suse.de Signed-off-by: Oscar Salvador Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Cc: Dave Hansen Cc: Abhishek Goel Cc: Baolin Wang Signed-off-by: Andrew Morton --- include/linux/migrate.h | 34 +++++++++++++++------------------- mm/migrate.c | 11 ++++++----- mm/vmstat.c | 2 -- 3 files changed, 21 insertions(+), 26 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 90e75d5a54d6..2707bfd43a0d 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -47,17 +47,8 @@ void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); -extern bool numa_demotion_enabled; -extern void migrate_on_reclaim_init(void); -#ifdef CONFIG_HOTPLUG_CPU -extern void set_migration_target_nodes(void); -#else -static inline void set_migration_target_nodes(void) {} -#endif #else -static inline void set_migration_target_nodes(void) {} - static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, @@ -82,9 +73,23 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, return -ENOSYS; } -#define numa_demotion_enabled false #endif /* CONFIG_MIGRATION */ +#if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA) +extern void set_migration_target_nodes(void); +extern void migrate_on_reclaim_init(void); +extern bool numa_demotion_enabled; +extern int next_demotion_node(int node); +#else +static inline void set_migration_target_nodes(void) {} +static inline void migrate_on_reclaim_init(void) {} +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} +#define numa_demotion_enabled false +#endif + #ifdef CONFIG_COMPACTION extern int PageMovable(struct page *page); extern void __SetPageMovable(struct page *page, struct address_space *mapping); @@ -172,15 +177,6 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void 
migrate_vma_finalize(struct migrate_vma *migrate); -int next_demotion_node(int node); - -#else /* CONFIG_MIGRATION disabled: */ - -static inline int next_demotion_node(int node) -{ - return NUMA_NO_NODE; -} - #endif /* CONFIG_MIGRATION */ #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/migrate.c b/mm/migrate.c index f97d724b8fc0..77592288fbc0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2104,7 +2104,6 @@ out: return 0; } #endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_NUMA */ /* * node_demotion[] example: @@ -2238,7 +2237,6 @@ out: return target; } -#if defined(CONFIG_HOTPLUG_CPU) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { @@ -2431,6 +2429,7 @@ void set_migration_target_nodes(void) * __set_migration_target_nodes() can be used as opposed to * set_migration_target_nodes(). */ +#ifdef CONFIG_MEMORY_HOTPLUG static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, unsigned long action, void *_arg) { @@ -2476,6 +2475,7 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, return notifier_from_errno(0); } +#endif void __init migrate_on_reclaim_init(void) { @@ -2483,8 +2483,9 @@ void __init migrate_on_reclaim_init(void) sizeof(struct demotion_nodes), GFP_KERNEL); WARN_ON(!node_demotion); - +#ifdef CONFIG_MEMORY_HOTPLUG hotplug_memory_notifier(migrate_on_reclaim_callback, 100); +#endif /* * At this point, all numa nodes with memory/CPus have their state * properly set, so we can build the demotion order now. @@ -2495,7 +2496,6 @@ void __init migrate_on_reclaim_init(void) set_migration_target_nodes(); cpus_read_unlock(); } -#endif /* CONFIG_HOTPLUG_CPU */ bool numa_demotion_enabled = false; @@ -2556,4 +2556,5 @@ delete_obj: return err; } subsys_initcall(numa_init_sysfs); -#endif +#endif /* CONFIG_SYSFS */ +#endif /* CONFIG_NUMA */ diff --git a/mm/vmstat.c b/mm/vmstat.c index b75b1a64b54c..f2d0dec1062d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2111,9 +2111,7 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif -#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU) migrate_on_reclaim_init(); -#endif #ifdef CONFIG_PROC_FS proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); -- cgit v1.2.3 From f3b9e8cc8b09ba3b41bb068c24a1061e8a70d26f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:09 -0700 Subject: mm/madvise: fix potential pte_unmap_unlock pte error We can't assume pte_offset_map_lock will return same orig_pte value. So it's necessary to reacquire the orig_pte or pte_unmap_unlock will unmap the stale pte. 
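The bug class here is a saved cursor going stale: the pointer handed back by the acquire helper must be refreshed whenever the helper is called again, because the paired release consumes that saved pointer. A toy userspace model follows; map_lock() and unmap_unlock() are invented stand-ins for pte_offset_map_lock()/pte_unmap_unlock(), not real kernel calls.

#include <stdlib.h>

/* Toy acquire/release pair: acquire returns a cursor that the matching
 * release must receive. */
static int *map_lock(size_t n)
{
        return calloc(n, sizeof(int));  /* may differ on every call */
}

static void unmap_unlock(int *cursor)
{
        free(cursor);   /* releasing a stale cursor would be a bug */
}

int main(void)
{
        int *orig = map_lock(8);

        /*
         * Drop and re-take the mapping mid-walk: the saved cursor must be
         * refreshed, i.e. "orig = map_lock(...)", not a bare call whose
         * return value is thrown away.
         */
        unmap_unlock(orig);
        orig = map_lock(8);     /* re-acquire AND update the saved pointer */

        unmap_unlock(orig);     /* pairs with the latest acquire */
        return 0;
}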
Link: https://lkml.kernel.org/r/20220416081416.23304-1-linmiaohe@huawei.com Fixes: 9c276cc65a58 ("mm: introduce MADV_COLD") Fixes: 854e9ed09ded ("mm: support madvise(MADV_FREE)") Signed-off-by: Miaohe Lin Cc: Johannes Weiner Cc: Michal Hocko Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/madvise.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 1873616a37d2..c965fac7b13a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -432,12 +432,12 @@ regular_page: if (split_huge_page(page)) { unlock_page(page); put_page(page); - pte_offset_map_lock(mm, pmd, addr, &ptl); + orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); break; } unlock_page(page); put_page(page); - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte--; addr -= PAGE_SIZE; continue; @@ -648,12 +648,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (split_huge_page(page)) { unlock_page(page); put_page(page); - pte_offset_map_lock(mm, pmd, addr, &ptl); + orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); goto out; } unlock_page(page); put_page(page); - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte--; addr -= PAGE_SIZE; continue; -- cgit v1.2.3 From 7f9c9b607dc2641c77266efe788fab853d27dbd1 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:09 -0700 Subject: mm: rmap: fix cache flush on THP pages Patch series "Fix some bugs related to rmap and dax", v7. Patches 1-2 fix a cache flush bug; because subsequent patches depend on those changes, they are placed in this series. Patches 3-4 are preparation for fixing a dax bug in patch 5. Patch 6 is code cleanup since the previous patch removes the usage of follow_invalidate_pte(). This patch (of 6): flush_cache_page() only removes a PAGE_SIZE-sized range from the cache. However, it does not cover all the pages of a THP; only the head page is flushed. Replace it with flush_cache_range() to fix this issue. At least, no problems have been observed from this, perhaps because few architectures have virtually indexed caches. Link: https://lkml.kernel.org/r/20220403053957.10770-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220403053957.10770-2-songmuchun@bytedance.com Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use page_vma_mapped_walk()") Signed-off-by: Muchun Song Reviewed-by: Yang Shi Reviewed-by: Dan Williams Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Jan Kara Cc: Al Viro Cc: Alistair Popple Cc: Yang Shi Cc: Ralph Campbell Cc: Hugh Dickins Cc: Xiyu Yang Cc: "Kirill A.
Shutemov" Cc: Ross Zwisler Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/rmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index fedb82371efe..c194ec913c68 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -970,7 +970,8 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) continue; - flush_cache_page(vma, address, folio_pfn(folio)); + flush_cache_range(vma, address, + address + HPAGE_PMD_SIZE); entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); -- cgit v1.2.3 From e583b5c472bd23d450e06f148dc1f37be74f7666 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:09 -0700 Subject: dax: fix cache flush on PMD-mapped pages flush_cache_page() only removes a PAGE_SIZE-sized range from the cache. However, it does not cover all the pages of a THP; only the head page is flushed. Replace it with flush_cache_range() to fix this issue. This is just a documentation issue with respect to properly documenting the expected usage of cache flushing before modifying the pmd. However, in practice this is not a problem because DAX is not available on architectures with virtually indexed caches, per: commit d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Link: https://lkml.kernel.org/r/20220403053957.10770-3-songmuchun@bytedance.com Fixes: f729c8c9b24f ("dax: wrprotect pmd_t in dax_mapping_entry_mkclean") Signed-off-by: Muchun Song Reviewed-by: Dan Williams Reviewed-by: Christoph Hellwig Cc: Alistair Popple Cc: Al Viro Cc: Hugh Dickins Cc: Jan Kara Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Ross Zwisler Cc: Xiongchun Duan Cc: Xiyu Yang Cc: Yang Shi Signed-off-by: Andrew Morton --- fs/dax.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 67a08a32fccb..a372304c9695 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -845,7 +845,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) goto unlock_pmd; - flush_cache_page(vma, address, pfn); + flush_cache_range(vma, address, + address + HPAGE_PMD_SIZE); pmd = pmdp_invalidate(vma, address, pmdp); pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); -- cgit v1.2.3 From 6a8e0596f00469c15ec556b9f3624acd2e9a04f9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:10 -0700 Subject: mm: rmap: introduce pfn_mkclean_range() to cleans PTEs page_mkclean_one() is supposed to be used with a pfn that has an associated struct page, but not all pfns (e.g. DAX) have one. Introduce a new function, pfn_mkclean_range(), to clean the PTEs (including PMDs) that map a range of pfns with no struct page associated with them. This helper will be used by the DAX code in the next patch to make pfns clean. Link: https://lkml.kernel.org/r/20220403053957.10770-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alistair Popple Cc: Al Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Hugh Dickins Cc: Jan Kara Cc: "Kirill A.
Shutemov" Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Ross Zwisler Cc: Xiongchun Duan Cc: Xiyu Yang Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 +++ mm/internal.h | 26 +++++++++++++-------- mm/rmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 74 insertions(+), 20 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 17230c458341..8573aae50d96 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -261,6 +261,9 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int folio_mkclean(struct folio *); +int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, + struct vm_area_struct *vma); + void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); /* diff --git a/mm/internal.h b/mm/internal.h index 6293468fd7b7..69a5eabe0943 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -525,26 +525,22 @@ void mlock_page_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* - * At what user virtual address is page expected in vma? - * Returns -EFAULT if all of the page is outside the range of vma. - * If page is a compound head, the entire compound page is considered. + * Return the start of user virtual address at the specific offset within + * a vma. */ static inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, + struct vm_area_struct *vma) { - pgoff_t pgoff; unsigned long address; - VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ - pgoff = page_to_pgoff(page); if (pgoff >= vma->vm_pgoff) { address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address >= vma->vm_end) address = -EFAULT; - } else if (PageHead(page) && - pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) { + } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) { /* Test above avoids possibility of wrap to 0 on 32-bit */ address = vma->vm_start; } else { @@ -553,6 +549,18 @@ vma_address(struct page *page, struct vm_area_struct *vma) return address; } +/* + * Return the start of user virtual address of a page within a vma. + * Returns -EFAULT if all of the page is outside the range of vma. + * If page is a compound head, the entire compound page is considered. + */ +static inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma); +} + /* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. diff --git a/mm/rmap.c b/mm/rmap.c index c194ec913c68..ad509b3a89ee 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -929,12 +929,12 @@ int folio_referenced(struct folio *folio, int is_locked, return pra.referenced; } -static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, - unsigned long address, void *arg) +static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) { - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC); + int cleaned = 0; + struct vm_area_struct *vma = pvmw->vma; struct mmu_notifier_range range; - int *cleaned = arg; + unsigned long address = pvmw->address; /* * We have to assume the worse case ie pmd for invalidation. 
Note that @@ -942,16 +942,16 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma, vma->vm_mm, address, - vma_address_end(&pvmw)); + vma_address_end(pvmw)); mmu_notifier_invalidate_range_start(&range); - while (page_vma_mapped_walk(&pvmw)) { + while (page_vma_mapped_walk(pvmw)) { int ret = 0; - address = pvmw.address; - if (pvmw.pte) { + address = pvmw->address; + if (pvmw->pte) { pte_t entry; - pte_t *pte = pvmw.pte; + pte_t *pte = pvmw->pte; if (!pte_dirty(*pte) && !pte_write(*pte)) continue; @@ -964,7 +964,7 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - pmd_t *pmd = pvmw.pmd; + pmd_t *pmd = pvmw->pmd; pmd_t entry; if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) @@ -991,11 +991,22 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ if (ret) - (*cleaned)++; + cleaned++; } mmu_notifier_invalidate_range_end(&range); + return cleaned; +} + +static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, + unsigned long address, void *arg) +{ + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC); + int *cleaned = arg; + + *cleaned += page_vma_mkclean_one(&pvmw); + return true; } @@ -1032,6 +1043,38 @@ int folio_mkclean(struct folio *folio) } EXPORT_SYMBOL_GPL(folio_mkclean); +/** + * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of + * [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff) + * within the @vma of shared mappings. And since clean PTEs + * should also be readonly, write protects them too. + * @pfn: start pfn. + * @nr_pages: number of physically contiguous pages srarting with @pfn. + * @pgoff: page offset that the @pfn mapped with. + * @vma: vma that @pfn mapped within. + * + * Returns the number of cleaned PTEs (including PMDs). + */ +int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, + struct vm_area_struct *vma) +{ + struct page_vma_mapped_walk pvmw = { + .pfn = pfn, + .nr_pages = nr_pages, + .pgoff = pgoff, + .vma = vma, + .flags = PVMW_SYNC, + }; + + if (invalid_mkclean_vma(vma, NULL)) + return 0; + + pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma); + VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); + + return page_vma_mkclean_one(&pvmw); +} + /** * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma -- cgit v1.2.3 From 6472f6d2f7d90c0e8b32bd13acd45622635b04cc Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:10 -0700 Subject: mm: pvmw: add support for walking devmap pages The devmap pages can not use page_vma_mapped_walk() to check if a huge devmap page is mapped into a vma. Add support for walking huge devmap pages so that DAX can use it in the next patch. Link: https://lkml.kernel.org/r/20220403053957.10770-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alistair Popple Cc: Al Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Hugh Dickins Cc: Jan Kara Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Ross Zwisler Cc: Xiongchun Duan Cc: Xiyu Yang Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/page_vma_mapped.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 14a5cda73dee..c10f839fc410 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -210,16 +210,10 @@ restart: */ pmde = READ_ONCE(*pvmw->pmd); - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { + if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) || + (pmd_present(pmde) && pmd_devmap(pmde))) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); pmde = *pvmw->pmd; - if (likely(pmd_trans_huge(pmde))) { - if (pvmw->flags & PVMW_MIGRATION) - return not_found(pvmw); - if (!check_pmd(pmd_pfn(pmde), pvmw)) - return not_found(pvmw); - return true; - } if (!pmd_present(pmde)) { swp_entry_t entry; @@ -232,6 +226,13 @@ restart: return not_found(pvmw); return true; } + if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) { + if (pvmw->flags & PVMW_MIGRATION) + return not_found(pvmw); + if (!check_pmd(pmd_pfn(pmde), pvmw)) + return not_found(pvmw); + return true; + } /* THP pmd was split under us: handle on pte level */ spin_unlock(pvmw->ptl); pvmw->ptl = NULL; -- cgit v1.2.3 From 06083a0921fd5939223a8bf30e83d5f483d348dc Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:10 -0700 Subject: dax: fix missing writeprotect the pte entry Currently dax_mapping_entry_mkclean() fails to clean and write protect the pte entry within a DAX PMD entry during an *sync operation. This can result in data loss in the following sequence: 1) process A mmap write to DAX PMD, dirtying PMD radix tree entry and making the pmd entry dirty and writeable. 2) process B mmap with the @offset (e.g. 4K) and @length (e.g. 4K) write to the same file, dirtying PMD radix tree entry (already done in 1)) and making the pte entry dirty and writeable. 3) fsync, flushing out PMD data and cleaning the radix tree entry. We currently fail to mark the pte entry as clean and write protected since the vma of process B is not covered in dax_entry_mkclean(). 4) process B writes to the pte. These don't cause any page faults since the pte entry is dirty and writeable. The radix tree entry remains clean. 5) fsync, which fails to flush the dirty PMD data because the radix tree entry was clean. 6) crash - dirty data that should have been fsync'd as part of 5) could still have been in the processor cache, and is lost. Just to use pfn_mkclean_range() to clean the pfns to fix this issue. Link: https://lkml.kernel.org/r/20220403053957.10770-6-songmuchun@bytedance.com Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush") Signed-off-by: Muchun Song Reviewed-by: Christoph Hellwig Cc: Alistair Popple Cc: Al Viro Cc: Dan Williams Cc: Hugh Dickins Cc: Jan Kara Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Ross Zwisler Cc: Xiongchun Duan Cc: Xiyu Yang Cc: Yang Shi Signed-off-by: Andrew Morton --- fs/dax.c | 99 ++++++++-------------------------------------------------------- 1 file changed, 12 insertions(+), 87 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index a372304c9695..1ac12e877f4f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -789,96 +790,12 @@ static void *dax_insert_entry(struct xa_state *xas, return entry; } -static inline -unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) -{ - unsigned long address; - - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - return address; -} - -/* Walk all mappings of a given index of a file and writeprotect them */ -static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, - unsigned long pfn) -{ - struct vm_area_struct *vma; - pte_t pte, *ptep = NULL; - pmd_t *pmdp = NULL; - spinlock_t *ptl; - - i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - struct mmu_notifier_range range; - unsigned long address; - - cond_resched(); - - if (!(vma->vm_flags & VM_SHARED)) - continue; - - address = pgoff_address(index, vma); - - /* - * follow_invalidate_pte() will use the range to call - * mmu_notifier_invalidate_range_start() on our behalf before - * taking any lock. - */ - if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep, - &pmdp, &ptl)) - continue; - - /* - * No need to call mmu_notifier_invalidate_range() as we are - * downgrading page table protection not changing it to point - * to a new page. - * - * See Documentation/vm/mmu_notifier.rst - */ - if (pmdp) { -#ifdef CONFIG_FS_DAX_PMD - pmd_t pmd; - - if (pfn != pmd_pfn(*pmdp)) - goto unlock_pmd; - if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) - goto unlock_pmd; - - flush_cache_range(vma, address, - address + HPAGE_PMD_SIZE); - pmd = pmdp_invalidate(vma, address, pmdp); - pmd = pmd_wrprotect(pmd); - pmd = pmd_mkclean(pmd); - set_pmd_at(vma->vm_mm, address, pmdp, pmd); -unlock_pmd: -#endif - spin_unlock(ptl); - } else { - if (pfn != pte_pfn(*ptep)) - goto unlock_pte; - if (!pte_dirty(*ptep) && !pte_write(*ptep)) - goto unlock_pte; - - flush_cache_page(vma, address, pfn); - pte = ptep_clear_flush(vma, address, ptep); - pte = pte_wrprotect(pte); - pte = pte_mkclean(pte); - set_pte_at(vma->vm_mm, address, ptep, pte); -unlock_pte: - pte_unmap_unlock(ptep, ptl); - } - - mmu_notifier_invalidate_range_end(&range); - } - i_mmap_unlock_read(mapping); -} - static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, struct address_space *mapping, void *entry) { - unsigned long pfn, index, count; + unsigned long pfn, index, count, end; long ret = 0; + struct vm_area_struct *vma; /* * A page got tagged dirty in DAX mapping? 
Something is seriously @@ -936,8 +853,16 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, pfn = dax_to_pfn(entry); count = 1UL << dax_entry_order(entry); index = xas->xa_index & ~(count - 1); + end = index + count - 1; + + /* Walk all mappings of a given index of a file and writeprotect them */ + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) { + pfn_mkclean_range(pfn, count, index, vma); + cond_resched(); + } + i_mmap_unlock_read(mapping); - dax_entry_mkclean(mapping, index, pfn); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. There -- cgit v1.2.3 From 0e5e64c0b0d7bb33a5e971ad17e771cf6e0a1127 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:10 -0700 Subject: mm: simplify follow_invalidate_pte() The only user (DAX) of range and pmdpp parameters of follow_invalidate_pte() is gone, it is safe to remove them and make it static to simlify the code. This is revertant of the following commits: 097963959594 ("mm: add follow_pte_pmd()") a4d1a8852513 ("dax: update to new mmu_notifier semantic") There is only one caller of the follow_invalidate_pte(). So just fold it into follow_pte() and remove it. Link: https://lkml.kernel.org/r/20220403053957.10770-7-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Christoph Hellwig Cc: Alistair Popple Cc: Al Viro Cc: Dan Williams Cc: Hugh Dickins Cc: Jan Kara Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Ross Zwisler Cc: Xiongchun Duan Cc: Xiyu Yang Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 -- mm/memory.c | 81 ++++++++++++++++-------------------------------------- 2 files changed, 23 insertions(+), 61 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d446e834a3e5..16d6b46d2d3e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1845,9 +1845,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, pte_t **ptepp, - pmd_t **pmdpp, spinlock_t **ptlp); int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 76e3af9639d9..8895dd2f2255 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4949,9 +4949,29 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, pte_t **ptepp, - pmd_t **pmdpp, spinlock_t **ptlp) +/** + * follow_pte - look up PTE at a user virtual address + * @mm: the mm_struct of the target address space + * @address: user virtual address + * @ptepp: location to store found PTE + * @ptlp: location to store the lock for the PTE + * + * On a successful return, the pointer to the PTE is stored in @ptepp; + * the corresponding lock is taken and its location is stored in @ptlp. + * The contents of the PTE are only stable until @ptlp is released; + * any further use, if any, must be protected against invalidation + * with MMU notifiers. + * + * Only IO mappings and raw PFN mappings are allowed. 
The mmap semaphore + * should be taken for read. + * + * KVM uses this function. While it is arguably less bad than ``follow_pfn``, + * it is not a good general-purpose API. + * + * Return: zero on success, -ve otherwise. + */ +int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4974,35 +4994,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, pmd = pmd_offset(pud, address); VM_BUG_ON(pmd_trans_huge(*pmd)); - if (pmd_huge(*pmd)) { - if (!pmdpp) - goto out; - - if (range) { - mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, - NULL, mm, address & PMD_MASK, - (address & PMD_MASK) + PMD_SIZE); - mmu_notifier_invalidate_range_start(range); - } - *ptlp = pmd_lock(mm, pmd); - if (pmd_huge(*pmd)) { - *pmdpp = pmd; - return 0; - } - spin_unlock(*ptlp); - if (range) - mmu_notifier_invalidate_range_end(range); - } - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; - if (range) { - mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm, - address & PAGE_MASK, - (address & PAGE_MASK) + PAGE_SIZE); - mmu_notifier_invalidate_range_start(range); - } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!pte_present(*ptep)) goto unlock; @@ -5010,38 +5004,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, return 0; unlock: pte_unmap_unlock(ptep, *ptlp); - if (range) - mmu_notifier_invalidate_range_end(range); out: return -EINVAL; } - -/** - * follow_pte - look up PTE at a user virtual address - * @mm: the mm_struct of the target address space - * @address: user virtual address - * @ptepp: location to store found PTE - * @ptlp: location to store the lock for the PTE - * - * On a successful return, the pointer to the PTE is stored in @ptepp; - * the corresponding lock is taken and its location is stored in @ptlp. - * The contents of the PTE are only stable until @ptlp is released; - * any further use, if any, must be protected against invalidation - * with MMU notifiers. - * - * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore - * should be taken for read. - * - * KVM uses this function. While it is arguably less bad than ``follow_pfn``, - * it is not a good general-purpose API. - * - * Return: zero on success, -ve otherwise. - */ -int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) -{ - return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp); -} EXPORT_SYMBOL_GPL(follow_pte); /** -- cgit v1.2.3 From 62e80f2b5072ed80a41fc6a272e44e8e17fdcf66 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 28 Apr 2022 23:16:10 -0700 Subject: tools/testing/selftests/vm/gup_test.c: clarify error statement Print three possible reasons /sys/kernel/debug/gup_test cannot be opened to help users of this test diagnose failures. 
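A minimal stand-alone sketch of the same errno triage (the path and the three causes come from the hunk below; the messages and exit status are illustrative only, not the test itself):

#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/gup_test", O_RDWR);

	if (fd == -1) {
		switch (errno) {
		case EACCES:
			/* debugfs is only accessible to root by default */
			fprintf(stderr, "please run as root\n");
			break;
		case ENOENT:
			/* tell "debugfs not mounted" apart from "CONFIG_GUP_TEST=n" */
			if (opendir("/sys/kernel/debug") == NULL)
				fprintf(stderr, "mount debugfs at /sys/kernel/debug\n");
			else
				fprintf(stderr, "enable CONFIG_GUP_TEST in the kernel config\n");
			break;
		default:
			perror("open /sys/kernel/debug/gup_test");
			break;
		}
		return 1;
	}
	close(fd);
	return 0;
}
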
Link: https://lkml.kernel.org/r/20220405214809.3351223-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/gup_test.c | 22 +++++++++++++++++++-- tools/testing/selftests/vm/run_vmtests.sh | 33 ++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index cda837a14736..593262555e18 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -1,7 +1,9 @@ #include +#include #include #include #include +#include #include #include #include @@ -9,6 +11,7 @@ #include #include #include "../../../../mm/gup_test.h" +#include "../kselftest.h" #include "util.h" @@ -206,8 +209,23 @@ int main(int argc, char **argv) gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); if (gup_fd == -1) { - perror("open"); - exit(1); + switch (errno) { + case EACCES: + if (getuid()) + printf("Please run this test as root\n"); + break; + case ENOENT: + if (opendir("/sys/kernel/debug") == NULL) { + printf("mount debugfs at /sys/kernel/debug\n"); + break; + } + printf("check if CONFIG_GUP_TEST is enabled in kernel config\n"); + break; + default: + perror("failed to open /sys/kernel/debug/gup_test"); + break; + } + exit(KSFT_SKIP); } p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 352ba00cf26b..8865ff365cc6 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -162,22 +162,32 @@ echo "------------------------------------------------------" echo "running: gup_test -u # get_user_pages_fast() benchmark" echo "------------------------------------------------------" ./gup_test -u -if [ $? -ne 0 ]; then +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else echo "[FAIL]" exitcode=1 -else - echo "[PASS]" fi echo "------------------------------------------------------" echo "running: gup_test -a # pin_user_pages_fast() benchmark" echo "------------------------------------------------------" ./gup_test -a -if [ $? -ne 0 ]; then +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else echo "[FAIL]" exitcode=1 -else - echo "[PASS]" fi echo "------------------------------------------------------------" @@ -185,11 +195,16 @@ echo "# Dump pages 0, 19, and 4096, using pin_user_pages:" echo "running: gup_test -ct -F 0x1 0 19 0x1000 # dump_page() test" echo "------------------------------------------------------------" ./gup_test -ct -F 0x1 0 19 0x1000 -if [ $? -ne 0 ]; then +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else echo "[FAIL]" exitcode=1 -else - echo "[PASS]" fi echo "-------------------" -- cgit v1.2.3 From 642bc52aed9c99e8c9c9cfb6781f77719717a36c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 28 Apr 2022 23:16:11 -0700 Subject: selftests: vm: bring common functions to a new file Bring common functions to a new file while keeping code as much same as possible. These functions can be used in the new tests. This helps in avoiding code duplication. 
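A short usage sketch of the shared helpers, assuming the vm_util.h interface added below, the usual kselftest headers and CONFIG_MEM_SOFT_DIRTY; this is an illustrative test body to be called from a test's main(), not part of the patch:

#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include "../kselftest.h"
#include "vm_util.h"

void softdirty_smoke(void)
{
	int pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	int pagesize = getpagesize();
	char *page = aligned_alloc(pagesize, pagesize);

	if (pagemap_fd < 0 || !page)
		ksft_exit_fail_msg("setup failed\n");

	clear_softdirty();	/* shared helper, previously private to madv_populate.c */
	page[0] = 1;		/* fault in and dirty the page */
	ksft_test_result(pagemap_is_softdirty(pagemap_fd, page) == 1,
			 "soft-dirty bit set after write\n");

	free(page);
	close(pagemap_fd);
}
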
Link: https://lkml.kernel.org/r/20220420084036.4101604-1-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Acked-by: David Hildenbrand Cc: Gabriel Krisman Bertazi Cc: Shuah Khan Cc: Will Deacon Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 7 +- tools/testing/selftests/vm/madv_populate.c | 34 +------ tools/testing/selftests/vm/split_huge_page_test.c | 79 +--------------- tools/testing/selftests/vm/vm_util.c | 108 ++++++++++++++++++++++ tools/testing/selftests/vm/vm_util.h | 9 ++ 5 files changed, 124 insertions(+), 113 deletions(-) create mode 100644 tools/testing/selftests/vm/vm_util.c create mode 100644 tools/testing/selftests/vm/vm_util.h diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index ff0c7a87785b..6fd967839ccd 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -36,7 +36,7 @@ TEST_GEN_FILES += hugepage-mremap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += hugepage-vmemmap TEST_GEN_FILES += khugepaged -TEST_GEN_FILES += madv_populate +TEST_GEN_PROGS = madv_populate TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate @@ -50,7 +50,7 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd -TEST_GEN_FILES += split_huge_page_test +TEST_GEN_PROGS += split_huge_page_test TEST_GEN_FILES += ksm_tests ifeq ($(MACHINE),x86_64) @@ -94,6 +94,9 @@ TEST_FILES := test_vmalloc.sh KSFT_KHDR_INSTALL := 1 include ../lib.mk +$(OUTPUT)/madv_populate: vm_util.c +$(OUTPUT)/split_huge_page_test: vm_util.c + ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c index 3ee0e8275600..715a42e8e2cd 100644 --- a/tools/testing/selftests/vm/madv_populate.c +++ b/tools/testing/selftests/vm/madv_populate.c @@ -18,6 +18,7 @@ #include #include "../kselftest.h" +#include "vm_util.h" /* * For now, we're using 2 MiB of private anonymous memory for all tests. 
@@ -26,18 +27,6 @@ static size_t pagesize; -static uint64_t pagemap_get_entry(int fd, char *start) -{ - const unsigned long pfn = (unsigned long)start / pagesize; - uint64_t entry; - int ret; - - ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry)); - if (ret != sizeof(entry)) - ksft_exit_fail_msg("reading pagemap failed\n"); - return entry; -} - static bool pagemap_is_populated(int fd, char *start) { uint64_t entry = pagemap_get_entry(fd, start); @@ -46,13 +35,6 @@ static bool pagemap_is_populated(int fd, char *start) return entry & 0xc000000000000000ull; } -static bool pagemap_is_softdirty(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - return entry & 0x0080000000000000ull; -} - static void sense_support(void) { char *addr; @@ -258,20 +240,6 @@ static bool range_is_not_softdirty(char *start, ssize_t size) return ret; } -static void clear_softdirty(void) -{ - int fd = open("/proc/self/clear_refs", O_WRONLY); - const char *ctrl = "4"; - int ret; - - if (fd < 0) - ksft_exit_fail_msg("opening clear_refs failed\n"); - ret = write(fd, ctrl, strlen(ctrl)); - if (ret != strlen(ctrl)) - ksft_exit_fail_msg("writing clear_refs failed\n"); - close(fd); -} - static void test_softdirty(void) { char *addr; diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 52497b7b9f1d..6aa2b8253aed 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -16,14 +16,13 @@ #include #include #include +#include "vm_util.h" uint64_t pagesize; unsigned int pageshift; uint64_t pmd_pagesize; -#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" #define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" -#define SMAP_PATH "/proc/self/smaps" #define INPUT_MAX 80 #define PID_FMT "%d,0x%lx,0x%lx" @@ -51,30 +50,6 @@ int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) return 0; } - -static uint64_t read_pmd_pagesize(void) -{ - int fd; - char buf[20]; - ssize_t num_read; - - fd = open(PMD_SIZE_PATH, O_RDONLY); - if (fd == -1) { - perror("Open hpage_pmd_size failed"); - exit(EXIT_FAILURE); - } - num_read = read(fd, buf, 19); - if (num_read < 1) { - close(fd); - perror("Read hpage_pmd_size failed"); - exit(EXIT_FAILURE); - } - buf[num_read] = '\0'; - close(fd); - - return strtoul(buf, NULL, 10); -} - static int write_file(const char *path, const char *buf, size_t buflen) { int fd; @@ -113,58 +88,6 @@ static void write_debugfs(const char *fmt, ...) } } -#define MAX_LINE_LENGTH 500 - -static bool check_for_pattern(FILE *fp, const char *pattern, char *buf) -{ - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { - if (!strncmp(buf, pattern, strlen(pattern))) - return true; - } - return false; -} - -static uint64_t check_huge(void *addr) -{ - uint64_t thp = 0; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - - - fp = fopen(SMAP_PATH, "r"); - if (!fp) { - printf("%s: Failed to open file %s\n", __func__, SMAP_PATH); - exit(EXIT_FAILURE); - } - if (!check_for_pattern(fp, addr_pattern, buffer)) - goto err_out; - - /* - * Fetch the AnonHugePages: in the same block and check the number of - * hugepages. 
- */ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) - goto err_out; - - if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) { - printf("Reading smap error\n"); - exit(EXIT_FAILURE); - } - -err_out: - fclose(fp); - return thp; -} - void split_pmd_thp(void) { char *one_page; diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c new file mode 100644 index 000000000000..b58ab11a7a30 --- /dev/null +++ b/tools/testing/selftests/vm/vm_util.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../kselftest.h" +#include "vm_util.h" + +#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +#define SMAP_FILE_PATH "/proc/self/smaps" +#define MAX_LINE_LENGTH 500 + +uint64_t pagemap_get_entry(int fd, char *start) +{ + const unsigned long pfn = (unsigned long)start / getpagesize(); + uint64_t entry; + int ret; + + ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry)); + if (ret != sizeof(entry)) + ksft_exit_fail_msg("reading pagemap failed\n"); + return entry; +} + +bool pagemap_is_softdirty(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + // Check if dirty bit (55th bit) is set + return entry & 0x0080000000000000ull; +} + +void clear_softdirty(void) +{ + int ret; + const char *ctrl = "4"; + int fd = open("/proc/self/clear_refs", O_WRONLY); + + if (fd < 0) + ksft_exit_fail_msg("opening clear_refs failed\n"); + ret = write(fd, ctrl, strlen(ctrl)); + close(fd); + if (ret != strlen(ctrl)) + ksft_exit_fail_msg("writing clear_refs failed\n"); +} + +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf) +{ + while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +uint64_t read_pmd_pagesize(void) +{ + int fd; + char buf[20]; + ssize_t num_read; + + fd = open(PMD_SIZE_FILE_PATH, O_RDONLY); + if (fd == -1) + ksft_exit_fail_msg("Open hpage_pmd_size failed\n"); + + num_read = read(fd, buf, 19); + if (num_read < 1) { + close(fd); + ksft_exit_fail_msg("Read hpage_pmd_size failed\n"); + } + buf[num_read] = '\0'; + close(fd); + + return strtoul(buf, NULL, 10); +} + +uint64_t check_huge(void *addr) +{ + uint64_t thp = 0; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) + ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); + + fp = fopen(SMAP_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); + + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + /* + * Fetch the AnonHugePages: in the same block and check the number of + * hugepages. 
+ */ + if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + goto err_out; + + if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) + ksft_exit_fail_msg("Reading smap error\n"); + +err_out: + fclose(fp); + return thp; +} diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h new file mode 100644 index 000000000000..2e512bd57ae1 --- /dev/null +++ b/tools/testing/selftests/vm/vm_util.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include + +uint64_t pagemap_get_entry(int fd, char *start); +bool pagemap_is_softdirty(int fd, char *start); +void clear_softdirty(void); +uint64_t read_pmd_pagesize(void); +uint64_t check_huge(void *addr); -- cgit v1.2.3 From 9f3265db6ae87de27a5e382410b8eb9af53b161e Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 28 Apr 2022 23:16:11 -0700 Subject: selftests: vm: add test for Soft-Dirty PTE bit This introduces three tests: 1) Sanity check soft dirty basic semantics: allocate area, clean, dirty, check if the SD bit is flipped. 2) Check VMA reuse: validate the VM_SOFTDIRTY usage 3) Check soft-dirty on huge pages This was motivated by Will Deacon's fix commit 912efa17e512 ("mm: proc: Invalidate TLB after clearing soft-dirty page state"). I was tracking the same issue that he fixed, and this test would have caught it. Link: https://lkml.kernel.org/r/20220420084036.4101604-2-usama.anjum@collabora.com Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Muhammad Usama Anjum Co-developed-by: Muhammad Usama Anjum Cc: Will Deacon Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 2 + tools/testing/selftests/vm/config | 2 + tools/testing/selftests/vm/soft-dirty.c | 145 ++++++++++++++++++++++++++++++++ 4 files changed, 150 insertions(+) create mode 100644 tools/testing/selftests/vm/soft-dirty.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index d7507f3c7c76..3cb4fa771ec2 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -29,5 +29,6 @@ write_to_hugetlbfs hmm-tests memfd_secret local_config.* +soft-dirty split_huge_page_test ksm_tests diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 6fd967839ccd..f1228370e99b 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -50,6 +50,7 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd +TEST_GEN_PROGS += soft-dirty TEST_GEN_PROGS += split_huge_page_test TEST_GEN_FILES += ksm_tests @@ -95,6 +96,7 @@ KSFT_KHDR_INSTALL := 1 include ../lib.mk $(OUTPUT)/madv_populate: vm_util.c +$(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c ifeq ($(MACHINE),x86_64) diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config index 60e82da0de85..be087c4bc396 100644 --- a/tools/testing/selftests/vm/config +++ b/tools/testing/selftests/vm/config @@ -4,3 +4,5 @@ CONFIG_TEST_VMALLOC=m CONFIG_DEVICE_PRIVATE=y CONFIG_TEST_HMM=m CONFIG_GUP_TEST=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_MEM_SOFT_DIRTY=y diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c new file mode 100644 index 000000000000..08ab62a4a9d0 --- /dev/null +++ b/tools/testing/selftests/vm/soft-dirty.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include 
+#include +#include +#include "../kselftest.h" +#include "vm_util.h" + +#define PAGEMAP_FILE_PATH "/proc/self/pagemap" +#define TEST_ITERATIONS 10000 + +static void test_simple(int pagemap_fd, int pagesize) +{ + int i; + char *map; + + map = aligned_alloc(pagesize, pagesize); + if (!map) + ksft_exit_fail_msg("mmap failed\n"); + + clear_softdirty(); + + for (i = 0 ; i < TEST_ITERATIONS; i++) { + if (pagemap_is_softdirty(pagemap_fd, map) == 1) { + ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + clear_softdirty(); + // Write something to the page to get the dirty bit enabled on the page + map[0]++; + + if (pagemap_is_softdirty(pagemap_fd, map) == 0) { + ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); + break; + } + + clear_softdirty(); + } + free(map); + + ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); +} + +static void test_vma_reuse(int pagemap_fd, int pagesize) +{ + char *map, *map2; + + map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed"); + + // The kernel always marks new regions as soft dirty + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s dirty bit of allocated page\n", __func__); + + clear_softdirty(); + munmap(map, pagesize); + + map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed"); + + // Dirty bit is set for new regions even if they are reused + if (map == map2) + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s dirty bit of reused address page\n", __func__); + else + ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__); + + munmap(map2, pagesize); +} + +static void test_hugepage(int pagemap_fd, int pagesize) +{ + char *map; + int i, ret; + size_t hpage_len = read_pmd_pagesize(); + + map = memalign(hpage_len, hpage_len); + if (!map) + ksft_exit_fail_msg("memalign failed\n"); + + ret = madvise(map, hpage_len, MADV_HUGEPAGE); + if (ret) + ksft_exit_fail_msg("madvise failed %d\n", ret); + + for (i = 0; i < hpage_len; i++) + map[i] = (char)i; + + if (check_huge(map)) { + ksft_test_result_pass("Test %s huge page allocation\n", __func__); + + clear_softdirty(); + for (i = 0 ; i < TEST_ITERATIONS ; i++) { + if (pagemap_is_softdirty(pagemap_fd, map) == 1) { + ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + clear_softdirty(); + // Write something to the page to get the dirty bit enabled on the page + map[0]++; + + if (pagemap_is_softdirty(pagemap_fd, map) == 0) { + ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); + break; + } + clear_softdirty(); + } + + ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__); + } else { + // hugepage allocation failed. 
skip these tests + ksft_test_result_skip("Test %s huge page allocation\n", __func__); + ksft_test_result_skip("Test %s huge page dirty bit\n", __func__); + } + free(map); +} + +int main(int argc, char **argv) +{ + int pagemap_fd; + int pagesize; + + ksft_print_header(); + ksft_set_plan(5); + + pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); + + pagesize = getpagesize(); + + test_simple(pagemap_fd, pagesize); + test_vma_reuse(pagemap_fd, pagesize); + test_hugepage(pagemap_fd, pagesize); + + close(pagemap_fd); + + return ksft_exit_pass(); +} -- cgit v1.2.3 From b67bd551201a3e2c7e1def84980e9b2f0b3a3c77 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Thu, 28 Apr 2022 23:16:11 -0700 Subject: selftests: vm: refactor run_vmtests.sh to reduce boilerplate Previously, each test printed out its own header, dealt with its own return code, etc. By just putting this standard stuff in a function, we can delete > 300 lines from the script. This also makes adding future tests easier. And, it gets rid of various inconsistencies that already exist: - Some tests correctly deal with ksft_skip, but others don't. - Some tests just print the executable name, others print arguments, and yet others print some comment in the header. - Most tests print out a header with two separator lines, but not the HMM smoke test or the memfd_secret test, which only print one. - We had a redundant "exit" at the end, with all the boilerplate it's an easy oversight. Link: https://lkml.kernel.org/r/20220421224928.1848230-1-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/run_vmtests.sh | 479 ++++-------------------------- 1 file changed, 64 insertions(+), 415 deletions(-) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 8865ff365cc6..2d5a3da42cbe 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -66,467 +66,116 @@ fi VADDR64=0 echo "$ARCH64STR" | grep $ARCH && VADDR64=1 +# Usage: run_test [test binary] [arbitrary test arguments...] +run_test() { + local title="running $*" + local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) + printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" + + "$@" + local ret=$? + if [ $ret -eq 0 ]; then + echo "[PASS]" + elif [ $ret -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip + else + echo "[FAIL]" + exitcode=1 + fi +} + mkdir $mnt mount -t hugetlbfs none $mnt -echo "---------------------" -echo "running hugepage-mmap" -echo "---------------------" -./hugepage-mmap -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./hugepage-mmap shmmax=`cat /proc/sys/kernel/shmmax` shmall=`cat /proc/sys/kernel/shmall` echo 268435456 > /proc/sys/kernel/shmmax echo 4194304 > /proc/sys/kernel/shmall -echo "--------------------" -echo "running hugepage-shm" -echo "--------------------" -./hugepage-shm -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./hugepage-shm echo $shmmax > /proc/sys/kernel/shmmax echo $shmall > /proc/sys/kernel/shmall -echo "-------------------" -echo "running map_hugetlb" -echo "-------------------" -./map_hugetlb -if [ $? 
-ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./map_hugetlb -echo "-----------------------" -echo "running hugepage-mremap" -echo "-----------------------" -./hugepage-mremap $mnt/huge_mremap -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./hugepage-mremap $mnt/huge_mremap rm -f $mnt/huge_mremap -echo "------------------------" -echo "running hugepage-vmemmap" -echo "------------------------" -./hugepage-vmemmap -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./hugepage-vmemmap -echo "-----------------------" -echo "running hugetlb-madvise" -echo "-----------------------" -./hugetlb-madvise $mnt/madvise-test -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./hugetlb-madvise $mnt/madvise-test rm -f $mnt/madvise-test echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing." -echo "---------------------------" -echo "running map_fixed_noreplace" -echo "---------------------------" -./map_fixed_noreplace -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi - -echo "------------------------------------------------------" -echo "running: gup_test -u # get_user_pages_fast() benchmark" -echo "------------------------------------------------------" -./gup_test -u -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi +run_test ./map_fixed_noreplace -echo "------------------------------------------------------" -echo "running: gup_test -a # pin_user_pages_fast() benchmark" -echo "------------------------------------------------------" -./gup_test -a -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi +# get_user_pages_fast() benchmark +run_test ./gup_test -u +# pin_user_pages_fast() benchmark +run_test ./gup_test -a +# Dump pages 0, 19, and 4096, using pin_user_pages: +run_test ./gup_test -ct -F 0x1 0 19 0x1000 -echo "------------------------------------------------------------" -echo "# Dump pages 0, 19, and 4096, using pin_user_pages:" -echo "running: gup_test -ct -F 0x1 0 19 0x1000 # dump_page() test" -echo "------------------------------------------------------------" -./gup_test -ct -F 0x1 0 19 0x1000 -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi - -echo "-------------------" -echo "running userfaultfd" -echo "-------------------" -./userfaultfd anon 20 16 -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi - -echo "---------------------------" -echo "running userfaultfd_hugetlb" -echo "---------------------------" +run_test ./userfaultfd anon 20 16 # Test requires source and destination huge pages. Size of source # (half_ufd_size_MB) is passed as argument to test. -./userfaultfd hugetlb $half_ufd_size_MB 32 -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi - -echo "-------------------------" -echo "running userfaultfd_shmem" -echo "-------------------------" -./userfaultfd shmem 20 16 -if [ $? 
-ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./userfaultfd hugetlb $half_ufd_size_MB 32 +run_test ./userfaultfd shmem 20 16 #cleanup umount $mnt rm -rf $mnt echo $nr_hugepgs > /proc/sys/vm/nr_hugepages -echo "-----------------------" -echo "running compaction_test" -echo "-----------------------" -./compaction_test -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./compaction_test -echo "----------------------" -echo "running on-fault-limit" -echo "----------------------" -sudo -u nobody ./on-fault-limit -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test sudo -u nobody ./on-fault-limit -echo "--------------------" -echo "running map_populate" -echo "--------------------" -./map_populate -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./map_populate -echo "-------------------------" -echo "running mlock-random-test" -echo "-------------------------" -./mlock-random-test -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./mlock-random-test -echo "--------------------" -echo "running mlock2-tests" -echo "--------------------" -./mlock2-tests -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./mlock2-tests -echo "-------------------" -echo "running mremap_test" -echo "-------------------" -./mremap_test -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi +run_test ./mremap_test -echo "-----------------" -echo "running thuge-gen" -echo "-----------------" -./thuge-gen -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi +run_test ./thuge-gen if [ $VADDR64 -ne 0 ]; then -echo "-----------------------------" -echo "running virtual_address_range" -echo "-----------------------------" -./virtual_address_range -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi + run_test ./virtual_address_range -echo "-----------------------------" -echo "running virtual address 128TB switch test" -echo "-----------------------------" -./va_128TBswitch -if [ $? -ne 0 ]; then - echo "[FAIL]" - exitcode=1 -else - echo "[PASS]" -fi + # virtual address 128TB switch test + run_test ./va_128TBswitch fi # VADDR64 -echo "------------------------------------" -echo "running vmalloc stability smoke test" -echo "------------------------------------" -./test_vmalloc.sh smoke -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi - -echo "------------------------------------" -echo "running MREMAP_DONTUNMAP smoke test" -echo "------------------------------------" -./mremap_dontunmap -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi - -echo "running HMM smoke test" -echo "------------------------------------" -./test_hmm.sh smoke -ret_val=$? 
- -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi - -echo "--------------------------------------------------------" -echo "running MADV_POPULATE_READ and MADV_POPULATE_WRITE tests" -echo "--------------------------------------------------------" -./madv_populate -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi - -echo "running memfd_secret test" -echo "------------------------------------" -./memfd_secret -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi - -echo "-------------------------------------------------------" -echo "running KSM MADV_MERGEABLE test with 10 identical pages" -echo "-------------------------------------------------------" -./ksm_tests -M -p 10 -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi - -echo "------------------------" -echo "running KSM unmerge test" -echo "------------------------" -./ksm_tests -U -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi +# vmalloc stability smoke test +run_test ./test_vmalloc.sh smoke -echo "----------------------------------------------------------" -echo "running KSM test with 10 zero pages and use_zero_pages = 0" -echo "----------------------------------------------------------" -./ksm_tests -Z -p 10 -z 0 -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi +run_test ./mremap_dontunmap -echo "----------------------------------------------------------" -echo "running KSM test with 10 zero pages and use_zero_pages = 1" -echo "----------------------------------------------------------" -./ksm_tests -Z -p 10 -z 1 -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi +run_test ./test_hmm.sh smoke -echo "-------------------------------------------------------------" -echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 1" -echo "-------------------------------------------------------------" -./ksm_tests -N -m 1 -ret_val=$? - -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi +# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests +run_test ./madv_populate -echo "-------------------------------------------------------------" -echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 0" -echo "-------------------------------------------------------------" -./ksm_tests -N -m 0 -ret_val=$? 
- -if [ $ret_val -eq 0 ]; then - echo "[PASS]" -elif [ $ret_val -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip -else - echo "[FAIL]" - exitcode=1 -fi +run_test ./memfd_secret -exit $exitcode +# KSM MADV_MERGEABLE test with 10 identical pages +run_test ./ksm_tests -M -p 10 +# KSM unmerge test +run_test ./ksm_tests -U +# KSM test with 10 zero pages and use_zero_pages = 0 +run_test ./ksm_tests -Z -p 10 -z 0 +# KSM test with 10 zero pages and use_zero_pages = 1 +run_test ./ksm_tests -Z -p 10 -z 1 +# KSM test with 2 NUMA nodes and merge_across_nodes = 1 +run_test ./ksm_tests -N -m 1 +# KSM test with 2 NUMA nodes and merge_across_nodes = 0 +run_test ./ksm_tests -N -m 0 exit $exitcode -- cgit v1.2.3 From 241ec63a9a0fbb39292ea1dd2d07f8dabedfe3df Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Thu, 28 Apr 2022 23:16:11 -0700 Subject: selftests: vm: fix shellcheck warnings in run_vmtests.sh These might not be issues yet, but they make the script more fragile. Also by fixing them we give a better example to future readers, who might copy/paste or otherwise re-use snippets from our script. - Use "read -r", since we don't ever want read to be interpreting '\' characters as escape sequences... - Quote variables, to deal with spaces properly. - Use $() instead of the older and harder-to-nest ``. - Get rid of superfluous "$" prefixes inside arithmetic $(()). Link: https://lkml.kernel.org/r/20220421224928.1848230-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/run_vmtests.sh | 55 +++++++++++++++---------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 2d5a3da42cbe..a2302b5faaf2 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -9,12 +9,12 @@ mnt=./huge exitcode=0 #get huge pagesize and freepages from /proc/meminfo -while read name size unit; do +while read -r name size unit; do if [ "$name" = "HugePages_Free:" ]; then - freepgs=$size + freepgs="$size" fi if [ "$name" = "Hugepagesize:" ]; then - hpgsize_KB=$size + hpgsize_KB="$size" fi done < /proc/meminfo @@ -30,27 +30,26 @@ needmem_KB=$((half_ufd_size_MB * 2 * 1024)) #set proper nr_hugepages if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then - nr_hugepgs=`cat /proc/sys/vm/nr_hugepages` + nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) needpgs=$((needmem_KB / hpgsize_KB)) tries=2 - while [ $tries -gt 0 ] && [ $freepgs -lt $needpgs ]; do - lackpgs=$(( $needpgs - $freepgs )) + while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do + lackpgs=$((needpgs - freepgs)) echo 3 > /proc/sys/vm/drop_caches - echo $(( $lackpgs + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages - if [ $? -ne 0 ]; then + if ! 
echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then echo "Please run this test as root" exit $ksft_skip fi - while read name size unit; do + while read -r name size unit; do if [ "$name" = "HugePages_Free:" ]; then freepgs=$size fi done < /proc/meminfo tries=$((tries - 1)) done - if [ $freepgs -lt $needpgs ]; then + if [ "$freepgs" -lt "$needpgs" ]; then printf "Not enough huge pages available (%d < %d)\n" \ - $freepgs $needpgs + "$freepgs" "$needpgs" exit 1 fi else @@ -60,11 +59,11 @@ fi #filter 64bit architectures ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64" -if [ -z $ARCH ]; then - ARCH=`uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/'` +if [ -z "$ARCH" ]; then + ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') fi VADDR64=0 -echo "$ARCH64STR" | grep $ARCH && VADDR64=1 +echo "$ARCH64STR" | grep "$ARCH" && VADDR64=1 # Usage: run_test [test binary] [arbitrary test arguments...] run_test() { @@ -85,28 +84,28 @@ run_test() { fi } -mkdir $mnt -mount -t hugetlbfs none $mnt +mkdir "$mnt" +mount -t hugetlbfs none "$mnt" run_test ./hugepage-mmap -shmmax=`cat /proc/sys/kernel/shmmax` -shmall=`cat /proc/sys/kernel/shmall` +shmmax=$(cat /proc/sys/kernel/shmmax) +shmall=$(cat /proc/sys/kernel/shmall) echo 268435456 > /proc/sys/kernel/shmmax echo 4194304 > /proc/sys/kernel/shmall run_test ./hugepage-shm -echo $shmmax > /proc/sys/kernel/shmmax -echo $shmall > /proc/sys/kernel/shmall +echo "$shmmax" > /proc/sys/kernel/shmmax +echo "$shmall" > /proc/sys/kernel/shmall run_test ./map_hugetlb -run_test ./hugepage-mremap $mnt/huge_mremap -rm -f $mnt/huge_mremap +run_test ./hugepage-mremap "$mnt"/huge_mremap +rm -f "$mnt"/huge_mremap run_test ./hugepage-vmemmap -run_test ./hugetlb-madvise $mnt/madvise-test -rm -f $mnt/madvise-test +run_test ./hugetlb-madvise "$mnt"/madvise-test +rm -f "$mnt"/madvise-test echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" @@ -124,13 +123,13 @@ run_test ./gup_test -ct -F 0x1 0 19 0x1000 run_test ./userfaultfd anon 20 16 # Test requires source and destination huge pages. Size of source # (half_ufd_size_MB) is passed as argument to test. -run_test ./userfaultfd hugetlb $half_ufd_size_MB 32 +run_test ./userfaultfd hugetlb "$half_ufd_size_MB" 32 run_test ./userfaultfd shmem 20 16 #cleanup -umount $mnt -rm -rf $mnt -echo $nr_hugepgs > /proc/sys/vm/nr_hugepages +umount "$mnt" +rm -rf "$mnt" +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages run_test ./compaction_test -- cgit v1.2.3 From 325bca1fe0b1bb9f535e69bb9ec48d4a6e0ca3ce Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Thu, 28 Apr 2022 23:16:11 -0700 Subject: mm/mmap.c: use mmap_assert_write_locked() instead of open coding it In case the lock is actually not held at this point. 
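For context, a sketch of what the two checks express, assuming the mmap_lock helpers from include/linux/mmap_lock.h:

	/*
	 * The old check only fires when the trylock unexpectedly succeeds,
	 * i.e. when nobody holds mmap_lock for write, and in that already
	 * broken case it briefly takes the read lock as a side effect. It
	 * also cannot tell whether the current task is the writer.
	 */
	BUG_ON(mmap_read_trylock(mm));

	/*
	 * The replacement states the real requirement (caller holds
	 * mmap_lock for write) and checks it without touching the lock;
	 * with lockdep enabled it verifies that the current context holds
	 * the lock for write.
	 */
	mmap_assert_write_locked(mm);
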
Link: https://lkml.kernel.org/r/5827758.TJ1SttVevJ@mobilepool36.emlix.com Signed-off-by: Rolf Eike Beer Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 313b57d55a63..042f01ec02fe 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3553,7 +3553,7 @@ int mm_take_all_locks(struct mm_struct *mm) struct vm_area_struct *vma; struct anon_vma_chain *avc; - BUG_ON(mmap_read_trylock(mm)); + mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); @@ -3633,7 +3633,7 @@ void mm_drop_all_locks(struct mm_struct *mm) struct vm_area_struct *vma; struct anon_vma_chain *avc; - BUG_ON(mmap_read_trylock(mm)); + mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for (vma = mm->mmap; vma; vma = vma->vm_next) { -- cgit v1.2.3 From b191c9bc334a936775843867485c207e23b30e1b Mon Sep 17 00:00:00 2001 From: Jianxing Wang Date: Thu, 28 Apr 2022 23:16:12 -0700 Subject: mm/mmu_gather: limit free batch count and add schedule point in tlb_batch_pages_flush free a large list of pages maybe cause rcu_sched starved on non-preemptible kernels. howerver free_unref_page_list maybe can't cond_resched as it maybe called in interrupt or atomic context, especially can't detect atomic context in CONFIG_PREEMPTION=n. The issue is detected in guest with kvm cpu 200% overcommit, however I didn't see the warning in the host with the same application. I'm sure that the patch is needed for guest kernel, but no sure for host. To reproduce, set up two virtual machines in one host machine, per vm has the same number cpu and half memory of host. the run ltpstress.sh in per vm, then will see rcu stall warning.kernel is preempt disabled, append kernel command 'preempt=none' if enable dynamic preempt . It could detected in loongson machine(32 core, 128G mem) and ProLiant DL380 Gen9(x86 E5-2680, 28 core, 64G mem) tlb flush batch count depends on PAGE_SIZE, it's too large if PAGE_SIZE > 4K, here limit free batch count with 512. And add schedule point in tlb_batch_pages_flush. rcu: rcu_sched kthread starved for 5359 jiffies! g454793 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=19 [...] 
Call Trace: free_unref_page_list+0x19c/0x270 release_pages+0x3cc/0x498 tlb_flush_mmu_free+0x44/0x70 zap_pte_range+0x450/0x738 unmap_page_range+0x108/0x240 unmap_vmas+0x74/0xf0 unmap_region+0xb0/0x120 do_munmap+0x264/0x438 vm_munmap+0x58/0xa0 sys_munmap+0x10/0x20 syscall_common+0x24/0x38 Link: https://lkml.kernel.org/r/20220317072857.2635262-1-wangjianxing@loongson.cn Signed-off-by: Jianxing Wang Signed-off-by: Peter Zijlstra Cc: Will Deacon Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- mm/mmu_gather.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index afb7185ffdc4..a71924bd38c0 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -47,8 +47,20 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb) struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - free_pages_and_swap_cache(batch->pages, batch->nr); - batch->nr = 0; + struct page **pages = batch->pages; + + do { + /* + * limit free batch count when PAGE_SIZE > 4K + */ + unsigned int nr = min(512U, batch->nr); + + free_pages_and_swap_cache(pages, nr); + pages += nr; + batch->nr -= nr; + + cond_resched(); + } while (batch->nr); } tlb->active = &tlb->local; } -- cgit v1.2.3 From 31d17076b07c8ed212b1d865b36b0c7313885ef2 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Apr 2022 23:16:12 -0700 Subject: mm/debug_vm_pgtable: drop protection_map[] usage Patch series "mm: protection_map[] cleanups". This patch (of 2): Although protection_map[] contains the platform defined page protection map for a given vm_flags combination, vm_get_page_prot() is the right interface to use. This will also reduce dependency on protection_map[] which is going to be dropped off completely later on. Link: https://lkml.kernel.org/r/20220404031840.588321-1-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/20220404031840.588321-2-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index db2abd9e415b..30fd11a2ed32 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -93,7 +93,7 @@ struct pgtable_debug_args { static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) { - pgprot_t prot = protection_map[idx]; + pgprot_t prot = vm_get_page_prot(idx); pte_t pte = pfn_pte(args->fixed_pte_pfn, prot); unsigned long val = idx, *ptr = &val; @@ -101,7 +101,7 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) /* * This test needs to be executed after the given page table entry - * is created with pfn_pte() to make sure that protection_map[idx] + * is created with pfn_pte() to make sure that vm_get_page_prot(idx) * does not have the dirty bit enabled from the beginning. This is * important for platforms like arm64 where (!PTE_RDONLY) indicate * dirty bit being set. 
@@ -190,7 +190,7 @@ static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { - pgprot_t prot = protection_map[idx]; + pgprot_t prot = vm_get_page_prot(idx); unsigned long val = idx, *ptr = &val; pmd_t pmd; @@ -202,7 +202,7 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) /* * This test needs to be executed after the given page table entry - * is created with pfn_pmd() to make sure that protection_map[idx] + * is created with pfn_pmd() to make sure that vm_get_page_prot(idx) * does not have the dirty bit enabled from the beginning. This is * important for platforms like arm64 where (!PTE_RDONLY) indicate * dirty bit being set. @@ -325,7 +325,7 @@ static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { - pgprot_t prot = protection_map[idx]; + pgprot_t prot = vm_get_page_prot(idx); unsigned long val = idx, *ptr = &val; pud_t pud; @@ -337,7 +337,7 @@ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) /* * This test needs to be executed after the given page table entry - * is created with pfn_pud() to make sure that protection_map[idx] + * is created with pfn_pud() to make sure that vm_get_page_prot(idx) * does not have the dirty bit enabled from the beginning. This is * important for platforms like arm64 where (!PTE_RDONLY) indicate * dirty bit being set. @@ -1106,14 +1106,14 @@ static int __init init_args(struct pgtable_debug_args *args) /* * Initialize the debugging data. * - * protection_map[0] (or even protection_map[8]) will help create - * page table entries with PROT_NONE permission as required for - * pxx_protnone_tests(). + * vm_get_page_prot(VM_NONE) or vm_get_page_prot(VM_SHARED|VM_NONE) + * will help create page table entries with PROT_NONE permission as + * required for pxx_protnone_tests(). */ memset(args, 0, sizeof(*args)); args->vaddr = get_random_vaddr(); args->page_prot = vm_get_page_prot(VMFLAGS); - args->page_prot_none = protection_map[0]; + args->page_prot_none = vm_get_page_prot(VM_NONE); args->is_contiguous_page = false; args->pud_pfn = ULONG_MAX; args->pmd_pfn = ULONG_MAX; @@ -1248,12 +1248,19 @@ static int __init debug_vm_pgtable(void) return ret; /* - * Iterate over the protection_map[] to make sure that all + * Iterate over each possible vm_flags to make sure that all * the basic page table transformation validations just hold * true irrespective of the starting protection value for a * given page table entry. + * + * Protection based vm_flags combinatins are always linear + * and increasing i.e starting from VM_NONE and going upto + * (VM_SHARED | READ | WRITE | EXEC). */ - for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) { +#define VM_FLAGS_START (VM_NONE) +#define VM_FLAGS_END (VM_SHARED | VM_EXEC | VM_WRITE | VM_READ) + + for (idx = VM_FLAGS_START; idx <= VM_FLAGS_END; idx++) { pte_basic_tests(&args, idx); pmd_basic_tests(&args, idx); pud_basic_tests(&args, idx); -- cgit v1.2.3 From 6c862bd05922af770c5b652037404c4f8131d984 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Apr 2022 23:16:12 -0700 Subject: mm/mmap: clarify protection_map[] indices protection_map[] maps vm_flags access combinations into page protection value as defined by the platform via __PXXX and __SXXX macros. 
The array indices in protection_map[], represents vm_flags access combinations but it's not very intuitive to derive. This makes it clear and explicit. Link: https://lkml.kernel.org/r/20220404031840.588321-3-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/mmap.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 042f01ec02fe..7873cc3ca24c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -102,8 +102,22 @@ static void unmap_region(struct mm_struct *mm, * x: (yes) yes */ pgprot_t protection_map[16] __ro_after_init = { - __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, - __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 + [VM_NONE] = __P000, + [VM_READ] = __P001, + [VM_WRITE] = __P010, + [VM_WRITE | VM_READ] = __P011, + [VM_EXEC] = __P100, + [VM_EXEC | VM_READ] = __P101, + [VM_EXEC | VM_WRITE] = __P110, + [VM_EXEC | VM_WRITE | VM_READ] = __P111, + [VM_SHARED] = __S000, + [VM_SHARED | VM_READ] = __S001, + [VM_SHARED | VM_WRITE] = __S010, + [VM_SHARED | VM_WRITE | VM_READ] = __S011, + [VM_SHARED | VM_EXEC] = __S100, + [VM_SHARED | VM_EXEC | VM_READ] = __S101, + [VM_SHARED | VM_EXEC | VM_WRITE] = __S110, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __S111 }; #ifndef CONFIG_ARCH_HAS_FILTER_PGPROT -- cgit v1.2.3 From c5d8a3643d91be748d7ff12eedc5876f32cc8283 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:12 -0700 Subject: mm/mmap.c: use helper mlock_future_check() Use helper mlock_future_check() to check whether it's safe to enlarge the locked_vm to simplify the code. Minor readability improvement. Link: https://lkml.kernel.org/r/20220402032231.64974-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/mmap.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 7873cc3ca24c..e32640456c92 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2354,15 +2354,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, return -ENOMEM; /* mlock limit tests */ - if (vma->vm_flags & VM_LOCKED) { - unsigned long locked; - unsigned long limit; - locked = mm->locked_vm + grow; - limit = rlimit(RLIMIT_MEMLOCK); - limit >>= PAGE_SHIFT; - if (locked > limit && !capable(CAP_IPC_LOCK)) - return -ENOMEM; - } + if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT)) + return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : -- cgit v1.2.3 From 67436193c2874c2e0e2d654820fadb9e79785335 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Apr 2022 23:16:12 -0700 Subject: mm/mmap: add new config ARCH_HAS_VM_GET_PAGE_PROT Patch series "mm/mmap: Drop arch_vm_get_page_prot() and arch_filter_pgprot()", v7. protection_map[] is an array based construct that translates given vm_flags combination. This array contains page protection map, which is populated by the platform via [__S000 .. __S111] and [__P000 .. __P111] exported macros. Primary usage for protection_map[] is for vm_get_page_prot(), which is used to determine page protection value for a given vm_flags. vm_get_page_prot() implementation, could again call platform overrides arch_vm_get_page_prot() and arch_filter_pgprot(). Some platforms override protection_map[] that was originally built with __SXXX/__PXXX with different runtime values. 
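Concretely, the part every implementation keeps is the table lookup keyed by the low four vm_flags access bits; the lines below are an illustrative sketch mirroring the lookup used by the per-arch versions later in this series:

	/*
	 * VM_READ, VM_WRITE, VM_EXEC and VM_SHARED occupy the low four bits
	 * of vm_flags, so the mask selects one of the sixteen
	 * protection_map[] slots.
	 */
	pgprot_t prot = protection_map[vm_flags &
			(VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];

The generic vm_get_page_prot() wraps this lookup with arch_vm_get_page_prot() and arch_filter_pgprot(), which are exactly the hooks the series drops once each platform can provide its own vm_get_page_prot().
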
Currently there are multiple layers of abstraction i.e __SXXX/__PXXX macros , protection_map[], arch_vm_get_page_prot() and arch_filter_pgprot() built between the platform and generic MM, finally defining vm_get_page_prot(). Hence this series proposes to drop later two abstraction levels and instead just move the responsibility of defining vm_get_page_prot() to the platform (still utilizing generic protection_map[] array) itself making it clean and simple. This first introduces ARCH_HAS_VM_GET_PAGE_PROT which enables the platforms to define custom vm_get_page_prot(). This starts converting platforms that define the overrides arch_filter_pgprot() or arch_vm_get_page_prot() which enables for those constructs to be dropped off completely. The series has been inspired from an earlier discuss with Christoph Hellwig https://lore.kernel.org/all/1632712920-8171-1-git-send-email-anshuman.khandual@arm.com/ This patch (of 7): Add a new config ARCH_HAS_VM_GET_PAGE_PROT, which when subscribed enables a given platform to define its own vm_get_page_prot() but still utilizing the generic protection_map[] array. Link: https://lkml.kernel.org/r/20220414062125.609297-1-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/20220414062125.609297-2-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Reviewed-by: Catalin Marinas Suggested-by: Christoph Hellwig Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/Kconfig | 3 +++ mm/mmap.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index 034d87953600..b1f7624276f8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -765,6 +765,9 @@ config ARCH_HAS_CURRENT_STACK_POINTER config ARCH_HAS_FILTER_PGPROT bool +config ARCH_HAS_VM_GET_PAGE_PROT + bool + config ARCH_HAS_PTE_DEVMAP bool diff --git a/mm/mmap.c b/mm/mmap.c index e32640456c92..c1376ce34316 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -120,6 +120,7 @@ pgprot_t protection_map[16] __ro_after_init = { [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __S111 }; +#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT #ifndef CONFIG_ARCH_HAS_FILTER_PGPROT static inline pgprot_t arch_filter_pgprot(pgprot_t prot) { @@ -136,6 +137,7 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags) return arch_filter_pgprot(ret); } EXPORT_SYMBOL(vm_get_page_prot); +#endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { -- cgit v1.2.3 From 634093c59a12fc90e33c4e5acd1da4498fd159d4 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Apr 2022 23:16:13 -0700 Subject: powerpc/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This defines and exports a platform specific custom vm_get_page_prot() via subscribing ARCH_HAS_VM_GET_PAGE_PROT. While here, this also localizes arch_vm_get_page_prot() as __vm_get_page_prot() and moves it near vm_get_page_prot(). Link: https://lkml.kernel.org/r/20220414062125.609297-3-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Cc: Michael Ellerman Cc: Paul Mackerras Cc: Catalin Marinas Cc: Christoph Hellwig Cc: David S. 
Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/mman.h | 12 ------------ arch/powerpc/mm/book3s64/pgtable.c | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 174edabb74fa..69e44358a235 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -140,6 +140,7 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VM_GET_PAGE_PROT if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h index 7cb6d18f5cd6..1b024e64c8ec 100644 --- a/arch/powerpc/include/asm/mman.h +++ b/arch/powerpc/include/asm/mman.h @@ -24,18 +24,6 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, } #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) -static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags) -{ -#ifdef CONFIG_PPC_MEM_KEYS - return (vm_flags & VM_SAO) ? - __pgprot(_PAGE_SAO | vmflag_to_pte_pkey_bits(vm_flags)) : - __pgprot(0 | vmflag_to_pte_pkey_bits(vm_flags)); -#else - return (vm_flags & VM_SAO) ? __pgprot(_PAGE_SAO) : __pgprot(0); -#endif -} -#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags) - static inline bool arch_validate_prot(unsigned long prot, unsigned long addr) { if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM | PROT_SAO)) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 052e6590f84f..8b474ab32f67 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -549,3 +550,19 @@ unsigned long memremap_compat_align(void) } EXPORT_SYMBOL_GPL(memremap_compat_align); #endif + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + unsigned long prot = pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + + if (vm_flags & VM_SAO) + prot |= _PAGE_SAO; + +#ifdef CONFIG_PPC_MEM_KEYS + prot |= vmflag_to_pte_pkey_bits(vm_flags); +#endif + + return __pgprot(prot); +} +EXPORT_SYMBOL(vm_get_page_prot); -- cgit v1.2.3 From b3aca728fb276782ab3c851aedaf3d19b33faabe Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Apr 2022 23:16:13 -0700 Subject: arm64/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This defines and exports a platform specific custom vm_get_page_prot() via subscribing ARCH_HAS_VM_GET_PAGE_PROT. It localizes arch_vm_get_page_prot() and moves it near vm_get_page_prot(). Link: https://lkml.kernel.org/r/20220414062125.609297-4-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David S. 
Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/mman.h | 24 ------------------------ arch/arm64/mm/mmap.c | 25 +++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 2d7399ab0491..8da1f65ae398 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -45,6 +45,7 @@ config ARM64 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index e3e28f7daf62..5966ee4a6154 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -35,30 +35,6 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags) } #define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags) -static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags) -{ - pteval_t prot = 0; - - if (vm_flags & VM_ARM64_BTI) - prot |= PTE_GP; - - /* - * There are two conditions required for returning a Normal Tagged - * memory type: (1) the user requested it via PROT_MTE passed to - * mmap() or mprotect() and (2) the corresponding vma supports MTE. We - * register (1) as VM_MTE in the vma->vm_flags and (2) as - * VM_MTE_ALLOWED. Note that the latter can only be set during the - * mmap() call since mprotect() does not accept MAP_* flags. - * Checking for VM_MTE only is sufficient since arch_validate_flags() - * does not permit (VM_MTE & !VM_MTE_ALLOWED). - */ - if (vm_flags & VM_MTE) - prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED); - - return __pgprot(prot); -} -#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags) - static inline bool arch_validate_prot(unsigned long prot, unsigned long addr __always_unused) { diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 77ada00280d9..78e9490f748d 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -55,3 +55,28 @@ static int __init adjust_protection_map(void) return 0; } arch_initcall(adjust_protection_map); + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + pteval_t prot = pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + + if (vm_flags & VM_ARM64_BTI) + prot |= PTE_GP; + + /* + * There are two conditions required for returning a Normal Tagged + * memory type: (1) the user requested it via PROT_MTE passed to + * mmap() or mprotect() and (2) the corresponding vma supports MTE. We + * register (1) as VM_MTE in the vma->vm_flags and (2) as + * VM_MTE_ALLOWED. Note that the latter can only be set during the + * mmap() call since mprotect() does not accept MAP_* flags. + * Checking for VM_MTE only is sufficient since arch_validate_flags() + * does not permit (VM_MTE & !VM_MTE_ALLOWED). + */ + if (vm_flags & VM_MTE) + prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED); + + return __pgprot(prot); +} +EXPORT_SYMBOL(vm_get_page_prot); -- cgit v1.2.3 From 91d4ce985fbb22c9a8ae50601f1ad4fe953b9382 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Apr 2022 23:16:13 -0700 Subject: sparc/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This defines and exports a platform specific custom vm_get_page_prot() via subscribing ARCH_HAS_VM_GET_PAGE_PROT. 
It localizes arch_vm_get_page_prot() as sparc_vm_get_page_prot() and moves near vm_get_page_prot(). Link: https://lkml.kernel.org/r/20220414062125.609297-5-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David S. Miller Reviewed-by: Khalid Aziz Reviewed-by: Christophe Leroy Cc: Khalid Aziz Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/mman.h | 6 ------ arch/sparc/mm/init_64.c | 12 ++++++++++++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 9200bc04701c..85b573643af6 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -84,6 +84,7 @@ config SPARC64 select PERF_USE_VMALLOC select ARCH_HAVE_NMI_SAFE_CMPXCHG select HAVE_C_RECORDMCOUNT + select ARCH_HAS_VM_GET_PAGE_PROT select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC diff --git a/arch/sparc/include/asm/mman.h b/arch/sparc/include/asm/mman.h index 274217e7ed70..af9c10c83dc5 100644 --- a/arch/sparc/include/asm/mman.h +++ b/arch/sparc/include/asm/mman.h @@ -46,12 +46,6 @@ static inline unsigned long sparc_calc_vm_prot_bits(unsigned long prot) } } -#define arch_vm_get_page_prot(vm_flags) sparc_vm_get_page_prot(vm_flags) -static inline pgprot_t sparc_vm_get_page_prot(unsigned long vm_flags) -{ - return (vm_flags & VM_SPARC_ADI) ? __pgprot(_PAGE_MCD_4V) : __pgprot(0); -} - #define arch_validate_prot(prot, addr) sparc_validate_prot(prot, addr) static inline int sparc_validate_prot(unsigned long prot, unsigned long addr) { diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 8b1911591581..f6174df2d5af 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -3184,3 +3184,15 @@ void copy_highpage(struct page *to, struct page *from) } } EXPORT_SYMBOL(copy_highpage); + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + unsigned long prot = pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + + if (vm_flags & VM_SPARC_ADI) + prot |= _PAGE_MCD_4V; + + return __pgprot(prot); +} +EXPORT_SYMBOL(vm_get_page_prot); -- cgit v1.2.3 From e10cd4b00904db127b178859d81f6b5d05d16c67 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Apr 2022 23:16:13 -0700 Subject: x86/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This defines and exports a platform specific custom vm_get_page_prot() via subscribing ARCH_HAS_VM_GET_PAGE_PROT. This also unsubscribes from config ARCH_HAS_FILTER_PGPROT, after dropping off arch_filter_pgprot() and arch_vm_get_page_prot(). Link: https://lkml.kernel.org/r/20220414062125.609297-6-anshuman.khandual@arm.com Signed-off-by: Christoph Hellwig Signed-off-by: Anshuman Khandual Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Catalin Marinas Cc: Christophe Leroy Cc: David S. 
Miller Cc: Khalid Aziz Cc: Michael Ellerman Cc: Paul Mackerras Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/pgtable.h | 5 ----- arch/x86/include/uapi/asm/mman.h | 14 -------------- arch/x86/mm/Makefile | 2 +- arch/x86/mm/pgprot.c | 35 +++++++++++++++++++++++++++++++++++ 5 files changed, 37 insertions(+), 21 deletions(-) create mode 100644 arch/x86/mm/pgprot.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ae4e1a79a1e2..5b77b4301d36 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -76,7 +76,6 @@ config X86 select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER - select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 @@ -95,6 +94,7 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 62ab07e24aef..3563f4645fa1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -649,11 +649,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) -static inline pgprot_t arch_filter_pgprot(pgprot_t prot) -{ - return canon_pgprot(prot); -} - static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, enum page_cache_mode pcm, enum page_cache_mode new_pcm) diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index d4a8d0424bfb..775dbd3aff73 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -5,20 +5,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -/* - * Take the 4 protection key bits out of the vma->vm_flags - * value and turn them in to the bits that we can put in - * to a pte. - * - * Only override these if Protection Keys are available - * (which is only on 64-bit). - */ -#define arch_vm_get_page_prot(vm_flags) __pgprot( \ - ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \ - ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \ - ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \ - ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0)) - #define arch_calc_vm_prot_bits(prot, key) ( \ ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ ((key) & 0x2 ? 
VM_PKEY_BIT1 : 0) | \ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fe3d3061fc11..fb6b41a48ae5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -20,7 +20,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ - pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o + pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o pgprot.o obj-y += pat/ diff --git a/arch/x86/mm/pgprot.c b/arch/x86/mm/pgprot.c new file mode 100644 index 000000000000..763742782286 --- /dev/null +++ b/arch/x86/mm/pgprot.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + unsigned long val = pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + /* + * Take the 4 protection key bits out of the vma->vm_flags value and + * turn them in to the bits that we can put in to a pte. + * + * Only override these if Protection Keys are available (which is only + * on 64-bit). + */ + if (vm_flags & VM_PKEY_BIT0) + val |= _PAGE_PKEY_BIT0; + if (vm_flags & VM_PKEY_BIT1) + val |= _PAGE_PKEY_BIT1; + if (vm_flags & VM_PKEY_BIT2) + val |= _PAGE_PKEY_BIT2; + if (vm_flags & VM_PKEY_BIT3) + val |= _PAGE_PKEY_BIT3; +#endif + + val = __sme_set(val); + if (val & _PAGE_PRESENT) + val &= __supported_pte_mask; + return __pgprot(val); +} +EXPORT_SYMBOL(vm_get_page_prot); -- cgit v1.2.3 From 5dcfc6a1cc53ea0859de98a7380933f9e779859d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Apr 2022 23:16:13 -0700 Subject: mm/mmap: drop arch_filter_pgprot() There are no platforms left which subscribe ARCH_HAS_FILTER_PGPROT. Hence drop generic arch_filter_pgprot() and also config ARCH_HAS_FILTER_PGPROT. Link: https://lkml.kernel.org/r/20220414062125.609297-7-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Reviewed-by: Catalin Marinas Cc: Christoph Hellwig Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/Kconfig | 3 --- mm/mmap.c | 13 ++----------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index b1f7624276f8..3f7b6d7b69df 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -762,9 +762,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER register alias named "current_stack_pointer", this config can be selected. 
-config ARCH_HAS_FILTER_PGPROT - bool - config ARCH_HAS_VM_GET_PAGE_PROT bool diff --git a/mm/mmap.c b/mm/mmap.c index c1376ce34316..defc5af46973 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -121,20 +121,11 @@ pgprot_t protection_map[16] __ro_after_init = { }; #ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT -#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT -static inline pgprot_t arch_filter_pgprot(pgprot_t prot) -{ - return prot; -} -#endif - pgprot_t vm_get_page_prot(unsigned long vm_flags) { - pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags & - (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | + return __pgprot(pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | pgprot_val(arch_vm_get_page_prot(vm_flags))); - - return arch_filter_pgprot(ret); } EXPORT_SYMBOL(vm_get_page_prot); #endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */ -- cgit v1.2.3 From 3afa793082e624f7bd83533010ff4a676451d4ee Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Apr 2022 23:16:14 -0700 Subject: mm/mmap: drop arch_vm_get_page_pgprot() There are no platforms left which use arch_vm_get_page_prot(). Just drop generic arch_vm_get_page_prot(). Link: https://lkml.kernel.org/r/20220414062125.609297-8-anshuman.khandual@arm.com Cc: Andrew Morton Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Christophe Leroy Reviewed-by: Catalin Marinas Signed-off-by: Anshuman Khandual Cc: Christoph Hellwig Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mman.h | 4 ---- mm/mmap.c | 4 +--- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/mman.h b/include/linux/mman.h index b66e91b8176c..58b3abd457a3 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -93,10 +93,6 @@ static inline void vm_unacct_memory(long pages) #define arch_calc_vm_flag_bits(flags) 0 #endif -#ifndef arch_vm_get_page_prot -#define arch_vm_get_page_prot(vm_flags) __pgprot(0) -#endif - #ifndef arch_validate_prot /* * This is called from mprotect(). PROT_GROWSDOWN and PROT_GROWSUP have diff --git a/mm/mmap.c b/mm/mmap.c index defc5af46973..d7e120ad5825 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -123,9 +123,7 @@ pgprot_t protection_map[16] __ro_after_init = { #ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT pgprot_t vm_get_page_prot(unsigned long vm_flags) { - return __pgprot(pgprot_val(protection_map[vm_flags & - (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | - pgprot_val(arch_vm_get_page_prot(vm_flags))); + return protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; } EXPORT_SYMBOL(vm_get_page_prot); #endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */ -- cgit v1.2.3 From f433195679a9639a3af5be03bcb9c3d933e3b261 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:14 -0700 Subject: mm/mremap: use helper mlock_future_check() Use helper mlock_future_check() to check whether it's safe to resize the locked_vm to simplify the code. Minor readability improvement. 
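For reference, a rough sketch of the check this helper centralizes, as it exists in mm/mmap.c around this series (exact details may differ in a given tree):

	static inline int mlock_future_check(struct mm_struct *mm,
					     unsigned long flags,
					     unsigned long len)
	{
		unsigned long locked, lock_limit;

		if (flags & VM_LOCKED) {
			/* Would the new length push locked_vm past RLIMIT_MEMLOCK? */
			locked = len >> PAGE_SHIFT;
			locked += mm->locked_vm;
			lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
			if (locked > lock_limit && !capable(CAP_IPC_LOCK))
				return -EAGAIN;
		}
		return 0;
	}

Both converted call sites only test for a nonzero return and keep their original errno: acct_stack_growth() returns -ENOMEM, while vma_to_resize() below returns ERR_PTR(-EAGAIN).
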
Link: https://lkml.kernel.org/r/20220322112004.27380-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Wei Yang Signed-off-by: Andrew Morton --- mm/mremap.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 303d3290b938..f2a53d53f1b9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -766,14 +766,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) return ERR_PTR(-EFAULT); - if (vma->vm_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = mm->locked_vm << PAGE_SHIFT; - lock_limit = rlimit(RLIMIT_MEMLOCK); - locked += new_len - old_len; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return ERR_PTR(-EAGAIN); - } + if (mlock_future_check(mm, vma->vm_flags, new_len - old_len)) + return ERR_PTR(-EAGAIN); if (!may_expand_vm(mm, vma->vm_flags, (new_len - old_len) >> PAGE_SHIFT)) -- cgit v1.2.3 From 3c9fe8b8f5e3e30bfa81eeeff4bc34c0fb67b739 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:14 -0700 Subject: mm/mremap: avoid unneeded do_munmap call When old_len == new_len, do_munmap will return -EINVAL due to len == 0. This errno will be simply ignored because of old_len != new_len check. So it is unnecessary to call do_munmap when old_len == new_len because nothing is actually done. Link: https://lkml.kernel.org/r/20220401081023.37080-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/mremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index f2a53d53f1b9..98f50e633009 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -820,9 +820,9 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, goto out; } - if (old_len >= new_len) { + if (old_len > new_len) { ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); - if (ret && old_len != new_len) + if (ret) goto out; old_len = new_len; } -- cgit v1.2.3 From aa282a157bf8ff79bed9164dc5e0e99f0d9e9755 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Thu, 28 Apr 2022 23:16:14 -0700 Subject: mm/page_alloc.c: calc the right pfn if page size is not 4K Previous 0x100000 is used to check the 4G limit in find_zone_movable_pfns_for_nodes(). This is right in x86 because the page size can only be 4K. But 16K and 64K are available in arm64. So replace it with PHYS_PFN(SZ_4G). Link: https://lkml.kernel.org/r/20220414101314.1250667-8-mawupeng1@huawei.com Signed-off-by: Ma Wupeng Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Acked-by: Ard Biesheuvel Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2588d6226e0..298c7691fb1d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7821,7 +7821,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) usable_startpfn = memblock_region_memory_base_pfn(r); - if (usable_startpfn < 0x100000) { + if (usable_startpfn < PHYS_PFN(SZ_4G)) { mem_below_4gb_not_mirrored = true; continue; } -- cgit v1.2.3 From 5981611d0a006472d367d7a8e6ead8afaecf17c7 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:14 -0700 Subject: mm: hugetlb_vmemmap: cleanup hugetlb_vmemmap related functions Patch series "cleanup hugetlb_vmemmap". 
The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize" is more clear. In this series, cheanup related codes to make it more clear and expressive. This is suggested by David. This patch (of 3): The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". And some function names are prefixed with "huge_page" instead of "hugetlb", it is easily to be confused with THP. In this patch, cheanup related functions to make code more clear and expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220404074652.68024-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 10 +++++----- mm/hugetlb_vmemmap.c | 42 ++++++++++++++++++++---------------------- mm/hugetlb_vmemmap.h | 20 ++++++++++---------- 4 files changed, 36 insertions(+), 38 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ac2a1d758a80..189bddb8eca4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -624,7 +624,7 @@ struct hstate { unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP - unsigned int nr_free_vmemmap_pages; + unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 34efd507a68d..87ba50b270a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1540,7 +1540,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - if (alloc_huge_page_vmemmap(h, page)) { + if (hugetlb_vmemmap_alloc(h, page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1617,7 +1617,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { - if (free_vmemmap_pages_per_hpage(h)) + if (hugetlb_optimize_vmemmap_pages(h)) flush_work(&free_hpage_work); } @@ -1737,7 +1737,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) static void __prep_new_huge_page(struct hstate *h, struct page *page) { - free_huge_page_vmemmap(h, page); + hugetlb_vmemmap_free(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ -2110,7 +2110,7 @@ retry: * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. 
*/ - rc = alloc_huge_page_vmemmap(h, head); + rc = hugetlb_vmemmap_alloc(h, head); if (!rc) { /* * Move PageHWPoison flag from head page to the raw @@ -3425,7 +3425,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) remove_hugetlb_page_for_demote(h, page, false); spin_unlock_irq(&hugetlb_lock); - rc = alloc_huge_page_vmemmap(h, page); + rc = hugetlb_vmemmap_alloc(h, page); if (rc) { /* Allocation of vmemmmap failed, we can not demote page */ spin_lock_irq(&hugetlb_lock); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 791626983c2e..91b79b9d9e25 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -192,7 +192,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, hugetlb_free_vmemmap_enabled_key); EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); -static int __init early_hugetlb_free_vmemmap_param(char *buf) +static int __init hugetlb_vmemmap_early_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ if (!is_power_of_2(sizeof(struct page))) { @@ -212,29 +212,26 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return 0; } -early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param); - -static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) -{ - return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; -} +early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); /* * Previously discarded vmemmap pages will be allocated and remapping * after this function returns zero. */ -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { int ret; unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; if (!HPageVmemmapOptimized(head)) return 0; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + /* * The pages which the vmemmap virtual address range [@vmemmap_addr, * @vmemmap_end) are mapped to are freed to the buddy allocator, and @@ -250,17 +247,18 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) return ret; } -void free_huge_page_vmemmap(struct hstate *h, struct page *head) +void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; - if (!free_vmemmap_pages_per_hpage(h)) + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + if (!vmemmap_pages) return; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; /* * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) @@ -297,8 +295,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) * 
hugetlbpage.rst for more details. */ if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; + h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; - pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages, - h->name); + pr_info("can optimize %d vmemmap pages for %s\n", + h->optimize_vmemmap_pages, h->name); } diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index cb2bef8f9e73..9760537849b5 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -11,25 +11,25 @@ #include #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head); -void free_huge_page_vmemmap(struct hstate *h, struct page *head); +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); +void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); /* - * How many vmemmap pages associated with a HugeTLB page that can be freed - * to the buddy allocator. + * How many vmemmap pages associated with a HugeTLB page that can be + * optimized and freed to the buddy allocator. */ -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { - return h->nr_free_vmemmap_pages; + return h->optimize_vmemmap_pages; } #else -static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { return 0; } -static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { } @@ -37,7 +37,7 @@ static inline void hugetlb_vmemmap_init(struct hstate *h) { } -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -- cgit v1.2.3 From f10f1442c309ccef7a80ba3dc4abde0978e86fb4 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: mm: hugetlb_vmemmap: cleanup hugetlb_free_vmemmap_enabled* The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup the static key and hugetlb_free_vmemmap_enabled() to make code more expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- arch/arm64/mm/flush.c | 2 +- include/linux/page-flags.h | 12 ++++++------ mm/hugetlb_vmemmap.c | 10 +++++----- mm/memory_hotplug.c | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 840d998879ad..bbb2640f4f54 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -86,7 +86,7 @@ void flush_dcache_page(struct page *page) * is reused (more details can refer to the comments above * page_fixed_fake_head()). 
*/ - if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page)) page = compound_head(page); if (test_bit(PG_dcache_clean, &page->flags)) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9d8eeaa67d05..573f499d5a2d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -192,16 +192,16 @@ enum pageflags { #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); -static __always_inline bool hugetlb_free_vmemmap_enabled(void) +static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - &hugetlb_free_vmemmap_enabled_key); + &hugetlb_optimize_vmemmap_key); } /* - * If the feature of freeing some vmemmap pages associated with each HugeTLB + * If the feature of optimizing vmemmap pages associated with each HugeTLB * page is enabled, the head vmemmap page frame is reused and all of the tail * vmemmap addresses map to the head vmemmap page frame (furture details can * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other @@ -218,7 +218,7 @@ static __always_inline bool hugetlb_free_vmemmap_enabled(void) */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return page; /* @@ -247,7 +247,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) return page; } -static inline bool hugetlb_free_vmemmap_enabled(void) +static inline bool hugetlb_optimize_vmemmap_enabled(void) { return false; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 91b79b9d9e25..f25294973398 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -189,8 +189,8 @@ #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { @@ -204,9 +204,9 @@ static int __init hugetlb_vmemmap_early_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - static_branch_enable(&hugetlb_free_vmemmap_enabled_key); + static_branch_enable(&hugetlb_optimize_vmemmap_key); else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_free_vmemmap_enabled_key); + static_branch_disable(&hugetlb_optimize_vmemmap_key); else return -EINVAL; @@ -282,7 +282,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 416b38ca8def..678a449ec0f7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1289,7 +1289,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size) * populate a single PMD. 
*/ return memmap_on_memory && - !hugetlb_free_vmemmap_enabled() && + !hugetlb_optimize_vmemmap_enabled() && IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && size == memory_block_size_bytes() && IS_ALIGNED(vmemmap_size, PMD_SIZE) && -- cgit v1.2.3 From 47010c040dec8af6347ec6259104fc13f7e7e30a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP* The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup configs to make code more expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Mike Kravetz Cc: David Hildenbrand Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- Documentation/admin-guide/mm/hugetlbpage.rst | 2 +- arch/arm64/Kconfig | 2 +- arch/arm64/mm/flush.c | 2 +- arch/x86/Kconfig | 2 +- arch/x86/mm/init_64.c | 2 +- fs/Kconfig | 16 ++++++++-------- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 2 +- include/linux/page-flags.h | 6 +++--- mm/Makefile | 2 +- mm/hugetlb_vmemmap.c | 4 ++-- mm/hugetlb_vmemmap.h | 4 ++-- mm/sparse-vmemmap.c | 4 ++-- 14 files changed, 27 insertions(+), 27 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..d9b577db6e19 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1660,7 +1660,7 @@ Format: size[KMG] hugetlb_free_vmemmap= - [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP + [KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). @@ -1669,7 +1669,7 @@ on: enable the feature off: disable the feature - Built with CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON=y, + Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. This is not compatible with memory_hotplug.memmap_on_memory. diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index 0166f9de3428..a90330d0a837 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -164,7 +164,7 @@ default_hugepagesz will all result in 256 2M huge pages being allocated. Valid default huge page size is architecture dependent. hugetlb_free_vmemmap - When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing + When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing unused vmemmap pages associated with each HugeTLB page. 
When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages`` diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8da1f65ae398..cc0498652bae 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -97,7 +97,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_NO_INSTR select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index bbb2640f4f54..fc4f710e9820 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -78,7 +78,7 @@ void flush_dcache_page(struct page *page) /* * Only the head page's flags of HugeTLB can be cleared since the tail * vmemmap pages associated with each HugeTLB page are mapped with - * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more * details can refer to vmemmap_remap_pte()). Although * __sync_icache_dcache() only set PG_dcache_clean flag on the head * page struct, there is more than one page struct with PG_dcache_clean diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5b77b4301d36..da07c8087e5a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -121,7 +121,7 @@ config X86 select ARCH_WANTS_NO_INSTR select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 96d34ebb20a9..af34dd1510d6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1269,7 +1269,7 @@ static struct kcore_list kcore_vsyscall; static void __init register_page_bootmem_info(void) { -#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) +#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) int i; for_each_online_node(i) diff --git a/fs/Kconfig b/fs/Kconfig index 8f6ab1cf5115..5976eb33535f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -250,22 +250,22 @@ config HUGETLB_PAGE # to enable the feature of minimizing overhead of struct page associated with # each HugeTLB page. # -config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP +config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP bool -config HUGETLB_PAGE_FREE_VMEMMAP +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON - bool "Default freeing vmemmap pages of HugeTLB to on" +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "Default optimizing vmemmap pages of HugeTLB to on" default n - depends on HUGETLB_PAGE_FREE_VMEMMAP + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap pages associated with each HugeTLB page is default off. Say Y here - to enable freeing vmemmap pages of HugeTLB by default. It can then + to enable optimizing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off. 
config MEMFD_CREATE diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 189bddb8eca4..ac2ece9e9c79 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -623,7 +623,7 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB diff --git a/include/linux/mm.h b/include/linux/mm.h index 16d6b46d2d3e..a8f4c7e96ad5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3145,7 +3145,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 573f499d5a2d..1ea896887ee4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -190,13 +190,13 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { - return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, &hugetlb_optimize_vmemmap_key); } diff --git a/mm/Makefile b/mm/Makefile index 4cc13f3179a5..6f9ffa968a1a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -77,7 +77,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o -obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o +obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f25294973398..2655434a946b 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,7 +188,7 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); @@ -276,7 +276,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) /* * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, + * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. 
*/ BUILD_BUG_ON(__NR_USED_SUBPAGE >= diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 9760537849b5..109b0a53b6fe 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -10,7 +10,7 @@ #define _LINUX_HUGETLB_VMEMMAP_H #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); @@ -41,5 +41,5 @@ static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 8aecd6b3896c..52f36527bab3 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,7 +34,7 @@ #include #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -420,7 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map -- cgit v1.2.3 From e3246d8f52173a798710314a42fea83223036fc8 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: mm/sparse-vmemmap: add a pgmap argument to section activation Patch series "sparse-vmemmap: memory savings for compound devmaps (device-dax)", v9. This series minimizes 'struct page' overhead by pursuing a similar approach as Muchun Song series "Free some vmemmap pages of hugetlb page" (now merged since v5.14), but applied to devmap with @vmemmap_shift (device-dax). The vmemmap dedpulication original idea (already used in HugeTLB) is to reuse/deduplicate tail page vmemmap areas, particular the area which only describes tail pages. So a vmemmap page describes 64 struct pages, and the first page for a given ZONE_DEVICE vmemmap would contain the head page and 63 tail pages. The second vmemmap page would contain only tail pages, and that's what gets reused across the rest of the subsection/section. The bigger the page size, the bigger the savings (2M hpage -> save 6 vmemmap pages; 1G hpage -> save 4094 vmemmap pages). This is done for PMEM /specifically only/ on device-dax configured namespaces, not fsdax. In other words, a devmap with a @vmemmap_shift. In terms of savings, per 1Tb of memory, the struct page cost would go down with compound devmap: * with 2M pages we lose 4G instead of 16G (0.39% instead of 1.5% of total memory) * with 1G pages we lose 40MB instead of 16G (0.0014% instead of 1.5% of total memory) The series is mostly summed up by patch 4, and to summarize what the series does: Patches 1 - 3: Minor cleanups in preparation for patch 4. Move the very nice docs of hugetlb_vmemmap.c into a Documentation/vm/ entry. Patch 4: Patch 4 is the one that takes care of the struct page savings (also referred to here as tail-page/vmemmap deduplication). Much like Muchun series, we reuse the second PTE tail page vmemmap areas across a given @vmemmap_shift On important difference though, is that contrary to the hugetlbfs series, there's no vmemmap for the area because we are late-populating it as opposed to remapping a system-ram range. 
IOW no freeing of pages of already initialized vmemmap like the case for hugetlbfs, which greatly simplifies the logic (besides not being arch-specific). altmap case unchanged and still goes via the vmemmap_populate(). Also adjust the newly added docs to the device-dax case. [Note that device-dax is still a little behind HugeTLB in terms of savings. I have an additional simple patch that reuses the head vmemmap page too, as a follow-up. That will double the savings and namespaces initialization.] Patch 5: Initialize fewer struct pages depending on the page size with DRAM backed struct pages -- because fewer pages are unique and most tail pages (with bigger vmemmap_shift). NVDIMM namespace bootstrap improves from ~268-358 ms to ~80-110/<1ms on 128G NVDIMMs with 2M and 1G respectivally. And struct page needed capacity will be 3.8x / 1071x smaller for 2M and 1G respectivelly. Tested on x86 with 1.5Tb of pmem (including pinning, and RDMA registration/deregistration scalability with 2M MRs) This patch (of 5): In support of using compound pages for devmap mappings, plumb the pgmap down to the vmemmap_populate implementation. Note that while altmap is retrievable from pgmap the memory hotplug code passes altmap without pgmap[*], so both need to be independently plumbed. So in addition to @altmap, pass @pgmap to sparse section populate functions namely: sparse_add_section section_activate populate_section_memmap __populate_section_memmap Passing @pgmap allows __populate_section_memmap() to both fetch the vmemmap_shift in which memmap metadata is created for and also to let sparse-vmemmap fetch pgmap ranges to co-relate to a given section and pick whether to just reuse tail pages from past onlined sections. While at it, fix the kdoc for @altmap for sparse_add_section(). 
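A condensed, illustrative sketch of the plumbing this patch adds, using the signatures introduced in the diff below (values shown are placeholders):

	struct mhp_params params = {
		.pgprot = PAGE_KERNEL,
		.altmap = altmap,	/* may be NULL */
		.pgmap  = pgmap,	/* NULL outside ZONE_DEVICE/device-dax */
	};

	/* __add_pages() forwards both pieces for each section: */
	err = sparse_add_section(nid, pfn, cur_nr_pages, params.altmap,
				 params.pgmap);
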
[*] https://lore.kernel.org/linux-mm/20210319092635.6214-1-osalvador@suse.de/ Link: https://lkml.kernel.org/r/20220420155310.9712-1-joao.m.martins@oracle.com Link: https://lkml.kernel.org/r/20220420155310.9712-2-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Reviewed-by: Muchun Song Cc: Vishal Verma Cc: Matthew Wilcox Cc: Jason Gunthorpe Cc: Jane Chu Cc: Mike Kravetz Cc: Jonathan Corbet Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 5 ++++- include/linux/mm.h | 3 ++- mm/memory_hotplug.c | 3 ++- mm/sparse-vmemmap.c | 3 ++- mm/sparse.c | 26 ++++++++++++++++---------- 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 1ce6f8044f1e..e0b2209ab71c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -15,6 +15,7 @@ struct memory_block; struct memory_group; struct resource; struct vmem_altmap; +struct dev_pagemap; #ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION /* @@ -122,6 +123,7 @@ typedef int __bitwise mhp_t; struct mhp_params { struct vmem_altmap *altmap; pgprot_t pgprot; + struct dev_pagemap *pgmap; }; bool mhp_range_allowed(u64 start, u64 size, bool need_mapping); @@ -333,7 +335,8 @@ extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); extern void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap); diff --git a/include/linux/mm.h b/include/linux/mm.h index a8f4c7e96ad5..80bba49387e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3154,7 +3154,8 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap); + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 678a449ec0f7..19a6beb9d4b6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -328,7 +328,8 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, /* Select all remaining pages up to the next section boundary */ cur_nr_pages = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); - err = sparse_add_section(nid, pfn, cur_nr_pages, altmap); + err = sparse_add_section(nid, pfn, cur_nr_pages, altmap, + params->pgmap); if (err) break; cond_resched(); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 52f36527bab3..fb68e7764ba2 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -641,7 +641,8 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, } struct page * __meminit __populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap) + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned 
long end = start + nr_pages * sizeof(struct page); diff --git a/mm/sparse.c b/mm/sparse.c index 952f06d8f373..d2d76d158b39 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -427,7 +427,8 @@ static unsigned long __init section_map_size(void) } struct page __init *__populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap) + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { unsigned long size = section_map_size(); struct page *map = sparse_buffer_alloc(size); @@ -524,7 +525,7 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, break; map = __populate_section_memmap(pfn, PAGES_PER_SECTION, - nid, NULL); + nid, NULL, NULL); if (!map) { pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", __func__, nid); @@ -629,9 +630,10 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) #ifdef CONFIG_SPARSEMEM_VMEMMAP static struct page * __meminit populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap) + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { - return __populate_section_memmap(pfn, nr_pages, nid, altmap); + return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); } static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, @@ -700,7 +702,8 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) } #else struct page * __meminit populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap) + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { return kvmalloc_node(array_size(sizeof(struct page), PAGES_PER_SECTION), GFP_KERNEL, nid); @@ -823,7 +826,8 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, } static struct page * __meminit section_activate(int nid, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { struct mem_section *ms = __pfn_to_section(pfn); struct mem_section_usage *usage = NULL; @@ -855,7 +859,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, if (nr_pages < PAGES_PER_SECTION && early_section(ms)) return pfn_to_page(pfn); - memmap = populate_section_memmap(pfn, nr_pages, nid, altmap); + memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); if (!memmap) { section_deactivate(pfn, nr_pages, altmap); return ERR_PTR(-ENOMEM); @@ -869,7 +873,8 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, * @nid: The node to add section on * @start_pfn: start pfn of the memory range * @nr_pages: number of pfns to add in the section - * @altmap: device page map + * @altmap: alternate pfns to allocate the memmap backing store + * @pgmap: alternate compound page geometry for devmap mappings * * This is only intended for hotplug. * @@ -883,7 +888,8 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, * * -ENOMEM - Out of memory. 
*/ int __meminit sparse_add_section(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct mem_section *ms; @@ -894,7 +900,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, if (ret < 0) return ret; - memmap = section_activate(nid, start_pfn, nr_pages, altmap); + memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap); if (IS_ERR(memmap)) return PTR_ERR(memmap); -- cgit v1.2.3 From 2beea70a3edc03608ecc89b13ba9ba669c56b3fd Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: mm/sparse-vmemmap: refactor core of vmemmap_populate_basepages() to helper In preparation for describing a memmap with compound pages, move the actual pte population logic into a separate function vmemmap_populate_address() and have a new helper vmemmap_populate_range() walk through all base pages it needs to populate. While doing that, change the helper to use a pte_t* as return value, rather than an hardcoded errno of 0 or -ENOMEM. Link: https://lkml.kernel.org/r/20220420155310.9712-3-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Muchun Song Cc: Christoph Hellwig Cc: Dan Williams Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 53 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index fb68e7764ba2..ef15664c6b6c 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -608,38 +608,57 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) return pgd; } -int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, - int node, struct vmem_altmap *altmap) +static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node, + struct vmem_altmap *altmap) { - unsigned long addr = start; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return NULL; + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return NULL; + pud = vmemmap_pud_populate(p4d, addr, node); + if (!pud) + return NULL; + pmd = vmemmap_pmd_populate(pud, addr, node); + if (!pmd) + return NULL; + pte = vmemmap_pte_populate(pmd, addr, node, altmap); + if (!pte) + return NULL; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + return pte; +} + +static int __meminit vmemmap_populate_range(unsigned long start, + unsigned long end, int node, + struct vmem_altmap *altmap) +{ + unsigned long addr = start; + pte_t *pte; + for (; addr < end; addr += PAGE_SIZE) { - pgd = vmemmap_pgd_populate(addr, node); - if (!pgd) - return -ENOMEM; - p4d = vmemmap_p4d_populate(pgd, addr, node); - if (!p4d) - return -ENOMEM; - pud = vmemmap_pud_populate(p4d, addr, node); - if (!pud) - return -ENOMEM; - pmd = vmemmap_pmd_populate(pud, addr, node); - if (!pmd) - return -ENOMEM; - pte = vmemmap_pte_populate(pmd, addr, node, altmap); + pte = vmemmap_populate_address(addr, node, altmap); if (!pte) return -ENOMEM; - vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); } return 0; } +int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap) +{ + return vmemmap_populate_range(start, end, node, altmap); +} + struct page 
* __meminit __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap) -- cgit v1.2.3 From 60a427db0f80f16b9bb9efe6cc79c93f336e8466 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: mm/hugetlb_vmemmap: move comment block to Documentation/vm In preparation for device-dax for using hugetlbfs compound page tail deduplication technique, move the comment block explanation into a common place in Documentation/vm. Link: https://lkml.kernel.org/r/20220420155310.9712-4-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Muchun Song Reviewed-by: Dan Williams Suggested-by: Dan Williams Cc: Muchun Song Cc: Mike Kravetz Cc: Christoph Hellwig Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Vishal Verma Signed-off-by: Andrew Morton --- Documentation/vm/index.rst | 1 + Documentation/vm/vmemmap_dedup.rst | 173 +++++++++++++++++++++++++++++++++++++ mm/hugetlb_vmemmap.c | 168 +---------------------------------- 3 files changed, 175 insertions(+), 167 deletions(-) create mode 100644 Documentation/vm/vmemmap_dedup.rst diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index 44365c4574a3..2fb612bb72c9 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -37,5 +37,6 @@ algorithms. If you are looking for advice on simply allocating memory, see the transhuge unevictable-lru vmalloced-kernel-stacks + vmemmap_dedup z3fold zsmalloc diff --git a/Documentation/vm/vmemmap_dedup.rst b/Documentation/vm/vmemmap_dedup.rst new file mode 100644 index 000000000000..485ccf4f7b10 --- /dev/null +++ b/Documentation/vm/vmemmap_dedup.rst @@ -0,0 +1,173 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +Free some vmemmap pages of HugeTLB +================================== + +The struct page structures (page structs) are used to describe a physical +page frame. By default, there is a one-to-one mapping from a page frame to +it's corresponding page struct. + +HugeTLB pages consist of multiple base page size pages and is supported by many +architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more +details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are +currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page +consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages. +For each base page, there is a corresponding page struct. + +Within the HugeTLB subsystem, only the first 4 page structs are used to +contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides +this upper limit. The only 'useful' information in the remaining page structs +is the compound_head field, and this field is the same for all tail pages. + +By removing redundant page structs for HugeTLB pages, memory can be returned +to the buddy allocator for other uses. + +Different architectures support different HugeTLB pages. For example, the +following table is the HugeTLB page size supported by x86 and arm64 +architectures. Because arm64 supports 4k, 16k, and 64k base pages and +supports contiguous entries, so it supports many kinds of sizes of HugeTLB +page. 
+ ++--------------+-----------+-----------------------------------------------+ +| Architecture | Page Size | HugeTLB Page Size | ++--------------+-----------+-----------+-----------+-----------+-----------+ +| x86-64 | 4KB | 2MB | 1GB | | | ++--------------+-----------+-----------+-----------+-----------+-----------+ +| | 4KB | 64KB | 2MB | 32MB | 1GB | +| +-----------+-----------+-----------+-----------+-----------+ +| arm64 | 16KB | 2MB | 32MB | 1GB | | +| +-----------+-----------+-----------+-----------+-----------+ +| | 64KB | 2MB | 512MB | 16GB | | ++--------------+-----------+-----------+-----------+-----------+-----------+ + +When the system boot up, every HugeTLB page has more than one struct page +structs which size is (unit: pages):: + + struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE + +Where HugeTLB_Size is the size of the HugeTLB page. We know that the size +of the HugeTLB page is always n times PAGE_SIZE. So we can get the following +relationship:: + + HugeTLB_Size = n * PAGE_SIZE + +Then:: + + struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE + = n * sizeof(struct page) / PAGE_SIZE + +We can use huge mapping at the pud/pmd level for the HugeTLB page. + +For the HugeTLB page of the pmd level mapping, then:: + + struct_size = n * sizeof(struct page) / PAGE_SIZE + = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE + = sizeof(struct page) / sizeof(pte_t) + = 64 / 8 + = 8 (pages) + +Where n is how many pte entries which one page can contains. So the value of +n is (PAGE_SIZE / sizeof(pte_t)). + +This optimization only supports 64-bit system, so the value of sizeof(pte_t) +is 8. And this optimization also applicable only when the size of struct page +is a power of two. In most cases, the size of struct page is 64 bytes (e.g. +x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the +size of struct page structs of it is 8 page frames which size depends on the +size of the base page. + +For the HugeTLB page of the pud level mapping, then:: + + struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd) + = PAGE_SIZE / 8 * 8 (pages) + = PAGE_SIZE (pages) + +Where the struct_size(pmd) is the size of the struct page structs of a +HugeTLB page of the pmd level mapping. + +E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB +HugeTLB page consists in 4096. + +Next, we take the pmd level mapping of the HugeTLB page as an example to +show the internal implementation of this optimization. There are 8 pages +struct page structs associated with a HugeTLB page which is pmd mapped. + +Here is how things look before optimization:: + + HugeTLB struct pages(8 pages) page frame(8 pages) + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | -------------> | 2 | + | | +-----------+ +-----------+ + | | | 3 | -------------> | 3 | + | | +-----------+ +-----------+ + | | | 4 | -------------> | 4 | + | PMD | +-----------+ +-----------+ + | level | | 5 | -------------> | 5 | + | mapping | +-----------+ +-----------+ + | | | 6 | -------------> | 6 | + | | +-----------+ +-----------+ + | | | 7 | -------------> | 7 | + | | +-----------+ +-----------+ + | | + | | + | | + +-----------+ + +The value of page->compound_head is the same for all tail pages. 
The first +page of page structs (page 0) associated with the HugeTLB page contains the 4 +page structs necessary to describe the HugeTLB. The only use of the remaining +pages of page structs (page 1 to page 7) is to point to page->compound_head. +Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs +will be used for each HugeTLB page. This will allow us to free the remaining +7 pages to the buddy allocator. + +Here is how things look after remapping:: + + HugeTLB struct pages(8 pages) page frame(8 pages) + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | | + | | | 2 | -----------------+ | | | | | + | | +-----------+ | | | | | + | | | 3 | -------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | ---------------------+ | | | + | PMD | +-----------+ | | | + | level | | 5 | -----------------------+ | | + | mapping | +-----------+ | | + | | | 6 | -------------------------+ | + | | +-----------+ | + | | | 7 | ---------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ + +When a HugeTLB is freed to the buddy system, we should allocate 7 pages for +vmemmap pages and restore the previous mapping relationship. + +For the HugeTLB page of the pud level mapping. It is similar to the former. +We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. + +Apart from the HugeTLB page of the pmd/pud level mapping, some architectures +(e.g. aarch64) provides a contiguous bit in the translation table entries +that hints to the MMU to indicate that it is one of a contiguous set of +entries that can be cached in a single TLB entry. + +The contiguous bit is used to increase the mapping size at the pmd and pte +(last) level. So this type of HugeTLB page can be optimized only when its +size of the struct page structs is greater than 1 page. + +Notice: The head vmemmap page is not freed to the buddy allocator and all +tail vmemmap pages are mapped to the head vmemmap page frame. So we can see +more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) +associated with each HugeTLB page. The compound_head() can handle this +correctly (more details refer to the comment above compound_head()). diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 2655434a946b..29554c6ef2ae 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -6,173 +6,7 @@ * * Author: Muchun Song * - * The struct page structures (page structs) are used to describe a physical - * page frame. By default, there is a one-to-one mapping from a page frame to - * it's corresponding page struct. - * - * HugeTLB pages consist of multiple base page size pages and is supported by - * many architectures. See hugetlbpage.rst in the Documentation directory for - * more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB - * are currently supported. Since the base page size on x86 is 4KB, a 2MB - * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of - * 4096 base pages. For each base page, there is a corresponding page struct. - * - * Within the HugeTLB subsystem, only the first 4 page structs are used to - * contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides - * this upper limit. The only 'useful' information in the remaining page structs - * is the compound_head field, and this field is the same for all tail pages. 
- * - * By removing redundant page structs for HugeTLB pages, memory can be returned - * to the buddy allocator for other uses. - * - * Different architectures support different HugeTLB pages. For example, the - * following table is the HugeTLB page size supported by x86 and arm64 - * architectures. Because arm64 supports 4k, 16k, and 64k base pages and - * supports contiguous entries, so it supports many kinds of sizes of HugeTLB - * page. - * - * +--------------+-----------+-----------------------------------------------+ - * | Architecture | Page Size | HugeTLB Page Size | - * +--------------+-----------+-----------+-----------+-----------+-----------+ - * | x86-64 | 4KB | 2MB | 1GB | | | - * +--------------+-----------+-----------+-----------+-----------+-----------+ - * | | 4KB | 64KB | 2MB | 32MB | 1GB | - * | +-----------+-----------+-----------+-----------+-----------+ - * | arm64 | 16KB | 2MB | 32MB | 1GB | | - * | +-----------+-----------+-----------+-----------+-----------+ - * | | 64KB | 2MB | 512MB | 16GB | | - * +--------------+-----------+-----------+-----------+-----------+-----------+ - * - * When the system boot up, every HugeTLB page has more than one struct page - * structs which size is (unit: pages): - * - * struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE - * - * Where HugeTLB_Size is the size of the HugeTLB page. We know that the size - * of the HugeTLB page is always n times PAGE_SIZE. So we can get the following - * relationship. - * - * HugeTLB_Size = n * PAGE_SIZE - * - * Then, - * - * struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE - * = n * sizeof(struct page) / PAGE_SIZE - * - * We can use huge mapping at the pud/pmd level for the HugeTLB page. - * - * For the HugeTLB page of the pmd level mapping, then - * - * struct_size = n * sizeof(struct page) / PAGE_SIZE - * = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE - * = sizeof(struct page) / sizeof(pte_t) - * = 64 / 8 - * = 8 (pages) - * - * Where n is how many pte entries which one page can contains. So the value of - * n is (PAGE_SIZE / sizeof(pte_t)). - * - * This optimization only supports 64-bit system, so the value of sizeof(pte_t) - * is 8. And this optimization also applicable only when the size of struct page - * is a power of two. In most cases, the size of struct page is 64 bytes (e.g. - * x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the - * size of struct page structs of it is 8 page frames which size depends on the - * size of the base page. - * - * For the HugeTLB page of the pud level mapping, then - * - * struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd) - * = PAGE_SIZE / 8 * 8 (pages) - * = PAGE_SIZE (pages) - * - * Where the struct_size(pmd) is the size of the struct page structs of a - * HugeTLB page of the pmd level mapping. - * - * E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB - * HugeTLB page consists in 4096. - * - * Next, we take the pmd level mapping of the HugeTLB page as an example to - * show the internal implementation of this optimization. There are 8 pages - * struct page structs associated with a HugeTLB page which is pmd mapped. - * - * Here is how things look before optimization. 
- * - * HugeTLB struct pages(8 pages) page frame(8 pages) - * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - * | | | 0 | -------------> | 0 | - * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | -------------> | 2 | - * | | +-----------+ +-----------+ - * | | | 3 | -------------> | 3 | - * | | +-----------+ +-----------+ - * | | | 4 | -------------> | 4 | - * | PMD | +-----------+ +-----------+ - * | level | | 5 | -------------> | 5 | - * | mapping | +-----------+ +-----------+ - * | | | 6 | -------------> | 6 | - * | | +-----------+ +-----------+ - * | | | 7 | -------------> | 7 | - * | | +-----------+ +-----------+ - * | | - * | | - * | | - * +-----------+ - * - * The value of page->compound_head is the same for all tail pages. The first - * page of page structs (page 0) associated with the HugeTLB page contains the 4 - * page structs necessary to describe the HugeTLB. The only use of the remaining - * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs - * will be used for each HugeTLB page. This will allow us to free the remaining - * 7 pages to the buddy allocator. - * - * Here is how things look after remapping. - * - * HugeTLB struct pages(8 pages) page frame(8 pages) - * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - * | | | 0 | -------------> | 0 | - * | | +-----------+ +-----------+ - * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | | - * | | | 2 | -----------------+ | | | | | - * | | +-----------+ | | | | | - * | | | 3 | -------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | ---------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | -----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | -------------------------+ | - * | | +-----------+ | - * | | | 7 | ---------------------------+ - * | | +-----------+ - * | | - * | | - * | | - * +-----------+ - * - * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for - * vmemmap pages and restore the previous mapping relationship. - * - * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. - * - * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures - * (e.g. aarch64) provides a contiguous bit in the translation table entries - * that hints to the MMU to indicate that it is one of a contiguous set of - * entries that can be cached in a single TLB entry. - * - * The contiguous bit is used to increase the mapping size at the pmd and pte - * (last) level. So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 1 page. - * - * Notice: The head vmemmap page is not freed to the buddy allocator and all - * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see - * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) - * associated with each HugeTLB page. The compound_head() can handle this - * correctly (more details refer to the comment above compound_head()). 
+ * See Documentation/vm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt -- cgit v1.2.3 From 4917f55b4ef963e2d2288fe4eb651728be8db406 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 28 Apr 2022 23:16:16 -0700 Subject: mm/sparse-vmemmap: improve memory savings for compound devmaps A compound devmap is a dev_pagemap with @vmemmap_shift > 0 and it means that pages are mapped at a given huge page alignment and utilize uses compound pages as opposed to order-0 pages. Take advantage of the fact that most tail pages look the same (except the first two) to minimize struct page overhead. Allocate a separate page for the vmemmap area which contains the head page and separate for the next 64 pages. The rest of the subsections then reuse this tail vmemmap page to initialize the rest of the tail pages. Sections are arch-dependent (e.g. on x86 it's 64M, 128M or 512M) and when initializing compound devmap with big enough @vmemmap_shift (e.g. 1G PUD) it may cross multiple sections. The vmemmap code needs to consult @pgmap so that multiple sections that all map the same tail data can refer back to the first copy of that data for a given gigantic page. On compound devmaps with 2M align, this mechanism lets 6 pages be saved out of the 8 necessary PFNs necessary to set the subsection's 512 struct pages being mapped. On a 1G compound devmap it saves 4094 pages. Altmap isn't supported yet, given various restrictions in altmap pfn allocator, thus fallback to the already in use vmemmap_populate(). It is worth noting that altmap for devmap mappings was there to relieve the pressure of inordinate amounts of memmap space to map terabytes of pmem. With compound pages the motivation for altmaps for pmem gets reduced. Link: https://lkml.kernel.org/r/20220420155310.9712-5-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Muchun Song Cc: Christoph Hellwig Cc: Dan Williams Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Vishal Verma Signed-off-by: Andrew Morton --- Documentation/vm/vmemmap_dedup.rst | 56 +++++++++++++++- include/linux/mm.h | 2 +- mm/memremap.c | 1 + mm/sparse-vmemmap.c | 132 ++++++++++++++++++++++++++++++++++--- 4 files changed, 177 insertions(+), 14 deletions(-) diff --git a/Documentation/vm/vmemmap_dedup.rst b/Documentation/vm/vmemmap_dedup.rst index 485ccf4f7b10..c9c495f62d12 100644 --- a/Documentation/vm/vmemmap_dedup.rst +++ b/Documentation/vm/vmemmap_dedup.rst @@ -1,8 +1,11 @@ .. SPDX-License-Identifier: GPL-2.0 -================================== -Free some vmemmap pages of HugeTLB -================================== +========================================= +A vmemmap diet for HugeTLB and Device DAX +========================================= + +HugeTLB +======= The struct page structures (page structs) are used to describe a physical page frame. By default, there is a one-to-one mapping from a page frame to @@ -171,3 +174,50 @@ tail vmemmap pages are mapped to the head vmemmap page frame. So we can see more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) associated with each HugeTLB page. The compound_head() can handle this correctly (more details refer to the comment above compound_head()). + +Device DAX +========== + +The device-dax interface uses the same tail deduplication technique explained +in the previous chapter, except when used with the vmemmap in +the device (altmap). 
+ +The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64), +PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64). + +The differences with HugeTLB are relatively minor. + +It only use 3 page structs for storing all information as opposed +to 4 on HugeTLB pages. + +There's no remapping of vmemmap given that device-dax memory is not part of +System RAM ranges initialized at boot. Thus the tail page deduplication +happens at a later stage when we populate the sections. HugeTLB reuses the +the head vmemmap page representing, whereas device-dax reuses the tail +vmemmap page. This results in only half of the savings compared to HugeTLB. + +Deduplicated tail pages are not mapped read-only. + +Here's how things look like on device-dax after the sections are populated:: + + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | ----------------^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | + | | | 3 | ------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | --------------------+ | | | + | PMD | +-----------+ | | | + | level | | 5 | ----------------------+ | | + | mapping | +-----------+ | | + | | | 6 | ------------------------+ | + | | +-----------+ | + | | | 7 | --------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bba49387e9..b9316b2c11ce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3161,7 +3161,7 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, - struct vmem_altmap *altmap); + struct vmem_altmap *altmap, struct page *reuse); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, diff --git a/mm/memremap.c b/mm/memremap.c index af0223605e69..c33bcd0398c9 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -287,6 +287,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) { struct mhp_params params = { .altmap = pgmap_altmap(pgmap), + .pgmap = pgmap, .pgprot = PAGE_KERNEL, }; const int nr_range = pgmap->nr_range; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index ef15664c6b6c..f4fa61dbbee3 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -533,16 +533,31 @@ void __meminit vmemmap_verify(pte_t *pte, int node, } pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, + struct page *reuse) { pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { pte_t entry; void *p; - p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); - if (!p) - return NULL; + if (!reuse) { + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); + if (!p) + return NULL; + } else { + /* + * When a PTE/PMD entry is freed from the init_mm + * there's a a free_pages() call to this page allocated + * above. Thus this get_page() is paired with the + * put_page_testzero() on the freeing path. + * This can only called by certain ZONE_DEVICE path, + * and through vmemmap_populate_compound_pages() when + * slab is available. 
+ */ + get_page(reuse); + p = page_to_virt(reuse); + } entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); set_pte_at(&init_mm, addr, pte, entry); } @@ -609,7 +624,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) } static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, + struct page *reuse) { pgd_t *pgd; p4d_t *p4d; @@ -629,7 +645,7 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node, pmd = vmemmap_pmd_populate(pud, addr, node); if (!pmd) return NULL; - pte = vmemmap_pte_populate(pmd, addr, node, altmap); + pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse); if (!pte) return NULL; vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); @@ -639,13 +655,14 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node, static int __meminit vmemmap_populate_range(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, + struct page *reuse) { unsigned long addr = start; pte_t *pte; for (; addr < end; addr += PAGE_SIZE) { - pte = vmemmap_populate_address(addr, node, altmap); + pte = vmemmap_populate_address(addr, node, altmap, reuse); if (!pte) return -ENOMEM; } @@ -656,7 +673,95 @@ static int __meminit vmemmap_populate_range(unsigned long start, int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { - return vmemmap_populate_range(start, end, node, altmap); + return vmemmap_populate_range(start, end, node, altmap, NULL); +} + +/* + * For compound pages bigger than section size (e.g. x86 1G compound + * pages with 2M subsection size) fill the rest of sections as tail + * pages. + * + * Note that memremap_pages() resets @nr_range value and will increment + * it after each range successful onlining. Thus the value or @nr_range + * at section memmap populate corresponds to the in-progress range + * being onlined here. + */ +static bool __meminit reuse_compound_section(unsigned long start_pfn, + struct dev_pagemap *pgmap) +{ + unsigned long nr_pages = pgmap_vmemmap_nr(pgmap); + unsigned long offset = start_pfn - + PHYS_PFN(pgmap->ranges[pgmap->nr_range].start); + + return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION; +} + +static pte_t * __meminit compound_section_tail_page(unsigned long addr) +{ + pte_t *pte; + + addr -= PAGE_SIZE; + + /* + * Assuming sections are populated sequentially, the previous section's + * page data can be reused. + */ + pte = pte_offset_kernel(pmd_off_k(addr), addr); + if (!pte) + return NULL; + + return pte; +} + +static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap) +{ + unsigned long size, addr; + pte_t *pte; + int rc; + + if (reuse_compound_section(start_pfn, pgmap)) { + pte = compound_section_tail_page(start); + if (!pte) + return -ENOMEM; + + /* + * Reuse the page that was populated in the prior iteration + * with just tail struct pages. 
+ */ + return vmemmap_populate_range(start, end, node, NULL, + pte_page(*pte)); + } + + size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page)); + for (addr = start; addr < end; addr += size) { + unsigned long next = addr, last = addr + size; + + /* Populate the head page vmemmap page */ + pte = vmemmap_populate_address(addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + /* Populate the tail pages vmemmap page */ + next = addr + PAGE_SIZE; + pte = vmemmap_populate_address(next, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + /* + * Reuse the previous page for the rest of tail pages + * See layout diagram in Documentation/vm/vmemmap_dedup.rst + */ + next += PAGE_SIZE; + rc = vmemmap_populate_range(next, last, node, NULL, + pte_page(*pte)); + if (rc) + return -ENOMEM; + } + + return 0; } struct page * __meminit __populate_section_memmap(unsigned long pfn, @@ -665,12 +770,19 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn, { unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); + int r; if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) || !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION))) return NULL; - if (vmemmap_populate(start, end, nid, altmap)) + if (is_power_of_2(sizeof(struct page)) && + pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap) + r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap); + else + r = vmemmap_populate(start, end, nid, altmap); + + if (r < 0) return NULL; return pfn_to_page(pfn); -- cgit v1.2.3 From 6fd3620b342861de9547ea01d28f664892ef51a1 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 28 Apr 2022 23:16:16 -0700 Subject: mm/page_alloc: reuse tail struct pages for compound devmaps Currently memmap_init_zone_device() ends up initializing 32768 pages when it only needs to initialize 128 given tail page reuse. That number is worse with 1GB compound pages, 262144 instead of 128. Update memmap_init_zone_device() to skip redundant initialization, detailed below. When a pgmap @vmemmap_shift is set, all pages are mapped at a given huge page alignment and use compound pages to describe them as opposed to a struct per 4K. With @vmemmap_shift > 0 and when struct pages are stored in ram (!altmap) most tail pages are reused. Consequently, the amount of unique struct pages is a lot smaller than the total amount of struct pages being mapped. The altmap path is left alone since it does not support memory savings based on compound pages devmap. Link: https://lkml.kernel.org/r/20220420155310.9712-6-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Muchun Song Cc: Christoph Hellwig Cc: Dan Williams Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_alloc.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 298c7691fb1d..e80858f5c2ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6596,6 +6596,21 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, } } +/* + * With compound page geometry and when struct pages are stored in ram most + * tail pages are reused. Consequently, the amount of unique struct pages to + * initialize is a lot smaller that the total amount of struct pages being + * mapped. 
This is a paired / mild layering violation with explicit knowledge + * of how the sparse_vmemmap internals handle compound pages in the lack + * of an altmap. See vmemmap_populate_compound_pages(). + */ +static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, + unsigned long nr_pages) +{ + return is_power_of_2(sizeof(struct page)) && + !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages; +} + static void __ref memmap_init_compound(struct page *head, unsigned long head_pfn, unsigned long zone_idx, int nid, @@ -6660,7 +6675,7 @@ void __ref memmap_init_zone_device(struct zone *zone, continue; memmap_init_compound(page, pfn, zone_idx, nid, pgmap, - pfns_per_compound); + compound_nr_pages(altmap, pfns_per_compound)); } pr_info("%s initialised %lu pages in %ums\n", __func__, -- cgit v1.2.3 From ba91fb7dd03c4f463453f28af24851097a6547fb Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 28 Apr 2022 23:16:16 -0700 Subject: include/linux/swapops.h: remove stub for non_swap_entry() The stub for non_swap_entry() may not help much, because MAX_SWAPFILES has already contained all the information to decide whether a swap entry is real swap entry or pesudo ones (migrations, ...). There can be some performance influences on non_swap_entry() with below conditions all met: !CONFIG_MIGRATION && !CONFIG_MEMORY_FAILURE && !CONFIG_DEVICE_PRIVATE But that's definitely not the major config most machines will use, at the meantime it's already in a slow path of swap entry (being parsed from a swap pte), so IMHO it shouldn't be a major issue. Also according to the analysis from Alistair, somehow the stub didn't do the job right [1]. To make the code cleaner, let's drop the stub. [1] https://lore.kernel.org/lkml/8735ihbw6g.fsf@nvdebian.thelocal/ Note: the uffd-wp shmem & hugetlbfs series will need this patch to make sure swap entries work as expected with below config as spotted by Alistair: !CONFIG_MIGRATION && !CONFIG_MEMORY_FAILURE && !CONFIG_DEVICE_PRIVATE && CONFIG_PTE_MARKER (PS: this config should mostly never gonna happen, though, afaict..) Link: https://lkml.kernel.org/r/20220413191147.66645-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Alistair Popple Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/swapops.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index d356ab4047f7..5af852b68805 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -387,18 +387,10 @@ static inline void num_poisoned_pages_inc(void) } #endif -#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \ - defined(CONFIG_DEVICE_PRIVATE) static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } -#else -static inline int non_swap_entry(swp_entry_t entry) -{ - return 0; -} -#endif #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ -- cgit v1.2.3 From 7609385337a4feb6236e42dcd0df2185683ce839 Mon Sep 17 00:00:00 2001 From: xu xin Date: Thu, 28 Apr 2022 23:16:16 -0700 Subject: ksm: count ksm merging pages for each process Some applications or containers want to use KSM by calling madvise() to advise areas of address space to be MERGEABLE. But they may not know which applications are more likely to cause real merges in the deployment. If this patch is applied, it helps them know their corresponding number of merged pages, and then optimize their app code. As current KSM only counts the number of KSM merging pages(e.g. 
ksm_pages_sharing and ksm_pages_shared) of the whole system, we cannot see the more fine-grained KSM merging, for the upper application optimization, the merging area cannot be set easily according to the KSM page merging probability of each process. Therefore, it is necessary to add extra statistical means so that the upper level users can know the detailed KSM merging information of each process. We add a new proc file named as ksm_merging_pages under /proc// to indicate the involved ksm merging pages of this process. [akpm@linux-foundation.org: fix comment typo, remove BUG_ON()s] Link: https://lkml.kernel.org/r/20220325082318.2352853-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Reported-by: kernel test robot Reviewed-by: Yang Yang Reviewed-by: Ran Xiaokai Reported-by: Zeal Robot Cc: Kees Cook Cc: Alexey Dobriyan Cc: Stephen Rothwell Cc: Ohhoon Kwon Cc: Matthew Wilcox (Oracle) Cc: Stephen Brennan Cc: Vlastimil Babka Cc: Feng Tang Cc: Yang Yang Cc: Ran Xiaokai Cc: Zeal Robot Signed-off-by: Andrew Morton --- fs/proc/base.c | 22 ++++++++++++++++++++++ include/linux/mm_types.h | 7 +++++++ mm/ksm.c | 8 ++++++++ 3 files changed, 37 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index c1031843cc6a..8dfa36a99c74 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3154,6 +3154,22 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_LIVEPATCH */ +#ifdef CONFIG_KSM +static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "%lu\n", mm->ksm_merging_pages); + mmput(mm); + } + + return 0; +} +#endif /* CONFIG_KSM */ + #ifdef CONFIG_STACKLEAK_METRICS static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) @@ -3285,6 +3301,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif +#ifdef CONFIG_KSM + ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3618,6 +3637,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif +#ifdef CONFIG_KSM + ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8834e38c06a4..87eddd509de2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -654,6 +654,13 @@ struct mm_struct { #ifdef CONFIG_IOMMU_SVA u32 pasid; +#endif +#ifdef CONFIG_KSM + /* + * Represent how many pages of this process are involved in KSM + * merging. 
+ */ + unsigned long ksm_merging_pages; #endif } __randomize_layout; diff --git a/mm/ksm.c b/mm/ksm.c index 063a48eeb5ee..94bb0f049806 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -638,6 +638,9 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) ksm_pages_sharing--; else ksm_pages_shared--; + + rmap_item->mm->ksm_merging_pages--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); @@ -785,6 +788,9 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ksm_pages_sharing--; else ksm_pages_shared--; + + rmap_item->mm->ksm_merging_pages--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; @@ -2007,6 +2013,8 @@ static void stable_tree_append(struct rmap_item *rmap_item, ksm_pages_sharing++; else ksm_pages_shared++; + + rmap_item->mm->ksm_merging_pages++; } /* -- cgit v1.2.3 From 94bfe85bde18a99612bb5d0ce348643c2d352836 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Thu, 28 Apr 2022 23:16:16 -0700 Subject: mm/vmstat: add events for ksm cow Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want to save memory, it's a tradeoff by suffering delay on ksm cow. Users can get to know how much memory ksm saved by reading /sys/kernel/mm/ksm/pages_sharing, but they don't know what's the costs of ksm cow, and this is important of some delay sensitive tasks. So add ksm cow events to help users evaluate whether or how to use ksm. Also update Documentation/admin-guide/mm/ksm.rst with new added events. Link: https://lkml.kernel.org/r/20220331035616.2390805-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reviewed-by: David Hildenbrand Reviewed-by: xu xin Reviewed-by: Ran Xiaokai Cc: Matthew Wilcox (Oracle) Cc: Jonathan Corbet Cc: Dave Hansen Cc: Saravanan D Cc: Minchan Kim Cc: John Hubbard Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 18 ++++++++++++++++++ include/linux/vm_event_item.h | 3 +++ mm/memory.c | 4 ++++ mm/vmstat.c | 3 +++ 4 files changed, 28 insertions(+) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index 97d816791aca..b244f0202a03 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -184,6 +184,24 @@ The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the ``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must be increased accordingly. +Monitoring KSM events +===================== + +There are some counters in /proc/vmstat that may be used to monitor KSM events. +KSM might help save memory, it's a tradeoff by may suffering delay on KSM COW +or on swapping in copy. Those events could help users evaluate whether or how +to use KSM. For example, if cow_ksm increases too fast, user may decrease the +range of madvise(, , MADV_MERGEABLE). + +cow_ksm + is incremented every time a KSM page triggers copy on write (COW) + when users try to write to a KSM page, we have to make a copy. + +ksm_swpin_copy + is incremented every time a KSM page is copied when swapping in + note that KSM page might be copied when swapping in because do_swap_page() + cannot do all the locking needed to reconstitute a cross-anon_vma KSM page. 
+ -- Izik Eidus, Hugh Dickins, 17 Nov 2009 diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 16a0a4fd000b..e83967e4c20e 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -133,6 +133,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSM_SWPIN_COPY, #endif #endif +#ifdef CONFIG_KSM + COW_KSM, +#endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, diff --git a/mm/memory.c b/mm/memory.c index 8895dd2f2255..e873a6143181 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3331,6 +3331,10 @@ copy: get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); +#ifdef CONFIG_KSM + if (PageKsm(vmf->page)) + count_vm_event(COW_KSM); +#endif return wp_page_copy(vmf); } diff --git a/mm/vmstat.c b/mm/vmstat.c index f2d0dec1062d..b94a2e4723ff 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1393,6 +1393,9 @@ const char * const vmstat_text[] = { "ksm_swpin_copy", #endif #endif +#ifdef CONFIG_KSM + "cow_ksm", +#endif #ifdef CONFIG_X86 "direct_map_level2_splits", "direct_map_level3_splits", -- cgit v1.2.3 From 024c61eaff17f6a8c73cb8b25137251f0c3ef039 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:17 -0700 Subject: mm: compaction: remove unneeded return value of kcompactd_run Patch series "A few cleanup and fixup patches for compaction". This series contains a few patches to clean up some obsolete comment, remove unneeded return value and so on. Also we fix the possible NULL pointer dereference. More details can be found in the respective changelogs. This patch (of 12): The return value of kcompactd_run() is unused now. Clean it up. Link: https://lkml.kernel.org/r/20220418141253.24298-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220418141253.24298-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc; Mel Gorman Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Pintu Kumar Cc: Charan Teja Kalla Signed-off-by: Andrew Morton --- include/linux/compaction.h | 5 ++--- mm/compaction.c | 7 ++----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 34bce35c808d..52a9ff65faee 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -177,7 +177,7 @@ static inline bool compaction_withdrawn(enum compact_result result) bool compaction_zonelist_suitable(struct alloc_context *ac, int order, int alloc_flags); -extern int kcompactd_run(int nid); +extern void kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); @@ -212,9 +212,8 @@ static inline bool compaction_withdrawn(enum compact_result result) return true; } -static inline int kcompactd_run(int nid) +static inline void kcompactd_run(int nid) { - return 0; } static inline void kcompactd_stop(int nid) { diff --git a/mm/compaction.c b/mm/compaction.c index baf654b5e073..2e838992b324 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3016,21 +3016,18 @@ static int kcompactd(void *p) * This kcompactd start function will be called by init and node-hot-add. * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. 
*/ -int kcompactd_run(int nid) +void kcompactd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - int ret = 0; if (pgdat->kcompactd) - return 0; + return; pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); if (IS_ERR(pgdat->kcompactd)) { pr_err("Failed to start kcompactd on node %d\n", nid); - ret = PTR_ERR(pgdat->kcompactd); pgdat->kcompactd = NULL; } - return ret; } /* -- cgit v1.2.3 From 02d04a5163cd6bad66fd6091d2b4f53d06052aff Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:17 -0700 Subject: mm: compaction: remove unneeded pfn update pfn is unused in this do while loop. Remove the unneeded pfn update. Link: https://lkml.kernel.org/r/20220418141253.24298-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 2e838992b324..f9e628a5306a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -317,7 +317,6 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, } page += (1 << PAGE_ALLOC_COSTLY_ORDER); - pfn += (1 << PAGE_ALLOC_COSTLY_ORDER); } while (page <= end_page); return false; -- cgit v1.2.3 From 00bc102f82e0a0c70f1d4c9fdeb4c0cbc622d777 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:17 -0700 Subject: mm: compaction: remove unneeded assignment to isolate_start_pfn isolate_start_pfn is unused when cc->nr_freepages ! = 0. Otherwise cc->free_pfn will overwrite it unconditionally. So we should remove this unneeded and somewhat misleading assignment. Link: https://lkml.kernel.org/r/20220418141253.24298-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index f9e628a5306a..ee2ddf77191f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1586,7 +1586,7 @@ static void isolate_freepages(struct compact_control *cc) unsigned int stride; /* Try a small search of the free lists for a candidate */ - isolate_start_pfn = fast_isolate_freepages(cc); + fast_isolate_freepages(cc); if (cc->nr_freepages) goto splitmap; -- cgit v1.2.3 From d56c15845a5493dbe9e8b77f63418bea117d1221 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:17 -0700 Subject: mm: compaction: clean up comment for sched contention Since commit cf66f0700c8f ("mm, compaction: do not consider a need to reschedule as contention"), async compaction won't abort when scheduling is needed. Correct the relevant comment accordingly. Link: https://lkml.kernel.org/r/20220418141253.24298-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 11 ++++------- mm/internal.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index ee2ddf77191f..e839b26fb3d8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -513,15 +513,12 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, * very heavily contended. The lock should be periodically unlocked to avoid * having disabled IRQs for a long time, even when there is nobody waiting on * the lock. 
It might also be that allowing the IRQs will result in - * need_resched() becoming true. If scheduling is needed, async compaction - * aborts. Sync compaction schedules. + * need_resched() becoming true. If scheduling is needed, compaction schedules. * Either compaction type will also abort if a fatal signal is pending. * In either case if the lock was locked, it is dropped and not regained. * - * Returns true if compaction should abort due to fatal signal pending, or - * async compaction due to need_resched() - * Returns false when compaction can continue (sync compaction might have - * scheduled) + * Returns true if compaction should abort due to fatal signal pending. + * Returns false when compaction can continue. */ static bool compact_unlock_should_abort(spinlock_t *lock, unsigned long flags, bool *locked, struct compact_control *cc) @@ -574,7 +571,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* * Periodically drop the lock (if held) regardless of its * contention, to give chance to IRQs. Abort if fatal signal - * pending or async compaction detects need_resched() + * pending. */ if (!(blockpfn % SWAP_CLUSTER_MAX) && compact_unlock_should_abort(&cc->zone->lock, flags, diff --git a/mm/internal.h b/mm/internal.h index 69a5eabe0943..ddd09245a6db 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -402,7 +402,7 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ - bool contended; /* Signal lock or sched contention */ + bool contended; /* Signal lock contention */ bool rescan; /* Rescanning the same pageblock */ bool alloc_contig; /* alloc_contig_range allocation */ }; -- cgit v1.2.3 From 85f73e6d752de325d7570293e909140bc3280d37 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:17 -0700 Subject: mm: compaction: clean up comment about suitable migration target recheck checked_pageblock is already removed and suitable_migration_target is not rechecked under the zone lock since commit f8224aa5a0a4 ("mm, compaction: do not recheck suitable_migration_target under lock"). Correct the comment accordingly. Link: https://lkml.kernel.org/r/20220418141253.24298-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e839b26fb3d8..97821b7d7a4f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -599,13 +599,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (!PageBuddy(page)) goto isolate_fail; - /* - * If we already hold the lock, we can skip some rechecking. - * Note that if we hold the lock now, checked_pageblock was - * already set in some previous iteration (or strict is true), - * so it is correct to skip the suitable migration target - * recheck as well. - */ + /* If we already hold the lock, we can skip some rechecking. */ if (!locked) { locked = compact_lock_irqsave(&cc->zone->lock, &flags, cc); -- cgit v1.2.3 From c036ddffe4acb0a35d2f999e3a655c89c7a5b512 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:18 -0700 Subject: mm: compaction: use COMPACT_CLUSTER_MAX in compaction.c Always use COMPACT_CLUSTER_MAX here as we're doing the compaction. Minor improvements in readability. 
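As an aside, the batching idiom that COMPACT_CLUSTER_MAX drives is easy to see in a minimal stand-alone C sketch; the value 32 and the drop_lock_and_resched() helper are stand-ins for the kernel's SWAP_CLUSTER_MAX-derived constant and its unlock/cond_resched() points, not kernel code:

    #include <stdio.h>

    #define COMPACT_CLUSTER_MAX 32  /* stand-in; the kernel derives this from SWAP_CLUSTER_MAX */

    static unsigned long yields;

    /* Models the unlock + cond_resched() point in the real scanners. */
    static void drop_lock_and_resched(void)
    {
        yields++;
    }

    static void scan(unsigned long start_pfn, unsigned long end_pfn)
    {
        for (unsigned long pfn = start_pfn; pfn < end_pfn; pfn++) {
            /* Every COMPACT_CLUSTER_MAX pages, give IRQs and the scheduler a chance. */
            if (!(pfn % COMPACT_CLUSTER_MAX))
                drop_lock_and_resched();
            /* ... isolate or skip the page at pfn ... */
        }
    }

    int main(void)
    {
        scan(0, 128);
        printf("scanned 128 pfns, yielded %lu times\n", yields);
        return 0;
    }

Naming the constant COMPACT_CLUSTER_MAX rather than SWAP_CLUSTER_MAX does not change the batch size, it only makes the compaction scanners read as compaction code, which is the readability point of the hunks below.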
Link: https://lkml.kernel.org/r/20220418141253.24298-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 97821b7d7a4f..1fc912ec3a98 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -573,7 +573,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * contention, to give chance to IRQs. Abort if fatal signal * pending. */ - if (!(blockpfn % SWAP_CLUSTER_MAX) + if (!(blockpfn % COMPACT_CLUSTER_MAX) && compact_unlock_should_abort(&cc->zone->lock, flags, &locked, cc)) break; @@ -862,7 +862,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * contention, to give chance to IRQs. Abort completely if * a fatal signal is pending. */ - if (!(low_pfn % SWAP_CLUSTER_MAX)) { + if (!(low_pfn % COMPACT_CLUSTER_MAX)) { if (locked) { unlock_page_lruvec_irqrestore(locked, flags); locked = NULL; @@ -1614,7 +1614,7 @@ static void isolate_freepages(struct compact_control *cc) * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check resched. */ - if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) + if (!(block_start_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages))) cond_resched(); page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, @@ -1921,7 +1921,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) * many pageblocks unsuitable, so periodically check if we * need to schedule. */ - if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) + if (!(low_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages))) cond_resched(); page = pageblock_pfn_to_page(block_start_pfn, -- cgit v1.2.3 From 66fe1cf7f581593750931266ad8f57c744e4b750 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:18 -0700 Subject: mm: compaction: use helper compound_nr in isolate_migratepages_block Use helper compound_nr to make use of compound_nr when CONFIG_64BIT and simplify the code a bit. Link: https://lkml.kernel.org/r/20220418141253.24298-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 1fc912ec3a98..55013a2af817 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -908,7 +908,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Do not report -EBUSY down the chain */ if (ret == -EBUSY) ret = 0; - low_pfn += (1UL << compound_order(page)) - 1; + low_pfn += compound_nr(page) - 1; goto isolate_fail; } -- cgit v1.2.3 From 556162bf3a8c27dd6aab180e9a1427d945a708e9 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:18 -0700 Subject: mm: compaction: clean up comment about async compaction in isolate_migratepages Since commit 282722b0d258 ("mm, compaction: restrict async compaction to pageblocks of same migratetype"), async direct compaction is restricted to scan the pageblocks of same migratetype. Correct the comment accordingly. 
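A toy model of the rule the corrected comment describes, assuming a made-up suitable_source() helper rather than the kernel's suitable_migration_source() (which also treats CMA blocks as movable and checks skip hints):

    #include <stdbool.h>
    #include <stdio.h>

    enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE };

    /*
     * Toy version of the rule: only async *direct* compaction restricts the
     * migration scanner to pageblocks of the allocation's own migratetype.
     */
    static bool suitable_source(bool direct, bool async,
                                enum migratetype block_mt, enum migratetype alloc_mt)
    {
        if (!(direct && async))
            return true;
        return block_mt == alloc_mt;
    }

    int main(void)
    {
        printf("async direct, movable block for movable alloc:   %d\n",
               suitable_source(true, true, MIGRATE_MOVABLE, MIGRATE_MOVABLE));
        printf("async direct, unmovable block for movable alloc: %d\n",
               suitable_source(true, true, MIGRATE_UNMOVABLE, MIGRATE_MOVABLE));
        printf("kcompactd (not direct), unmovable block:         %d\n",
               suitable_source(false, true, MIGRATE_UNMOVABLE, MIGRATE_MOVABLE));
        return 0;
    }

Sync compaction and kcompactd scan every pageblock; only the optimistic async direct path applies the same-migratetype restriction, which is what the reworded comment below spells out.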
Link: https://lkml.kernel.org/r/20220418141253.24298-9-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 55013a2af817..562f274b2c51 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1941,12 +1941,12 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) continue; /* - * For async compaction, also only scan in MOVABLE blocks - * without huge pages. Async compaction is optimistic to see - * if the minimum amount of work satisfies the allocation. - * The cached PFN is updated as it's possible that all - * remaining blocks between source and target are unsuitable - * and the compaction scanners fail to meet. + * For async direct compaction, only scan the pageblocks of the + * same migratetype without huge pages. Async direct compaction + * is optimistic to see if the minimum amount of work satisfies + * the allocation. The cached PFN is updated as it's possible + * that all remaining blocks between source and target are + * unsuitable and the compaction scanners fail to meet. */ if (!suitable_migration_source(cc, page)) { update_cached_migrate(cc, block_end_pfn); -- cgit v1.2.3 From 3109de308987ceae413ee015038d51e2a86c7806 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:18 -0700 Subject: mm: compaction: avoid possible NULL pointer dereference in kcompactd_cpu_online It's possible that kcompactd_run could fail to run kcompactd for a hot added node and leave pgdat->kcompactd as NULL. So pgdat->kcompactd should be checked here to avoid possible NULL pointer dereference. Link: https://lkml.kernel.org/r/20220418141253.24298-10-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 562f274b2c51..82c54d70a978 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3052,7 +3052,8 @@ static int kcompactd_cpu_online(unsigned int cpu) if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ - set_cpus_allowed_ptr(pgdat->kcompactd, mask); + if (pgdat->kcompactd) + set_cpus_allowed_ptr(pgdat->kcompactd, mask); } return 0; } -- cgit v1.2.3 From cff387d6a2949a46d78a4f558938ef7b4cccd41f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:18 -0700 Subject: mm: compaction: make compaction_zonelist_suitable return false when COMPACT_SUCCESS When compact_result indicates that the allocation should now succeed, i.e. compact_result = COMPACT_SUCCESS, compaction_zonelist_suitable should return false because there is no need to do compaction now. 
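The difference between the old and new checks is easiest to see with a reduced enum; this is an illustrative stand-alone sketch, and the constants' values and ordering do not match the kernel's enum compact_result exactly:

    #include <stdbool.h>
    #include <stdio.h>

    /* Reduced result set; names follow the kernel, values/ordering do not. */
    enum compact_result { COMPACT_SKIPPED, COMPACT_CONTINUE, COMPACT_SUCCESS };

    /* Old check: anything but SKIPPED counted as "compaction is worthwhile". */
    static bool suitable_old(enum compact_result r) { return r != COMPACT_SKIPPED; }

    /* New check: only CONTINUE means compaction still has work to do. */
    static bool suitable_new(enum compact_result r) { return r == COMPACT_CONTINUE; }

    int main(void)
    {
        enum compact_result r = COMPACT_SUCCESS;

        printf("old: %d (claims compaction is still needed)\n", suitable_old(r));
        printf("new: %d (the allocation should already succeed)\n", suitable_new(r));
        return 0;
    }

With COMPACT_SUCCESS the allocation is expected to succeed as-is, so reporting the zonelist as suitable for compaction would only invite needless work.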
Link: https://lkml.kernel.org/r/20220418141253.24298-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 82c54d70a978..334a573485fe 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2291,7 +2291,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, ac->highest_zoneidx, available); - if (compact_result != COMPACT_SKIPPED) + if (compact_result == COMPACT_CONTINUE) return true; } -- cgit v1.2.3 From fa599c44987df43eb5cd5d60e9868fc2c790d9dc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:19 -0700 Subject: mm: compaction: simplify the code in __compact_finished Since commit efe771c7603b ("mm, compaction: always finish scanning of a full pageblock"), compaction will always finish scanning a pageblock. And migrate_pfn is assured to align with pageblock_nr_pages when we reach here. So we will always return COMPACT_SUCCESS if a suitable fallback is found due to the below IS_ALIGNED check of migrate_pfn. Simplify the code to make this clear and improve the readability. No functional change intended. Link: https://lkml.kernel.org/r/20220418141253.24298-12-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 334a573485fe..609a76d7e051 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2134,29 +2134,16 @@ static enum compact_result __compact_finished(struct compact_control *cc) * other migratetype buddy lists. */ if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) { - - /* movable pages are OK in any pageblock */ - if (migratetype == MIGRATE_MOVABLE) - return COMPACT_SUCCESS; - + true, &can_steal) != -1) /* - * We are stealing for a non-movable allocation. Make - * sure we finish compacting the current pageblock - * first so it is as free as possible and we won't - * have to steal another one soon. This only applies - * to sync compaction, as async compaction operates - * on pageblocks of the same migratetype. + * Movable pages are OK in any pageblock. If we are + * stealing for a non-movable allocation, make sure + * we finish compacting the current pageblock first + * (which is assured by the above migrate_pfn align + * check) so it is as free as possible and we won't + * have to steal another one soon. */ - if (cc->mode == MIGRATE_ASYNC || - IS_ALIGNED(cc->migrate_pfn, - pageblock_nr_pages)) { - return COMPACT_SUCCESS; - } - - ret = COMPACT_CONTINUE; - break; - } + return COMPACT_SUCCESS; } out: -- cgit v1.2.3 From ca2864e52d391f8df29da78261592502ffadd9d9 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:19 -0700 Subject: mm: compaction: make sure highest is above the min_pfn It's not guaranteed that highest will be above the min_pfn. If highest is below the min_pfn, migrate_pfn and free_pfn can meet prematurely and lead to some useless work. Make sure highest is above min_pfn to avoid making a futile effort. 
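A small numeric sketch of the boundary case; the PFN values are invented purely for illustration and the real fallback logic in fast_isolate_freepages() carries more conditions:

    #include <stdio.h>

    int main(void)
    {
        unsigned long min_pfn = 4096;   /* floor the free scanner should not go below */
        unsigned long highest = 1024;   /* best candidate found, but below min_pfn */

        /* Old test: any non-zero candidate was accepted. */
        if (highest)
            printf("old: free_pfn = %lu (below min_pfn, scanners meet early)\n", highest);

        /* New test: fall back to the min mark when the candidate is too low. */
        printf("new: free_pfn = %lu\n", highest >= min_pfn ? highest : min_pfn);
        return 0;
    }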
Link: https://lkml.kernel.org/r/20220418141253.24298-13-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Pintu Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 609a76d7e051..65970107b789 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1532,7 +1532,7 @@ fast_isolate_freepages(struct compact_control *cc) * not found, be pessimistic for direct compaction * and use the min mark. */ - if (highest) { + if (highest >= min_pfn) { page = pfn_to_page(highest); cc->free_pfn = highest; } else { -- cgit v1.2.3 From f47f758cff59c68015d6b9b9c077110df7c2c828 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 28 Apr 2022 23:16:19 -0700 Subject: drivers/base/memory: fix an unlikely reference counting issue in __add_memory_block() __add_memory_block() calls both put_device() and device_unregister() when storing the memory block into the xarray. This is incorrect because xarray doesn't take an additional reference and device_unregister() already calls put_device(). Triggering the issue looks really unlikely and its only effect should be to log a spurious warning about a ref counted issue. Link: https://lkml.kernel.org/r/d44c63d78affe844f020dc02ad6af29abc448fc4.1650611702.git.christophe.jaillet@wanadoo.fr Fixes: 4fb6eabf1037 ("drivers/base/memory.c: cache memory blocks in xarray to accelerate lookup") Signed-off-by: Christophe JAILLET Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Scott Cheloha Cc: Nathan Lynch Signed-off-by: Andrew Morton --- drivers/base/memory.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7222ff9b5e05..084d67fd55cc 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -636,10 +636,9 @@ static int __add_memory_block(struct memory_block *memory) } ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory, GFP_KERNEL)); - if (ret) { - put_device(&memory->dev); + if (ret) device_unregister(&memory->dev); - } + return ret; } -- cgit v1.2.3 From 0a7a0f6f7f3679c906fc55e3805c1d5e2c566f55 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Fri, 29 Apr 2022 14:36:57 -0700 Subject: hugetlb: fix wrong use of nr_online_nodes Patch series "hugetlb: Fix some incorrect behavior", v3. This series fix three bugs of hugetlb: 1) Invalid use of nr_online_nodes; 2) Inconsistency between 1G hugepage and 2M hugepage; 3) Useless information in dmesg. This patch (of 4): Certain systems are designed to have sparse/discontiguous nodes. In this case, nr_online_nodes can not be used to walk through numa node. Also, a valid node may be greater than nr_online_nodes. However, in hugetlb, it is assumed that nodes are contiguous. For sparse/discontiguous nodes, the current code may treat a valid node as invalid, and will fail to allocate all hugepages on a valid node that "nid >= nr_online_nodes". As David suggested: if (tmp >= nr_online_nodes) goto invalid; Just imagine node 0 and node 2 are online, and node 1 is offline. Assuming that "node < 2" is valid is wrong. Recheck all the places that use nr_online_nodes, and repair them one by one. 
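A minimal userspace model of the sparse-node scenario above (node 0 and node 2 online, node 1 offline) shows why a count-based bound is the wrong validity test; the bitmap and MAX_NUMNODES below are illustrative stand-ins, not the kernel's data structures:

#include <stdbool.h>
#include <stdio.h>

#define MAX_NUMNODES 4

/* Nodes 0 and 2 online, node 1 offline, so nr_online_nodes == 2. */
static const bool node_online_map[MAX_NUMNODES] = { true, false, true, false };
static const int nr_online_nodes = 2;

static bool valid_by_count(int nid)	/* the broken check */
{
	return nid < nr_online_nodes;
}

static bool valid_by_map(int nid)	/* the node_online() analogue */
{
	return nid < MAX_NUMNODES && node_online_map[nid];
}

int main(void)
{
	for (int nid = 0; nid < MAX_NUMNODES; nid++)
		printf("node %d: count-based=%d map-based=%d\n",
		       nid, valid_by_count(nid), valid_by_map(nid));
	return 0;
}

The count-based check accepts offline node 1 and rejects valid node 2; the mask-based check gets both right.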
[liupeng256@huawei.com: v4] Link: https://lkml.kernel.org/r/20220416103526.3287348-1-liupeng256@huawei.com Link: https://lkml.kernel.org/r/20220413032915.251254-1-liupeng256@huawei.com Link: https://lkml.kernel.org/r/20220413032915.251254-2-liupeng256@huawei.com Fixes: 4178158ef8ca ("hugetlbfs: fix issue of preallocation of gigantic pages can't work") Fixes: b5389086ad7b ("hugetlbfs: extend the definition of hugepages parameter to support node allocation") Fixes: e79ce9832316 ("hugetlbfs: fix a truncation issue in hugepages parameter") Fixes: f9317f77a6e0 ("hugetlb: clean up potential spectre issue warnings") Signed-off-by: Peng Liu Suggested-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Kefeng Wang Reviewed-by: Davidlohr Bueso Reviewed-by: Mike Kravetz Acked-by: David Hildenbrand Cc: Zhenguo Yao Cc: Muchun Song Cc: Liu Yuntao Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87ba50b270a3..6385f86a865a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2984,8 +2984,6 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node; - if (nid != NUMA_NO_NODE && nid >= nr_online_nodes) - return 0; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), @@ -3093,7 +3091,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) } /* do node specific alloc */ - for (i = 0; i < nr_online_nodes; i++) { + for_each_online_node(i) { if (h->max_huge_pages_node[i] > 0) { hugetlb_hstate_alloc_pages_onenode(h, i); node_specific_alloc = true; @@ -4057,7 +4055,7 @@ static int __init hugetlb_init(void) default_hstate.max_huge_pages = default_hstate_max_huge_pages; - for (i = 0; i < nr_online_nodes; i++) + for_each_online_node(i) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } @@ -4172,9 +4170,9 @@ static int __init hugepages_setup(char *s) pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); return 0; } - if (tmp >= nr_online_nodes) + if (tmp >= MAX_NUMNODES || !node_online(tmp)) goto invalid; - node = array_index_nospec(tmp, nr_online_nodes); + node = array_index_nospec(tmp, MAX_NUMNODES); p += count + 1; /* Parse hugepages */ if (sscanf(p, "%lu%n", &tmp, &count) != 1) @@ -4302,7 +4300,7 @@ static int __init default_hugepagesz_setup(char *s) */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; - for (i = 0; i < nr_online_nodes; i++) + for_each_online_node(i) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; if (hstate_is_gigantic(&default_hstate)) -- cgit v1.2.3 From f87442f407af80dac4dc81c8a7772b71b36b2e09 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Fri, 29 Apr 2022 14:36:57 -0700 Subject: hugetlb: fix hugepages_setup when deal with pernode Hugepages can be specified to pernode since "hugetlbfs: extend the definition of hugepages parameter to support node allocation", but the following problem is observed. Confusing behavior is observed when both 1G and 2M hugepage is set after "numa=off". cmdline hugepage settings: hugepagesz=1G hugepages=0:3,1:3 hugepagesz=2M hugepages=0:1024,1:1024 results: HugeTLB registered 1.00 GiB page size, pre-allocated 0 pages HugeTLB registered 2.00 MiB page size, pre-allocated 1024 pages Furthermore, confusing behavior can be also observed when an invalid node behind a valid node. 
To fix this, never allocate any typical hugepage when an invalid parameter is received. Link: https://lkml.kernel.org/r/20220413032915.251254-3-liupeng256@huawei.com Fixes: b5389086ad7b ("hugetlbfs: extend the definition of hugepages parameter to support node allocation") Signed-off-by: Peng Liu Reviewed-by: Mike Kravetz Cc: Baolin Wang Cc: David Hildenbrand Cc: Liu Yuntao Cc: Muchun Song Cc: Zhenguo Yao Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/hugetlb.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6385f86a865a..c6228cdddd54 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4122,6 +4122,20 @@ bool __init __weak hugetlb_node_alloc_supported(void) { return true; } + +static void __init hugepages_clear_pages_in_node(void) +{ + if (!hugetlb_max_hstate) { + default_hstate_max_huge_pages = 0; + memset(default_hugepages_in_node, 0, + MAX_NUMNODES * sizeof(unsigned int)); + } else { + parsed_hstate->max_huge_pages = 0; + memset(parsed_hstate->max_huge_pages_node, 0, + MAX_NUMNODES * sizeof(unsigned int)); + } +} + /* * hugepages command line processing * hugepages normally follows a valid hugepagsz or default_hugepagsz @@ -4209,6 +4223,7 @@ static int __init hugepages_setup(char *s) invalid: pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); + hugepages_clear_pages_in_node(); return 0; } __setup("hugepages=", hugepages_setup); -- cgit v1.2.3 From f81f6e4b5eedb41045fd0ccc8ea31f4f07ce0993 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Fri, 29 Apr 2022 14:36:57 -0700 Subject: hugetlb: fix return value of __setup handlers When __setup() return '0', using invalid option values causes the entire kernel boot option string to be reported as Unknown. Hugetlb calls __setup() and will return '0' when set invalid parameter string. The following phenomenon is observed: cmdline: hugepagesz=1Y hugepages=1 dmesg: HugeTLB: unsupported hugepagesz=1Y HugeTLB: hugepages=1 does not follow a valid hugepagesz, ignoring Unknown kernel command line parameters "hugepagesz=1Y hugepages=1" Since hugetlb will print warning/error information before return for invalid parameter string, just use return '1' to avoid print again. 
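For illustration, a toy userspace model of the __setup() return convention described above; the parser and handler below are made-up stand-ins for this sketch, not the kernel's command-line code:

#include <stdio.h>
#include <string.h>

/* Toy handler: warns about a bad size itself, like hugepagesz_setup(). */
static int hugepagesz_setup(const char *val, int fixed)
{
	if (strcmp(val, "2M") != 0) {
		fprintf(stderr, "HugeTLB: unsupported hugepagesz=%s\n", val);
		return fixed ? 1 : 0;	/* old code returned 0 here */
	}
	return 1;
}

static void parse(const char *opt, int fixed)
{
	const char *val = strchr(opt, '=') + 1;

	/* A handler returning 0 makes the option look unhandled. */
	if (!hugepagesz_setup(val, fixed))
		printf("Unknown kernel command line parameters \"%s\"\n", opt);
}

int main(void)
{
	parse("hugepagesz=1Y", 0);	/* old behavior: warning plus "Unknown ..." */
	parse("hugepagesz=1Y", 1);	/* fixed behavior: only the warning */
	return 0;
}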
Link: https://lkml.kernel.org/r/20220413032915.251254-4-liupeng256@huawei.com Signed-off-by: Peng Liu Reviewed-by: Muchun Song Reviewed-by: Baolin Wang Reviewed-by: Davidlohr Bueso Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Liu Yuntao Cc: Zhenguo Yao Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/hugetlb.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c6228cdddd54..15b9faeb87ef 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4155,7 +4155,7 @@ static int __init hugepages_setup(char *s) if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; - return 0; + return 1; } /* @@ -4171,7 +4171,7 @@ static int __init hugepages_setup(char *s) if (mhp == last_mhp) { pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); - return 0; + return 1; } while (*p) { @@ -4182,7 +4182,7 @@ static int __init hugepages_setup(char *s) if (p[count] == ':') { if (!hugetlb_node_alloc_supported()) { pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); - return 0; + return 1; } if (tmp >= MAX_NUMNODES || !node_online(tmp)) goto invalid; @@ -4224,7 +4224,7 @@ static int __init hugepages_setup(char *s) invalid: pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); hugepages_clear_pages_in_node(); - return 0; + return 1; } __setup("hugepages=", hugepages_setup); @@ -4245,7 +4245,7 @@ static int __init hugepagesz_setup(char *s) if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); - return 0; + return 1; } h = size_to_hstate(size); @@ -4260,7 +4260,7 @@ static int __init hugepagesz_setup(char *s) if (!parsed_default_hugepagesz || h != &default_hstate || default_hstate.max_huge_pages) { pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); - return 0; + return 1; } /* @@ -4291,14 +4291,14 @@ static int __init default_hugepagesz_setup(char *s) parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); - return 0; + return 1; } size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); - return 0; + return 1; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); -- cgit v1.2.3 From 30a514002db23fd630e3e52a2bdfb05c0de03378 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Fri, 29 Apr 2022 14:36:58 -0700 Subject: mm: use for_each_online_node and node_online instead of open coding Use more generic functions to deal with issues related to online nodes. The changes will make the code simplified. 
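As a rough sketch of why mask-driven iteration differs from counting up to nr_online_nodes, consider this userspace model; the iterator helper only mimics the shape of a for_each_online_node()-style loop and is not the kernel macro:

#include <stdbool.h>
#include <stdio.h>

#define MAX_NUMNODES 4

static const bool online[MAX_NUMNODES] = { true, false, true, false };

/* Return the next online node after nid, or MAX_NUMNODES when done. */
static int next_online_node(int nid)
{
	for (nid++; nid < MAX_NUMNODES; nid++)
		if (online[nid])
			return nid;
	return MAX_NUMNODES;
}

int main(void)
{
	for (int nid = next_online_node(-1); nid < MAX_NUMNODES;
	     nid = next_online_node(nid))
		printf("visiting node %d\n", nid);	/* visits 0 and 2 */
	return 0;
}

A loop bounded by nr_online_nodes would visit nodes 0 and 1 instead, touching an offline node and skipping a valid one.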
Link: https://lkml.kernel.org/r/20220429030218.644635-1-liupeng256@huawei.com Signed-off-by: Peng Liu Suggested-by: Davidlohr Bueso Suggested-by: Andrew Morton Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- arch/ia64/kernel/uncached.c | 2 +- mm/hugetlb.c | 4 ++-- mm/page_ext.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 816803636a75..a0fec82c56b8 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -261,7 +261,7 @@ static int __init uncached_init(void) { int nid; - for_each_node_state(nid, N_ONLINE) { + for_each_online_node(nid) { uncached_pools[nid].pool = gen_pool_create(PAGE_SHIFT, nid); mutex_init(&uncached_pools[nid].add_chunk_mutex); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 15b9faeb87ef..4ab62657acda 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6971,7 +6971,7 @@ void __init hugetlb_cma_reserve(int order) if (hugetlb_cma_size_in_node[nid] == 0) continue; - if (!node_state(nid, N_ONLINE)) { + if (!node_online(nid)) { pr_warn("hugetlb_cma: invalid node %d specified\n", nid); hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; hugetlb_cma_size_in_node[nid] = 0; @@ -7010,7 +7010,7 @@ void __init hugetlb_cma_reserve(int order) } reserved = 0; - for_each_node_state(nid, N_ONLINE) { + for_each_online_node(nid) { int res; char name[CMA_MAX_NAME]; diff --git a/mm/page_ext.c b/mm/page_ext.c index 2e66d934d63f..3dc715d7ac29 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -320,7 +320,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, * online__pages(), and start_pfn should exist. */ nid = pfn_to_nid(start_pfn); - VM_BUG_ON(!node_state(nid, N_ONLINE)); + VM_BUG_ON(!node_online(nid)); } for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) -- cgit v1.2.3 From 25fa414adad5b91b54a93f294bf3b17239741858 Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 29 Apr 2022 14:36:58 -0700 Subject: mm/khugepaged: use vma_is_anonymous Clean up the vma->vm_ops usage. Use vma_is_anonymous instead of vma->vm_ops to make it more understandable. Link: https://lkml.kernel.org/r/20220424071642.3234971-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Reviewed-by: Yang Shi Signed-off-by: Andrew Morton --- mm/khugepaged.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a4e5eaf3eb01..ac53ad2c9bb1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -466,7 +466,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, S_ISREG(inode->i_mode); } - if (!vma->anon_vma || vma->vm_ops) + if (!vma->anon_vma || !vma_is_anonymous(vma)) return false; if (vma_is_temporary_stack(vma)) return false; @@ -972,7 +972,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!hugepage_vma_check(vma, vma->vm_flags)) return SCAN_VMA_CHECK; /* Anon VMA expected */ - if (!vma->anon_vma || vma->vm_ops) + if (!vma->anon_vma || !vma_is_anonymous(vma)) return SCAN_VMA_CHECK; return 0; } -- cgit v1.2.3 From 9c8bbfaca1bce84664403fd7dddbef6b3ff0a05a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 29 Apr 2022 14:36:58 -0700 Subject: mm: hugetlb: add missing cache flushing in hugetlb_unshare_all_pmds() Missed calling flush_cache_range() before removing the sharing PMD entrires, otherwise data consistence issue may be occurred on some architectures whose caches are strict and require a virtual>physical translation to exist for a virtual address. Thus add it. 
Now no architectures enabling PMD sharing will be affected, since they do not have a VIVT cache. That means this issue can not be happened in practice so far. Link: https://lkml.kernel.org/r/47441086affcabb6ecbe403173e9283b0d904b38.1650956489.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/419b0e777c9e6d1454dcd906e0f5b752a736d335.1650781755.git.baolin.wang@linux.alibaba.com Fixes: 6dfeaff93be1 ("hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp") Signed-off-by: Baolin Wang Reviewed-by: Muchun Song Reviewed-by: Peter Xu Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4ab62657acda..4e017940b26f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6886,6 +6886,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) if (start >= end) return; + flush_cache_range(vma, start, end); /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. -- cgit v1.2.3 From 07d067e4f2ceb72b9f681995cc53828caaba9e6e Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 29 Apr 2022 14:36:58 -0700 Subject: kasan: fix sleeping function called from invalid context on RT kernel BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1, name: swapper/0 preempt_count: 1, expected: 0 ........... CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.17.1-rt16-yocto-preempt-rt #22 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x60/0x8c dump_stack+0x10/0x12 __might_resched.cold+0x13b/0x173 rt_spin_lock+0x5b/0xf0 ___cache_free+0xa5/0x180 qlist_free_all+0x7a/0x160 per_cpu_remove_cache+0x5f/0x70 smp_call_function_many_cond+0x4c4/0x4f0 on_each_cpu_cond_mask+0x49/0xc0 kasan_quarantine_remove_cache+0x54/0xf0 kasan_cache_shrink+0x9/0x10 kmem_cache_shrink+0x13/0x20 acpi_os_purge_cache+0xe/0x20 acpi_purge_cached_objects+0x21/0x6d acpi_initialize_objects+0x15/0x3b acpi_init+0x130/0x5ba do_one_initcall+0xe5/0x5b0 kernel_init_freeable+0x34f/0x3ad kernel_init+0x1e/0x140 ret_from_fork+0x22/0x30 When the kmem_cache_shrink() was called, the IPI was triggered, the ___cache_free() is called in IPI interrupt context, the local-lock or spin-lock will be acquired. On PREEMPT_RT kernel, these locks are replaced with sleepbale rt-spinlock, so the above problem is triggered. Fix it by moving the qlist_free_allfrom() from IPI interrupt context to task context when PREEMPT_RT is enabled. 
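A single-threaded userspace sketch of the resulting pattern, purely illustrative and not the kernel implementation: the IPI-context path only relinks objects onto a staging list, and the expensive freeing runs later from task context where sleeping locks are allowed. All names and types are simplified stand-ins.

#include <stdio.h>
#include <stdlib.h>

struct qnode {
	struct qnode *next;
};

static struct qnode *staging;	/* per-CPU staging list analogue */

/* Models the PREEMPT_RT IPI callback: only relink, never free here. */
static void ipi_move_to_staging(struct qnode *quarantined)
{
	while (quarantined) {
		struct qnode *n = quarantined;

		quarantined = n->next;
		n->next = staging;
		staging = n;
	}
}

/* Models the later drain from task context, where sleeping is allowed. */
static void drain_staging(void)
{
	while (staging) {
		struct qnode *n = staging;

		staging = n->next;
		free(n);	/* the work that must not run in IPI context */
	}
}

int main(void)
{
	struct qnode *list = NULL;

	for (int i = 0; i < 3; i++) {
		struct qnode *n = malloc(sizeof(*n));

		if (!n)
			return 1;
		n->next = list;
		list = n;
	}
	ipi_move_to_staging(list);
	drain_staging();
	printf("quarantine drained from task context\n");
	return 0;
}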
[akpm@linux-foundation.org: reduce ifdeffery] Link: https://lkml.kernel.org/r/20220401134649.2222485-1-qiang1.zhang@intel.com Signed-off-by: Zqiang Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/kasan/quarantine.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 0a9def8ce5e8..8ef098f52622 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -99,6 +99,17 @@ static unsigned long quarantine_size; static DEFINE_RAW_SPINLOCK(quarantine_lock); DEFINE_STATIC_SRCU(remove_cache_srcu); +#ifdef CONFIG_PREEMPT_RT +struct cpu_shrink_qlist { + raw_spinlock_t lock; + struct qlist_head qlist; +}; + +static DEFINE_PER_CPU(struct cpu_shrink_qlist, shrink_qlist) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(shrink_qlist.lock), +}; +#endif + /* Maximum size of the global queue. */ static unsigned long quarantine_max_size; @@ -308,10 +319,31 @@ static void qlist_move_cache(struct qlist_head *from, } } -static void per_cpu_remove_cache(void *arg) +#ifndef CONFIG_PREEMPT_RT +static void __per_cpu_remove_cache(struct qlist_head *q, void *arg) { struct kmem_cache *cache = arg; struct qlist_head to_free = QLIST_INIT; + + qlist_move_cache(q, &to_free, cache); + qlist_free_all(&to_free, cache); +} +#else +static void __per_cpu_remove_cache(struct qlist_head *q, void *arg) +{ + struct kmem_cache *cache = arg; + unsigned long flags; + struct cpu_shrink_qlist *sq; + + sq = this_cpu_ptr(&shrink_qlist); + raw_spin_lock_irqsave(&sq->lock, flags); + qlist_move_cache(q, &sq->qlist, cache); + raw_spin_unlock_irqrestore(&sq->lock, flags); +} +#endif + +static void per_cpu_remove_cache(void *arg) +{ struct qlist_head *q; q = this_cpu_ptr(&cpu_quarantine); @@ -322,8 +354,7 @@ static void per_cpu_remove_cache(void *arg) */ if (READ_ONCE(q->offline)) return; - qlist_move_cache(q, &to_free, cache); - qlist_free_all(&to_free, cache); + __per_cpu_remove_cache(q, arg); } /* Free all quarantined objects belonging to cache. 
*/ @@ -341,6 +372,21 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache) */ on_each_cpu(per_cpu_remove_cache, cache, 1); +#ifdef CONFIG_PREEMPT_RT + { + int cpu; + struct cpu_shrink_qlist *sq; + + for_each_online_cpu(cpu) { + sq = per_cpu_ptr(&shrink_qlist, cpu); + raw_spin_lock_irqsave(&sq->lock, flags); + qlist_move_cache(&sq->qlist, &to_free, cache); + raw_spin_unlock_irqrestore(&sq->lock, flags); + } + qlist_free_all(&to_free, cache); + } +#endif + raw_spin_lock_irqsave(&quarantine_lock, flags); for (i = 0; i < QUARANTINE_BATCHES; i++) { if (qlist_empty(&global_quarantine[i])) -- cgit v1.2.3 From ec2a0f9c8b50865a11720651c9c536f20c578def Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 29 Apr 2022 14:36:58 -0700 Subject: kasan: mark KASAN_VMALLOC flags as kasan_vmalloc_flags_t Fix sparse warning: mm/kasan/shadow.c:496:15: warning: restricted kasan_vmalloc_flags_t degrades to integer Link: https://lkml.kernel.org/r/52d8fccdd3a48d4bdfd0ff522553bac2a13f1579.1649351254.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: kernel test robot Cc: Andrey Konovalov Cc: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton --- include/linux/kasan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index ceebcb9de7bf..b092277bf48d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -23,10 +23,10 @@ struct task_struct; typedef unsigned int __bitwise kasan_vmalloc_flags_t; -#define KASAN_VMALLOC_NONE 0x00u -#define KASAN_VMALLOC_INIT 0x01u -#define KASAN_VMALLOC_VM_ALLOC 0x02u -#define KASAN_VMALLOC_PROT_NORMAL 0x04u +#define KASAN_VMALLOC_NONE ((__force kasan_vmalloc_flags_t)0x00u) +#define KASAN_VMALLOC_INIT ((__force kasan_vmalloc_flags_t)0x01u) +#define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u) +#define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) -- cgit v1.2.3 From d137a7cb9b2ab8155184b2da9a304afff8f84d36 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Fri, 29 Apr 2022 14:36:59 -0700 Subject: mm/page_alloc: simplify update of pgdat in wake_all_kswapds There is no need to update last_pgdat for each zone, only update last_pgdat when iterating the first zone of a node. Link: https://lkml.kernel.org/r/20220322115635.2708989-1-chenwandun@huawei.com Signed-off-by: Chen Wandun Reviewed-by: Andrew Morton Cc: Mel Gorman Cc: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e80858f5c2ab..f01c71e41bcf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4631,9 +4631,10 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, ac->nodemask) { if (!managed_zone(zone)) continue; - if (last_pgdat != zone->zone_pgdat) + if (last_pgdat != zone->zone_pgdat) { wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); - last_pgdat = zone->zone_pgdat; + last_pgdat = zone->zone_pgdat; + } } } -- cgit v1.2.3 From 30226b69f8767a822f87c2c92bf6d4fbf545cee1 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Fri, 29 Apr 2022 14:36:59 -0700 Subject: zram: add a huge_idle writeback mode Today it's only possible to write back as a page, idle, or huge. 
A user might want to writeback pages which are huge and idle first as these idle pages do not require decompression and make a good first pass for writeback. Idle writeback specifically has the advantage that a refault is unlikely given that the page has been swapped for some amount of time without being refaulted. Huge writeback has the advantage that you're guaranteed to get the maximum benefit from a single page writeback, that is, you're reclaiming one full page of memory. Pages which are compressed in zram being written back result in some benefit which is always less than a page size because of the fact that it was compressed. The primary use of this is for minimizing refaults in situations where the device has to be sensitive to storage endurance. On ChromeOS we have devices with slow eMMC and repeated writes and refaults can negatively affect performance and endurance. Link: https://lkml.kernel.org/r/20220322215821.1196994-1-bgeffon@google.com Signed-off-by: Brian Geffon Acked-by: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- Documentation/admin-guide/blockdev/zram.rst | 5 +++++ drivers/block/zram/zram_drv.c | 10 ++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 54fe63745ed8..c73b16930449 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -343,6 +343,11 @@ Admin can request writeback of those idle pages at right timing via:: With the command, zram will writeback idle pages from memory to the storage. +Additionally, if a user choose to writeback only huge and idle pages +this can be accomplished with:: + + echo huge_idle > /sys/block/zramX/writeback + If an admin wants to write a specific page in zram device to the backing device, they could write a page index into the interface. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e9474b02012d..8562a7cce558 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -639,8 +639,8 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, #define PAGE_WB_SIG "page_index=" #define PAGE_WRITEBACK 0 -#define HUGE_WRITEBACK 1 -#define IDLE_WRITEBACK 2 +#define HUGE_WRITEBACK (1<<0) +#define IDLE_WRITEBACK (1<<1) static ssize_t writeback_store(struct device *dev, @@ -660,6 +660,8 @@ static ssize_t writeback_store(struct device *dev, mode = IDLE_WRITEBACK; else if (sysfs_streq(buf, "huge")) mode = HUGE_WRITEBACK; + else if (sysfs_streq(buf, "huge_idle")) + mode = IDLE_WRITEBACK | HUGE_WRITEBACK; else { if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) return -EINVAL; @@ -721,10 +723,10 @@ static ssize_t writeback_store(struct device *dev, zram_test_flag(zram, index, ZRAM_UNDER_WB)) goto next; - if (mode == IDLE_WRITEBACK && + if (mode & IDLE_WRITEBACK && !zram_test_flag(zram, index, ZRAM_IDLE)) goto next; - if (mode == HUGE_WRITEBACK && + if (mode & HUGE_WRITEBACK && !zram_test_flag(zram, index, ZRAM_HUGE)) goto next; /* -- cgit v1.2.3 From 94968384dde15d48263bfc59d280cd71b1259d8c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 29 Apr 2022 14:36:59 -0700 Subject: memcg: introduce per-memcg reclaim interface This patch series adds a memory.reclaim proactive reclaim interface. The rationale behind the interface and how it works are in the first patch. This patch (of 4): Introduce a memcg interface to trigger memory reclaim on a memory cgroup. 
Use case: Proactive Reclaim --------------------------- A userspace proactive reclaimer can continuously probe the memcg to reclaim a small amount of memory. This gives more accurate and up-to-date workingset estimation as the LRUs are continuously sorted and can potentially provide more deterministic memory overcommit behavior. The memory overcommit controller can provide more proactive response to the changing behavior of the running applications instead of being reactive. A userspace reclaimer's purpose in this case is not a complete replacement for kswapd or direct reclaim, it is to proactively identify memory savings opportunities and reclaim some amount of cold pages set by the policy to free up the memory for more demanding jobs or scheduling new jobs. A user space proactive reclaimer is used in Google data centers. Additionally, Meta's TMO paper recently referenced a very similar interface used for user space proactive reclaim: https://dl.acm.org/doi/pdf/10.1145/3503222.3507731 Benefits of a user space reclaimer: ----------------------------------- 1) More flexible on who should be charged for the cpu of the memory reclaim. For proactive reclaim, it makes more sense to be centralized. 2) More flexible on dedicating the resources (like cpu). The memory overcommit controller can balance the cost between the cpu usage and the memory reclaimed. 3) Provides a way to the applications to keep their LRUs sorted, so, under memory pressure better reclaim candidates are selected. This also gives more accurate and uptodate notion of working set for an application. Why memory.high is not enough? ------------------------------ - memory.high can be used to trigger reclaim in a memcg and can potentially be used for proactive reclaim. However there is a big downside in using memory.high. It can potentially introduce high reclaim stalls in the target application as the allocations from the processes or the threads of the application can hit the temporary memory.high limit. - Userspace proactive reclaimers usually use feedback loops to decide how much memory to proactively reclaim from a workload. The metrics used for this are usually either refaults or PSI, and these metrics will become messy if the application gets throttled by hitting the high limit. - memory.high is a stateful interface, if the userspace proactive reclaimer crashes for any reason while triggering reclaim it can leave the application in a bad state. - If a workload is rapidly expanding, setting memory.high to proactively reclaim memory can result in actually reclaiming more memory than intended. The benefits of such interface and shortcomings of existing interface were further discussed in this RFC thread: https://lore.kernel.org/linux-mm/5df21376-7dd1-bf81-8414-32a73cea45dd@google.com/ Interface: ---------- Introducing a very simple memcg interface 'echo 10M > memory.reclaim' to trigger reclaim in the target memory cgroup. The interface is introduced as a nested-keyed file to allow for future optional arguments to be easily added to configure the behavior of reclaim. Possible Extensions: -------------------- - This interface can be extended with an additional parameter or flags to allow specifying one or more types of memory to reclaim from (e.g. file, anon, ..). - The interface can also be extended with a node mask to reclaim from specific nodes. This has use cases for reclaim-based demotion in memory tiering systens. 
- A similar per-node interface can also be added to support proactive reclaim and reclaim-based demotion in systems without memcg. - Add a timeout parameter to make it easier for user space to call the interface without worrying about being blocked for an undefined amount of time. For now, let's keep things simple by adding the basic functionality. [yosryahmed@google.com: worked on versions v2 onwards, refreshed to current master, updated commit message based on recent discussions and use cases] Link: https://lkml.kernel.org/r/20220425190040.2475377-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20220425190040.2475377-2-yosryahmed@google.com Signed-off-by: Shakeel Butt Co-developed-by: Yosry Ahmed Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Wei Xu Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Tejun Heo Cc: Zefan Li Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yu Zhao Cc: Dave Hansen Cc: Greg Thelen Cc: Chen Wandun Cc: Vaibhav Jain Cc: "Michal Koutn" Cc: Tim Chen Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 21 +++++++++++++++ mm/memcontrol.c | 45 +++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 69d7a6983f78..19bcd73cad03 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1208,6 +1208,27 @@ PAGE_SIZE multiple when read back. high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. + memory.reclaim + A write-only nested-keyed file which exists for all cgroups. + + This is a simple interface to trigger memory reclaim in the + target cgroup. + + This file accepts a single key, the number of bytes to reclaim. + No nested keys are currently supported. + + Example:: + + echo "1G" > memory.reclaim + + The interface can be later extended with nested keys to + configure the reclaim behavior. For example, specify the + type of memory to reclaim from (anon, file, ..). + + Please note that the kernel can over or under reclaim from + the target cgroup. If less bytes are reclaimed than the + specified amount, -EAGAIN is returned. + memory.oom.group A read-write single value file which exists on non-root cgroups. The default value is "0". diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c1f888313462..d180ef985b17 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6355,6 +6355,46 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } +static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + unsigned long nr_to_reclaim, nr_reclaimed = 0; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "", &nr_to_reclaim); + if (err) + return err; + + while (nr_reclaimed < nr_to_reclaim) { + unsigned long reclaimed; + + if (signal_pending(current)) + return -EINTR; + + /* + * This is the final attempt, drain percpu lru caches in the + * hope of introducing more evictable pages for + * try_to_free_mem_cgroup_pages(). 
+ */ + if (!nr_retries) + lru_add_drain_all(); + + reclaimed = try_to_free_mem_cgroup_pages(memcg, + nr_to_reclaim - nr_reclaimed, + GFP_KERNEL, true); + + if (!reclaimed && !nr_retries--) + return -EAGAIN; + + nr_reclaimed += reclaimed; + } + + return nbytes; +} + static struct cftype memory_files[] = { { .name = "current", @@ -6413,6 +6453,11 @@ static struct cftype memory_files[] = { .seq_show = memory_oom_group_show, .write = memory_oom_group_write, }, + { + .name = "reclaim", + .flags = CFTYPE_NS_DELEGATABLE, + .write = memory_reclaim, + }, { } /* terminate */ }; -- cgit v1.2.3 From 6c26df84e1f2f9181c0741865105a53537da842c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 29 Apr 2022 14:36:59 -0700 Subject: selftests: cgroup: return -errno from cg_read()/cg_write() on failure Currently, cg_read()/cg_write() returns 0 on success and -1 on failure. Modify them to return the -errno on failure. Link: https://lkml.kernel.org/r/20220425190040.2475377-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Roman Gushchin Cc: Chen Wandun Cc: Dave Hansen Cc: Greg Thelen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: "Michal Koutn" Cc: Shuah Khan Cc: Tejun Heo Cc: Tim Chen Cc: Vaibhav Jain Cc: Wei Xu Cc: Yu Zhao Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/cgroup_util.c | 44 ++++++++++++---------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index dbaa7aabbb4a..e6f3679cdcc0 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -19,6 +19,7 @@ #include "cgroup_util.h" #include "../clone3/clone3_selftests.h" +/* Returns read len on success, or -errno on failure. */ static ssize_t read_text(const char *path, char *buf, size_t max_len) { ssize_t len; @@ -26,35 +27,29 @@ static ssize_t read_text(const char *path, char *buf, size_t max_len) fd = open(path, O_RDONLY); if (fd < 0) - return fd; + return -errno; len = read(fd, buf, max_len - 1); - if (len < 0) - goto out; - buf[len] = 0; -out: + if (len >= 0) + buf[len] = 0; + close(fd); - return len; + return len < 0 ? -errno : len; } +/* Returns written len on success, or -errno on failure. */ static ssize_t write_text(const char *path, char *buf, ssize_t len) { int fd; fd = open(path, O_WRONLY | O_APPEND); if (fd < 0) - return fd; + return -errno; len = write(fd, buf, len); - if (len < 0) { - close(fd); - return len; - } - close(fd); - - return len; + return len < 0 ? -errno : len; } char *cg_name(const char *root, const char *name) @@ -87,16 +82,16 @@ char *cg_control(const char *cgroup, const char *control) return ret; } +/* Returns 0 on success, or -errno on failure. */ int cg_read(const char *cgroup, const char *control, char *buf, size_t len) { char path[PATH_MAX]; + ssize_t ret; snprintf(path, sizeof(path), "%s/%s", cgroup, control); - if (read_text(path, buf, len) >= 0) - return 0; - - return -1; + ret = read_text(path, buf, len); + return ret >= 0 ? 0 : ret; } int cg_read_strcmp(const char *cgroup, const char *control, @@ -177,17 +172,15 @@ long cg_read_lc(const char *cgroup, const char *control) return cnt; } +/* Returns 0 on success, or -errno on failure. 
*/ int cg_write(const char *cgroup, const char *control, char *buf) { char path[PATH_MAX]; - ssize_t len = strlen(buf); + ssize_t len = strlen(buf), ret; snprintf(path, sizeof(path), "%s/%s", cgroup, control); - - if (write_text(path, buf, len) == len) - return 0; - - return -1; + ret = write_text(path, buf, len); + return ret == len ? 0 : ret; } int cg_find_unified_root(char *root, size_t len) @@ -545,7 +538,8 @@ ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t else snprintf(path, sizeof(path), "/proc/%d/%s", pid, item); - return read_text(path, buf, size); + size = read_text(path, buf, size); + return size < 0 ? -1 : size; } int proc_read_strstr(int pid, bool thread, const char *item, const char *needle) -- cgit v1.2.3 From a3622a53e620700053b648478dbc638ad373be3b Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 29 Apr 2022 14:36:59 -0700 Subject: selftests: cgroup: fix alloc_anon_noexit() instantly freeing memory Currently, alloc_anon_noexit() calls alloc_anon() which instantly frees the allocated memory. alloc_anon_noexit() is usually used with cg_run_nowait() to run a process in the background that allocates memory. It makes sense for the background process to keep the memory allocated and not instantly free it (otherwise there is no point of running it in the background). Link: https://lkml.kernel.org/r/20220425190040.2475377-4-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Roman Gushchin Acked-by: Shakeel Butt Acked-by: David Rientjes Cc: Chen Wandun Cc: Dave Hansen Cc: Greg Thelen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: "Michal Koutn" Cc: Shuah Khan Cc: Tejun Heo Cc: Tim Chen Cc: Vaibhav Jain Cc: Wei Xu Cc: Yu Zhao Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 9c1f19fe2e37..a639bf49d396 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -211,13 +211,17 @@ static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg) static int alloc_anon_noexit(const char *cgroup, void *arg) { int ppid = getppid(); + size_t size = (unsigned long)arg; + char *buf, *ptr; - if (alloc_anon(cgroup, arg)) - return -1; + buf = malloc(size); + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + *ptr = 0; while (getppid() == ppid) sleep(1); + free(buf); return 0; } -- cgit v1.2.3 From eae3cb2e87ff84547e66211b81301a8f9122840f Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 29 Apr 2022 14:37:00 -0700 Subject: selftests: cgroup: add a selftest for memory.reclaim Add a new test for memory.reclaim that verifies that the interface correctly reclaims memory as intended, from both anon and file pages. 
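The test below recomputes its target and retries on -EAGAIN. Outside the selftest harness, a userspace proactive reclaimer might drive the interface in a similar way; the following is a minimal sketch with a hypothetical cgroup path and deliberately thin error handling:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Returns 0 on full reclaim, 1 on partial reclaim (EAGAIN), -1 on error. */
static int reclaim_bytes(const char *cgroup, const char *amount)
{
	char path[4096];
	int fd, saved_errno;
	ssize_t ret;

	snprintf(path, sizeof(path), "%s/memory.reclaim", cgroup);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	ret = write(fd, amount, strlen(amount));
	saved_errno = errno;
	close(fd);

	if (ret >= 0)
		return 0;	/* at least the requested amount was reclaimed */
	if (saved_errno == EAGAIN)
		return 1;	/* less than requested was reclaimed */
	return -1;
}

int main(void)
{
	/* Hypothetical path: adjust to the actual cgroup of interest. */
	int ret = reclaim_bytes("/sys/fs/cgroup/workload", "100M");

	if (ret < 0)
		fprintf(stderr, "memory.reclaim write failed\n");
	else if (ret > 0)
		fprintf(stderr, "partial reclaim, recompute target and retry\n");
	return 0;
}

On -EAGAIN the caller knows less than the requested amount was reclaimed and can recompute a new target, for example from memory.current, before writing again.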
Link: https://lkml.kernel.org/r/20220425190040.2475377-5-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Chen Wandun Cc: Dave Hansen Cc: Greg Thelen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: "Michal Koutn" Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun Heo Cc: Tim Chen Cc: Vaibhav Jain Cc: Wei Xu Cc: Yu Zhao Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 106 +++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index a639bf49d396..ee7defb17280 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -760,6 +760,111 @@ cleanup: return ret; } +/* + * This test checks that memory.reclaim reclaims the given + * amount of memory (from both anon and file, if possible). + */ +static int test_memcg_reclaim(const char *root) +{ + int ret = KSFT_FAIL, fd, retries; + char *memcg; + long current, expected_usage, to_reclaim; + char buf[64]; + + memcg = cg_name(root, "memcg_test"); + if (!memcg) + goto cleanup; + + if (cg_create(memcg)) + goto cleanup; + + current = cg_read_long(memcg, "memory.current"); + if (current != 0) + goto cleanup; + + fd = get_temp_fd(); + if (fd < 0) + goto cleanup; + + cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd); + + /* + * If swap is enabled, try to reclaim from both anon and file, else try + * to reclaim from file only. + */ + if (is_swap_enabled()) { + cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50)); + expected_usage = MB(100); + } else + expected_usage = MB(50); + + /* + * Wait until current usage reaches the expected usage (or we run out of + * retries). + */ + retries = 5; + while (!values_close(cg_read_long(memcg, "memory.current"), + expected_usage, 10)) { + if (retries--) { + sleep(1); + continue; + } else { + fprintf(stderr, + "failed to allocate %ld for memcg reclaim test\n", + expected_usage); + goto cleanup; + } + } + + /* + * Reclaim until current reaches 30M, this makes sure we hit both anon + * and file if swap is enabled. + */ + retries = 5; + while (true) { + int err; + + current = cg_read_long(memcg, "memory.current"); + to_reclaim = current - MB(30); + + /* + * We only keep looping if we get EAGAIN, which means we could + * not reclaim the full amount. + */ + if (to_reclaim <= 0) + goto cleanup; + + + snprintf(buf, sizeof(buf), "%ld", to_reclaim); + err = cg_write(memcg, "memory.reclaim", buf); + if (!err) { + /* + * If writing succeeds, then the written amount should have been + * fully reclaimed (and maybe more). + */ + current = cg_read_long(memcg, "memory.current"); + if (!values_close(current, MB(30), 3) && current > MB(30)) + goto cleanup; + break; + } + + /* The kernel could not reclaim the full amount, try again. */ + if (err == -EAGAIN && retries--) + continue; + + /* We got an unexpected error or ran out of retries. 
*/ + goto cleanup; + } + + ret = KSFT_PASS; +cleanup: + cg_destroy(memcg); + free(memcg); + close(fd); + + return ret; +} + static int alloc_anon_50M_check_swap(const char *cgroup, void *arg) { long mem_max = (long)arg; @@ -1264,6 +1369,7 @@ struct memcg_test { T(test_memcg_high), T(test_memcg_high_sync), T(test_memcg_max), + T(test_memcg_reclaim), T(test_memcg_oom_events), T(test_memcg_swap_max), T(test_memcg_sock), -- cgit v1.2.3 From 1f4910b3affc982a94d665470f0f700225952108 Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Fri, 29 Apr 2022 14:37:00 -0700 Subject: damon: vaddr-test: tweak code to make the logic clearer Move these two lines into the damon_for_each_region loop, it is always for testing the last region. And also avoid to use a list iterator 'r' outside the loop which is considered harmful[1]. [1]: https://lkml.org/lkml/2022/2/17/1032 Link: https://lkml.kernel.org/r/20220328115252.31675-1-xiam0nd.tong@gmail.com Signed-off-by: Xiaomeng Tong Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr-test.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 1a55bb6c36c3..5431da4fe9d4 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -281,14 +281,16 @@ static void damon_test_split_evenly_succ(struct kunit *test, KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces); damon_for_each_region(r, t) { - if (i == nr_pieces - 1) + if (i == nr_pieces - 1) { + KUNIT_EXPECT_EQ(test, + r->ar.start, start + i * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, end); break; + } KUNIT_EXPECT_EQ(test, r->ar.start, start + i++ * expected_width); KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width); } - KUNIT_EXPECT_EQ(test, r->ar.start, start + i * expected_width); - KUNIT_EXPECT_EQ(test, r->ar.end, end); damon_free_target(t); } -- cgit v1.2.3 From 4f540f5ab4f22961ea3467cce075d77d255d493c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 29 Apr 2022 14:37:00 -0700 Subject: mm/damon/core-test: add a kunit test case for ops registration This commit adds a simple kunit test case for DAMON operations registration feature. 
Link: https://lkml.kernel.org/r/20220419122225.290518-1-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index b4085deb9fa0..573669566f84 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -232,6 +232,41 @@ static void damon_test_split_regions_of(struct kunit *test) damon_destroy_ctx(c); } +static void damon_test_ops_registration(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_operations ops, bak; + + /* DAMON_OPS_{V,P}ADDR are registered on subsys_initcall */ + KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_VADDR), 0); + KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_PADDR), 0); + + /* Double-registration is prohibited */ + ops.id = DAMON_OPS_VADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); + ops.id = DAMON_OPS_PADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); + + /* Unknown ops id cannot be registered */ + KUNIT_EXPECT_EQ(test, damon_select_ops(c, NR_DAMON_OPS), -EINVAL); + + /* Registration should success after unregistration */ + mutex_lock(&damon_ops_lock); + bak = damon_registered_ops[DAMON_OPS_VADDR]; + damon_registered_ops[DAMON_OPS_VADDR] = (struct damon_operations){}; + mutex_unlock(&damon_ops_lock); + + ops.id = DAMON_OPS_VADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), 0); + + mutex_lock(&damon_ops_lock); + damon_registered_ops[DAMON_OPS_VADDR] = bak; + mutex_unlock(&damon_ops_lock); + + /* Check double-registration failure again */ + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -240,6 +275,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_merge_two), KUNIT_CASE(damon_test_merge_regions_of), KUNIT_CASE(damon_test_split_regions_of), + KUNIT_CASE(damon_test_ops_registration), {}, }; -- cgit v1.2.3 From cef4493f1aaacf797bc4ab0565682ab1143bf580 Mon Sep 17 00:00:00 2001 From: Yu Zhe Date: Fri, 29 Apr 2022 14:37:00 -0700 Subject: mm/damon: remove unnecessary type castings Remove unnecessary void* type castings. Link: https://lkml.kernel.org/r/20220421153056.8474-1-yuzhe@nfschina.com Signed-off-by: Yu Zhe Cc: SeongJae Park Cc: liqiong Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 5ce8d7c867f0..5fe42e47c57b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1042,7 +1042,7 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) */ static int kdamond_fn(void *data) { - struct damon_ctx *ctx = (struct damon_ctx *)data; + struct damon_ctx *ctx = data; struct damon_target *t; struct damon_region *r, *next; unsigned int max_nr_accesses = 0; -- cgit v1.2.3 From 059342d1dd4e01d634184793fa3f8437e62afaa1 Mon Sep 17 00:00:00 2001 From: Hailong Tu Date: Fri, 29 Apr 2022 14:37:00 -0700 Subject: mm/damon/reclaim: fix the timer always stays active The timer stays active even if the reclaim mechanism is never enabled. It is unnecessary overhead can be completely avoided by using module_param_cb() for enabled flag. 
Link: https://lkml.kernel.org/r/20220421125910.1052459-1-tuhailong@gmail.com Signed-off-by: Hailong Tu Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e34c4d0c4d93..75cfd96a6060 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -28,7 +28,6 @@ * this. */ static bool enabled __read_mostly; -module_param(enabled, bool, 0600); /* * Time threshold for cold memory regions identification in microseconds. @@ -358,11 +357,35 @@ static void damon_reclaim_timer_fn(struct work_struct *work) enabled = last_enabled; } - schedule_delayed_work(&damon_reclaim_timer, + if (enabled) + schedule_delayed_work(&damon_reclaim_timer, msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS)); } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); +static int enabled_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + + if (enabled) + schedule_delayed_work(&damon_reclaim_timer, 0); + + return 0; +} + +static const struct kernel_param_ops enabled_param_ops = { + .set = enabled_store, + .get = param_get_bool, +}; + +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, + "Enable or disable DAMON_RECLAIM (default: disabled)"); + static int damon_reclaim_after_aggregation(struct damon_ctx *c) { struct damos *s; -- cgit v1.2.3 From 322842ea3c7264936b084ac949e9070ee47a5202 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:42 -0700 Subject: mm/rmap: fix missing swap_free() in try_to_unmap() after arch_unmap_one() failed Patch series "mm: COW fixes part 2: reliable GUP pins of anonymous pages", v4. This series is the result of the discussion on the previous approach [2]. More information on the general COW issues can be found there. It is based on latest linus/master (post v5.17, with relevant core-MM changes for v5.18-rc1). This series fixes memory corruptions when a GUP pin (FOLL_PIN) was taken on an anonymous page and COW logic fails to detect exclusivity of the page to then replacing the anonymous page by a copy in the page table: The GUP pin lost synchronicity with the pages mapped into the page tables. This issue, including other related COW issues, has been summarized in [3] under 3): " 3. Intra Process Memory Corruptions due to Wrong COW (FOLL_PIN) page_maybe_dma_pinned() is used to check if a page may be pinned for DMA (using FOLL_PIN instead of FOLL_GET). While false positives are tolerable, false negatives are problematic: pages that are pinned for DMA must not be added to the swapcache. If it happens, the (now pinned) page could be faulted back from the swapcache into page tables read-only. Future write-access would detect the pinning and COW the page, losing synchronicity. For the interested reader, this is nicely documented in feb889fb40fa ("mm: don't put pinned pages into the swap cache"). Peter reports [8] that page_maybe_dma_pinned() as used is racy in some cases and can result in a violation of the documented semantics: giving false negatives because of the race. There are cases where we call it without properly taking a per-process sequence lock, turning the usage of page_maybe_dma_pinned() racy. 
While one case (clear_refs SOFTDIRTY tracking, see below) seems to be easy to handle, there is especially one rmap case (shrink_page_list) that's hard to fix: in the rmap world, we're not limited to a single process. The shrink_page_list() issue is really subtle. If we race with someone pinning a page, we can trigger the same issue as in the FOLL_GET case. See the detail section at the end of this mail on a discussion how bad this can bite us with VFIO or other FOLL_PIN user. It's harder to reproduce, but I managed to modify the O_DIRECT reproducer to use io_uring fixed buffers [15] instead, which ends up using FOLL_PIN | FOLL_WRITE | FOLL_LONGTERM to pin buffer pages and can similarly trigger a loss of synchronicity and consequently a memory corruption. Again, the root issue is that a write-fault on a page that has additional references results in a COW and thereby a loss of synchronicity and consequently a memory corruption if two parties believe they are referencing the same page. " This series makes GUP pins (R/O and R/W) on anonymous pages fully reliable, especially also taking care of concurrent pinning via GUP-fast, for example, also fully fixing an issue reported regarding NUMA balancing [4] recently. While doing that, it further reduces "unnecessary COWs", especially when we don't fork()/KSM and don't swapout, and fixes the COW security for hugetlb for FOLL_PIN. In summary, we track via a pageflag (PG_anon_exclusive) whether a mapped anonymous page is exclusive. Exclusive anonymous pages that are mapped R/O can directly be mapped R/W by the COW logic in the write fault handler. Exclusive anonymous pages that want to be shared (fork(), KSM) first have to be marked shared -- which will fail if there are GUP pins on the page. GUP is only allowed to take a pin on anonymous pages that are exclusive. The PT lock is the primary mechanism to synchronize modifications of PG_anon_exclusive. We synchronize against GUP-fast either via the src_mm->write_protect_seq (during fork()) or via clear/invalidate+flush of the relevant page table entry. Special care has to be taken about swap, migration, and THPs (whereby a PMD-mapping can be converted to a PTE mapping and we have to track information for subpages). Besides these, we let the rmap code handle most magic. For reliable R/O pins of anonymous pages, we need FAULT_FLAG_UNSHARE logic as part of our previous approach [2], however, it's now 100% mapcount free and I further simplified it a bit. #1 is a fix #3-#10 are mostly rmap preparations for PG_anon_exclusive handling #11 introduces PG_anon_exclusive #12 uses PG_anon_exclusive and make R/W pins of anonymous pages reliable #13 is a preparation for reliable R/O pins #14 and #15 is reused/modified GUP-triggered unsharing for R/O GUP pins make R/O pins of anonymous pages reliable #16 adds sanity check when (un)pinning anonymous pages [1] https://lkml.kernel.org/r/20220131162940.210846-1-david@redhat.com [2] https://lkml.kernel.org/r/20211217113049.23850-1-david@redhat.com [3] https://lore.kernel.org/r/3ae33b08-d9ef-f846-56fb-645e3b9b4c66@redhat.com [4] https://bugzilla.kernel.org/show_bug.cgi?id=215616 This patch (of 17): In case arch_unmap_one() fails, we already did a swap_duplicate(). let's undo that properly via swap_free(). 
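The general shape of the fix, shown as a plain userspace sketch with made-up helper names rather than kernel interfaces: any side effect already taken must be rolled back on the failure path.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static void *take_swap_ref(void)   { return malloc(1); }	/* swap_duplicate() analogue */
static void drop_swap_ref(void *r) { free(r); }			/* swap_free() analogue */
static int arch_step(void)         { return -1; }		/* models arch_unmap_one() failing */

static bool unmap_one(void)
{
	void *ref = take_swap_ref();

	if (!ref)
		return false;
	if (arch_step() < 0) {
		drop_swap_ref(ref);	/* the undo this patch adds */
		return false;		/* bail out with state restored */
	}
	/* On success the reference is handed over to the swap entry. */
	return true;
}

int main(void)
{
	printf("unmap_one: %s\n",
	       unmap_one() ? "ok" : "failed, reference dropped");
	return 0;
}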
Link: https://lkml.kernel.org/r/20220428083441.37290-1-david@redhat.com Link: https://lkml.kernel.org/r/20220428083441.37290-2-david@redhat.com Fixes: ca827d55ebaa ("mm, swap: Add infrastructure for saving page metadata on swap") Signed-off-by: David Hildenbrand Reviewed-by: Khalid Aziz Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: David Rientjes Cc: Shakeel Butt Cc: John Hubbard Cc: Jason Gunthorpe Cc: Mike Kravetz Cc: Mike Rapoport Cc: Yang Shi Cc: "Kirill A. Shutemov" Cc: "Matthew Wilcox (Oracle)" Cc: Jann Horn Cc: Michal Hocko Cc: Nadav Amit Cc: Rik van Riel Cc: Roman Gushchin Cc: Andrea Arcangeli Cc: Peter Xu Cc: Don Dutile Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: Jan Kara Cc: Liang Zhang Cc: Pedro Demarchi Gomes Cc: Oded Gabbay Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/rmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/rmap.c b/mm/rmap.c index ad509b3a89ee..91a63dc636ad 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1681,6 +1681,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } if (arch_unmap_one(mm, vma, address, pteval) < 0) { + swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); -- cgit v1.2.3 From 623a1ddfeb232526275ddd0c8378771e6712aad4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:42 -0700 Subject: mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range() Let's do it just like copy_page_range(), taking the seqlock and making sure the mmap_lock is held in write mode. This allows for add a VM_BUG_ON to page_needs_cow_for_dma() and properly synchronizes concurrent fork() with GUP-fast of hugetlb pages, which will be relevant for further changes. Link: https://lkml.kernel.org/r/20220428083441.37290-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++++ mm/hugetlb.c | 8 ++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index b9316b2c11ce..a02812178562 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1576,6 +1576,8 @@ static inline bool page_maybe_dma_pinned(struct page *page) /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for a page on the src mm. + * + * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. 
*/ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, struct page *page) @@ -1583,6 +1585,8 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, if (!is_cow_mapping(vma->vm_flags)) return false; + VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); + if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4e017940b26f..7389cd8a9a87 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4735,6 +4735,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); + mmap_assert_write_locked(src); + raw_write_seqcount_begin(&src->write_protect_seq); } else { /* * For shared mappings i_mmap_rwsem must be held to call @@ -4867,10 +4869,12 @@ again: spin_unlock(dst_ptl); } - if (cow) + if (cow) { + raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); - else + } else { i_mmap_unlock_read(mapping); + } return ret; } -- cgit v1.2.3 From b51ad4f8679e50284ce35ff671767f8f0309b64a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:42 -0700 Subject: mm/memory: slightly simplify copy_present_pte() Let's move the pinning check into the caller, to simplify return code logic and prepare for further changes: relocating the page_needs_cow_for_dma() into rmap handling code. While at it, remove the unused pte parameter and simplify the comments a bit. No functional change intended. Link: https://lkml.kernel.org/r/20220428083441.37290-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/memory.c | 53 ++++++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e873a6143181..9bdfce15e98f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -862,19 +862,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } /* - * Copy a present and normal page if necessary. + * Copy a present and normal page. * - * NOTE! The usual case is that this doesn't need to do - * anything, and can just return a positive value. That - * will let the caller know that it can just increase - * the page refcount and re-use the pte the traditional - * way. - * - * But _if_ we need to copy it because it needs to be - * pinned in the parent (and the child should get its own - * copy rather than just a reference to the same page), - * we'll do that here and return zero to let the caller - * know we're done. + * NOTE! The usual case is that this isn't required; + * instead, the caller can just increase the page refcount + * and re-use the pte the traditional way. 
* * And if we need a pre-allocated page but don't yet have * one, return a negative error to let the preallocation @@ -884,25 +876,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, static inline int copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct page **prealloc, pte_t pte, struct page *page) + struct page **prealloc, struct page *page) { struct page *new_page; - - /* - * What we want to do is to check whether this page may - * have been pinned by the parent process. If so, - * instead of wrprotect the pte on both sides, we copy - * the page immediately so that we'll always guarantee - * the pinned page won't be randomly replaced in the - * future. - * - * The page pinning checks are just "has this mm ever - * seen pinning", along with the (inexact) check of - * the page count. That might give false positives for - * for pinning, but it will work correctly. - */ - if (likely(!page_needs_cow_for_dma(src_vma, page))) - return 1; + pte_t pte; new_page = *prealloc; if (!new_page) @@ -944,14 +921,16 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct page *page; page = vm_normal_page(src_vma, addr, pte); - if (page) { - int retval; - - retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, prealloc, pte, page); - if (retval <= 0) - return retval; - + if (page && unlikely(page_needs_cow_for_dma(src_vma, page))) { + /* + * If this page may have been pinned by the parent process, + * copy the page immediately for the child so that we'll always + * guarantee the pinned page won't be randomly replaced in the + * future. + */ + return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, page); + } else if (page) { get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; -- cgit v1.2.3 From fb3d824d1a46c5bb0584ea88f32dc2495544aebf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap() ... and move the special check for pinned pages into page_try_dup_anon_rmap() to prepare for tracking exclusive anonymous pages via a new pageflag, clearing it only after making sure that there are no GUP pins on the anonymous page. We really only care about pins on anonymous pages, because they are prone to getting replaced in the COW handler once mapped R/O. For !anon pages in cow-mappings (!VM_SHARED && VM_MAYWRITE) we shouldn't really care about that, at least not that I could come up with an example. Let's drop the is_cow_mapping() check from page_needs_cow_for_dma(), as we know we're dealing with anonymous pages. Also, drop the handling of pinned pages from copy_huge_pud() and add a comment if ever supporting anonymous pages on the PUD level. This is a preparation for tracking exclusivity of anonymous pages in the rmap code, and disallowing marking a page shared (-> failing to duplicate) if there are GUP pins on a page. Link: https://lkml.kernel.org/r/20220428083441.37290-5-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +---- include/linux/rmap.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- mm/huge_memory.c | 27 ++++++++------------------- mm/hugetlb.c | 16 +++++++++------- mm/memory.c | 17 ++++++++++++----- mm/migrate.c | 2 +- 6 files changed, 79 insertions(+), 37 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a02812178562..9ee3ae51e8e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1575,16 +1575,13 @@ static inline bool page_maybe_dma_pinned(struct page *page) /* * This should most likely only be called during fork() to see whether we - * should break the cow immediately for a page on the src mm. + * should break the cow immediately for an anon page on the src mm. * * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. */ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, struct page *page) { - if (!is_cow_mapping(vma->vm_flags)) - return false; - VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 8573aae50d96..73f41544084f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -12,6 +12,7 @@ #include #include #include +#include /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -182,11 +183,57 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -static inline void page_dup_rmap(struct page *page, bool compound) +static inline void __page_dup_rmap(struct page *page, bool compound) { atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); } +static inline void page_dup_file_rmap(struct page *page, bool compound) +{ + __page_dup_rmap(page, compound); +} + +/** + * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped + * anonymous page + * @page: the page to duplicate the mapping for + * @compound: the page is mapped as compound or as a small page + * @vma: the source vma + * + * The caller needs to hold the PT lock and the vma->vma_mm->write_protect_seq. + * + * Duplicating the mapping can only fail if the page may be pinned; device + * private pages cannot get pinned and consequently this function cannot fail. + * + * If duplicating the mapping succeeds, the page has to be mapped R/O into + * the parent and the child. It must *not* get mapped writable after this call. + * + * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. + */ +static inline int page_try_dup_anon_rmap(struct page *page, bool compound, + struct vm_area_struct *vma) +{ + VM_BUG_ON_PAGE(!PageAnon(page), page); + + /* + * If this page may have been pinned by the parent process, + * don't allow to duplicate the mapping but instead require to e.g., + * copy the page immediately for the child so that we'll always + * guarantee the pinned page won't be randomly replaced in the + * future on write faults. 
+ */ + if (likely(!is_device_private_page(page) && + unlikely(page_needs_cow_for_dma(vma, page)))) + return -EBUSY; + + /* + * It's okay to share the anon page between both processes, mapping + * the page R/O into both processes. + */ + __page_dup_rmap(page, compound); + return 0; +} + /* * Called from mm/vmscan.c to handle paging out */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c468fee595ff..baf4ea6d8e1a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1097,23 +1097,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); - /* - * If this page is a potentially pinned page, split and retry the fault - * with smaller page size. Normally this should not happen because the - * userspace should use MADV_DONTFORK upon pinned regions. This is a - * best effort that the pinned pages won't be replaced by another - * random page during the coming copy-on-write. - */ - if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) { + get_page(src_page); + if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) { + /* Page maybe pinned: split and retry the fault on PTEs. */ + put_page(src_page); pte_free(dst_mm, pgtable); spin_unlock(src_ptl); spin_unlock(dst_ptl); __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); return -EAGAIN; } - - get_page(src_page); - page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); out_zero_page: mm_inc_nr_ptes(dst_mm); @@ -1217,14 +1210,10 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* No huge zero pud yet */ } - /* Please refer to comments in copy_huge_pmd() */ - if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) { - spin_unlock(src_ptl); - spin_unlock(dst_ptl); - __split_huge_pud(vma, src_pud, addr); - return -EAGAIN; - } - + /* + * TODO: once we support anonymous pages, use page_try_dup_anon_rmap() + * and split if duplicating fails. + */ pudp_set_wrprotect(src_mm, addr, src_pud); pud = pud_mkold(pud_wrprotect(pud)); set_pud_at(dst_mm, addr, dst_pud, pud); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7389cd8a9a87..7a6052a984e9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4806,15 +4806,18 @@ again: get_page(ptepage); /* - * This is a rare case where we see pinned hugetlb - * pages while they're prone to COW. We need to do the - * COW earlier during fork. + * Failing to duplicate the anon rmap is a rare case + * where we see pinned hugetlb pages while they're + * prone to COW. We need to do the COW earlier during + * fork. * * When pre-allocating the page or copying data, we * need to be without the pgtable locks since we could * sleep during the process. 
*/ - if (unlikely(page_needs_cow_for_dma(vma, ptepage))) { + if (!PageAnon(ptepage)) { + page_dup_file_rmap(ptepage, true); + } else if (page_try_dup_anon_rmap(ptepage, true, vma)) { pte_t src_pte_old = entry; struct page *new; @@ -4861,7 +4864,6 @@ again: entry = huge_pte_wrprotect(entry); } - page_dup_rmap(ptepage, true); set_huge_pte_at(dst, addr, dst_pte, entry); hugetlb_count_add(npages, dst); } @@ -5541,7 +5543,7 @@ retry: ClearHPageRestoreReserve(page); hugepage_add_new_anon_rmap(page, vma, haddr); } else - page_dup_rmap(page, true); + page_dup_file_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, haddr, ptep, new_pte); @@ -5902,7 +5904,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_unlock; if (vm_shared) { - page_dup_rmap(page, true); + page_dup_file_rmap(page, true); } else { ClearHPageRestoreReserve(page); hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); diff --git a/mm/memory.c b/mm/memory.c index 9bdfce15e98f..9f5f829a1b1f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -825,7 +825,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ get_page(page); rss[mm_counter(page)]++; - page_dup_rmap(page, false); + /* Cannot fail as these pages cannot get pinned. */ + BUG_ON(page_try_dup_anon_rmap(page, false, src_vma)); /* * We do not preserve soft-dirty information, because so @@ -921,18 +922,24 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct page *page; page = vm_normal_page(src_vma, addr, pte); - if (page && unlikely(page_needs_cow_for_dma(src_vma, page))) { + if (page && PageAnon(page)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, prealloc, page); + get_page(page); + if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) { + /* Page maybe pinned, we have to copy. */ + put_page(page); + return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, page); + } + rss[mm_counter(page)]++; } else if (page) { get_page(page); - page_dup_rmap(page, false); + page_dup_file_rmap(page, false); rss[mm_counter(page)]++; } diff --git a/mm/migrate.c b/mm/migrate.c index 77592288fbc0..6ca6e89ded94 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -234,7 +234,7 @@ static bool remove_migration_pte(struct folio *folio, if (folio_test_anon(folio)) hugepage_add_anon_rmap(new, vma, pvmw.address); else - page_dup_rmap(new, true); + page_dup_file_rmap(new, true); set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } else #endif -- cgit v1.2.3 From 14f9135d547060d1d0c182501201f8da19895fe3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: convert RMAP flags to a proper distinct rmap_t type We want to pass the flags to more than one anon rmap function, getting rid of special "do_page_add_anon_rmap()". So let's pass around a distinct __bitwise type and refine documentation. Link: https://lkml.kernel.org/r/20220428083441.37290-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 22 ++++++++++++++++++---- mm/memory.c | 6 +++--- mm/rmap.c | 7 ++++--- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 73f41544084f..c53a38151089 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -160,9 +160,23 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *page_get_anon_vma(struct page *page); -/* bitflags for do_page_add_anon_rmap() */ -#define RMAP_EXCLUSIVE 0x01 -#define RMAP_COMPOUND 0x02 +/* RMAP flags, currently only relevant for some anon rmap operations. */ +typedef int __bitwise rmap_t; + +/* + * No special request: if the page is a subpage of a compound page, it is + * mapped via a PTE. The mapped (sub)page is possibly shared between processes. + */ +#define RMAP_NONE ((__force rmap_t)0) + +/* The (sub)page is exclusive to a single process. */ +#define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) + +/* + * The compound page is not mapped via PTEs, but instead via a single PMD and + * should be accounted accordingly. + */ +#define RMAP_COMPOUND ((__force rmap_t)BIT(1)) /* * rmap interfaces called when adding or removing pte of page @@ -171,7 +185,7 @@ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, bool compound); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address, int flags); + unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, bool compound); void page_add_file_rmap(struct page *, struct vm_area_struct *, diff --git a/mm/memory.c b/mm/memory.c index 9f5f829a1b1f..4cd8cadf1268 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3511,10 +3511,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; struct swap_info_struct *si = NULL; + rmap_t rmap_flags = RMAP_NONE; swp_entry_t entry; pte_t pte; int locked; - int exclusive = 0; vm_fault_t ret = 0; void *shadow = NULL; @@ -3689,7 +3689,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; - exclusive = RMAP_EXCLUSIVE; + rmap_flags |= RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) @@ -3705,7 +3705,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_inactive_or_unevictable(page, vma); } else { - do_page_add_anon_rmap(page, vma, vmf->address, exclusive); + do_page_add_anon_rmap(page, vma, vmf->address, rmap_flags); } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); diff --git a/mm/rmap.c b/mm/rmap.c index 91a63dc636ad..23a41132995e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1181,7 +1181,8 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, bool compound) { - do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0); + do_page_add_anon_rmap(page, vma, address, + compound ? 
RMAP_COMPOUND : RMAP_NONE); } /* @@ -1190,7 +1191,7 @@ void page_add_anon_rmap(struct page *page, * Everybody else should continue to use page_add_anon_rmap above. */ void do_page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, int flags) + struct vm_area_struct *vma, unsigned long address, rmap_t flags) { bool compound = flags & RMAP_COMPOUND; bool first; @@ -1229,7 +1230,7 @@ void do_page_add_anon_rmap(struct page *page, /* address might be in next vma when migration races vma_adjust */ else if (first) __page_set_anon_rmap(page, vma, address, - flags & RMAP_EXCLUSIVE); + !!(flags & RMAP_EXCLUSIVE)); else __page_check_anon_rmap(page, vma, address); -- cgit v1.2.3 From f1e2db12e45baaa2d366f87c885968096c2ff5aa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: remove do_page_add_anon_rmap() ... and instead convert page_add_anon_rmap() to accept flags. Passing flags instead of bools is usually nicer either way, and we want to more often also pass RMAP_EXCLUSIVE in follow up patches when detecting that an anonymous page is exclusive: for example, when restoring an anonymous page from a writable migration entry. This is a preparation for marking an anonymous page inside page_add_anon_rmap() as exclusive when RMAP_EXCLUSIVE is passed. Link: https://lkml.kernel.org/r/20220428083441.37290-7-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 -- mm/huge_memory.c | 2 +- mm/ksm.c | 2 +- mm/memory.c | 4 ++-- mm/migrate.c | 3 ++- mm/rmap.c | 14 +------------- mm/swapfile.c | 2 +- 7 files changed, 8 insertions(+), 21 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c53a38151089..643801a937f3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -183,8 +183,6 @@ typedef int __bitwise rmap_t; */ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address, bool compound); -void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, bool compound); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index baf4ea6d8e1a..6232b6817fab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3072,7 +3072,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); if (PageAnon(new)) - page_add_anon_rmap(new, vma, mmun_start, true); + page_add_anon_rmap(new, vma, mmun_start, RMAP_COMPOUND); else page_add_file_rmap(new, vma, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); diff --git a/mm/ksm.c b/mm/ksm.c index 94bb0f049806..2b6692a7df6a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1156,7 +1156,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, */ if (!is_zero_pfn(page_to_pfn(kpage))) { get_page(kpage); - page_add_anon_rmap(kpage, vma, addr, false); + 
page_add_anon_rmap(kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), diff --git a/mm/memory.c b/mm/memory.c index 4cd8cadf1268..8e92010f3d89 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -725,7 +725,7 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, * created when the swap entry was made. */ if (PageAnon(page)) - page_add_anon_rmap(page, vma, address, false); + page_add_anon_rmap(page, vma, address, RMAP_NONE); else /* * Currently device exclusive access only supports anonymous @@ -3705,7 +3705,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_inactive_or_unevictable(page, vma); } else { - do_page_add_anon_rmap(page, vma, vmf->address, rmap_flags); + page_add_anon_rmap(page, vma, vmf->address, rmap_flags); } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); diff --git a/mm/migrate.c b/mm/migrate.c index 6ca6e89ded94..381c7e05e8d4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -240,7 +240,8 @@ static bool remove_migration_pte(struct folio *folio, #endif { if (folio_test_anon(folio)) - page_add_anon_rmap(new, vma, pvmw.address, false); + page_add_anon_rmap(new, vma, pvmw.address, + RMAP_NONE); else page_add_file_rmap(new, vma, false); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); diff --git a/mm/rmap.c b/mm/rmap.c index 23a41132995e..a0cbbf201c98 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1171,7 +1171,7 @@ static void __page_check_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped - * @compound: charge the page as compound or small page + * @flags: the rmap flags * * The caller needs to hold the pte lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, @@ -1179,18 +1179,6 @@ static void __page_check_anon_rmap(struct page *page, * (but PageKsm is never downgraded to PageAnon). */ void page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, bool compound) -{ - do_page_add_anon_rmap(page, vma, address, - compound ? RMAP_COMPOUND : RMAP_NONE); -} - -/* - * Special version of the above for do_swap_page, which often runs - * into pages that are exclusively owned by the current process. - * Everybody else should continue to use page_add_anon_rmap above. - */ -void do_page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { bool compound = flags & RMAP_COMPOUND; diff --git a/mm/swapfile.c b/mm/swapfile.c index 63c61f8b2611..1ba525a2179d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1800,7 +1800,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr, false); + page_add_anon_rmap(page, vma, addr, RMAP_NONE); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); -- cgit v1.2.3 From 28c5209dfd5f86f4398ce01bfac8508b2c4d4050 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: pass rmap flags to hugepage_add_anon_rmap() Let's prepare for passing RMAP_EXCLUSIVE, similarly as we do for page_add_anon_rmap() now. RMAP_COMPOUND is implicit for hugetlb pages and ignored. 
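As a rough illustration (not code lifted verbatim from this patch), a caller restoring a hugetlb migration entry can now plumb the flag through as sketched below; was_exclusive is a hypothetical local standing in for the information a later patch derives from the migration entry type:

        /* Sketch only: rmap_t flag plumbing enabled by this change. */
        rmap_t rmap_flags = RMAP_NONE;

        if (was_exclusive)      /* hypothetical; later derived from the entry type */
                rmap_flags |= RMAP_EXCLUSIVE;

        if (folio_test_anon(folio))
                hugepage_add_anon_rmap(new, vma, pvmw.address, rmap_flags);
        else
                page_dup_file_rmap(new, true);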
Link: https://lkml.kernel.org/r/20220428083441.37290-8-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/migrate.c | 3 ++- mm/rmap.c | 9 ++++++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 643801a937f3..90e1e6925789 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -191,7 +191,7 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *, void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address); + unsigned long address, rmap_t flags); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); diff --git a/mm/migrate.c b/mm/migrate.c index 381c7e05e8d4..92e932f08be5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -232,7 +232,8 @@ static bool remove_migration_pte(struct folio *folio, pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (folio_test_anon(folio)) - hugepage_add_anon_rmap(new, vma, pvmw.address); + hugepage_add_anon_rmap(new, vma, pvmw.address, + RMAP_NONE); else page_dup_file_rmap(new, true); set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); diff --git a/mm/rmap.c b/mm/rmap.c index a0cbbf201c98..32630f1b1ee1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2391,9 +2391,11 @@ void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc) * The following two functions are for anonymous (private mapped) hugepages. * Unlike common anonymous pages, anonymous hugepages have no accounting code * and no lru code, because we handle hugepages differently from common pages. + * + * RMAP_COMPOUND is ignored. */ -void hugepage_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) +void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address, rmap_t flags) { struct anon_vma *anon_vma = vma->anon_vma; int first; @@ -2403,7 +2405,8 @@ void hugepage_add_anon_rmap(struct page *page, /* address might be in next vma when migration races vma_adjust */ first = atomic_inc_and_test(compound_mapcount_ptr(page)); if (first) - __page_set_anon_rmap(page, vma, address, 0); + __page_set_anon_rmap(page, vma, address, + !!(flags & RMAP_EXCLUSIVE)); } void hugepage_add_new_anon_rmap(struct page *page, -- cgit v1.2.3 From 40f2bbf71161fa9195c7869004290003af152375 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: drop "compound" parameter from page_add_new_anon_rmap() New anonymous pages are always mapped natively: only THP/khugepaged code maps a new compound anonymous page and passes "true". Otherwise, we're just dealing with simple, non-compound pages. Let's give the interface clearer semantics and document these. Remove the PageTransCompound() sanity check from page_add_new_anon_rmap(). 
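For illustration, the call-site change looks as follows (a sketch of the interface change, not extra code in the patch); compound-ness is now derived internally via PageCompound():

        /* Before: the caller had to state whether the new anon page is compound. */
        page_add_new_anon_rmap(new_page, vma, addr, false);

        /* After: the rmap code figures it out from the page itself. */
        page_add_new_anon_rmap(new_page, vma, addr);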
Link: https://lkml.kernel.org/r/20220428083441.37290-9-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 ++- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 10 +++++----- mm/migrate_device.c | 2 +- mm/rmap.c | 11 ++++++----- mm/swapfile.c | 2 +- mm/userfaultfd.c | 2 +- 9 files changed, 19 insertions(+), 17 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 90e1e6925789..e4156921eea9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -185,11 +185,12 @@ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address, bool compound); + unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); + void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6418083901d4..4ef5385815d3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -180,7 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); + page_add_new_anon_rmap(new_page, vma, addr); lru_cache_add_inactive_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6232b6817fab..c0365280b481 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -647,7 +647,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr, true); + page_add_new_anon_rmap(page, vma, haddr); lru_cache_add_inactive_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ac53ad2c9bb1..a2560f970881 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1183,7 +1183,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address, true); + page_add_new_anon_rmap(new_page, vma, address); lru_cache_add_inactive_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); diff --git a/mm/memory.c b/mm/memory.c index 8e92010f3d89..3dedb575baef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -893,7 +893,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma *prealloc = NULL; copy_user_highpage(new_page, page, 
addr, src_vma); __SetPageUptodate(new_page); - page_add_new_anon_rmap(new_page, dst_vma, addr, false); + page_add_new_anon_rmap(new_page, dst_vma, addr); lru_cache_add_inactive_or_unevictable(new_page, dst_vma); rss[mm_counter(new_page)]++; @@ -3058,7 +3058,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * some TLBs while the old PTE remains in others. */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); - page_add_new_anon_rmap(new_page, vma, vmf->address, false); + page_add_new_anon_rmap(new_page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -3702,7 +3702,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(page, vma); } else { page_add_anon_rmap(page, vma, vmf->address, rmap_flags); @@ -3852,7 +3852,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -4039,7 +4039,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr, false); + page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 70c7dc05bbfc..fb6d7d5499f5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -610,7 +610,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr, false); + page_add_new_anon_rmap(page, vma, addr); if (!is_zone_device_page(page)) lru_cache_add_inactive_or_unevictable(page, vma); get_page(page); diff --git a/mm/rmap.c b/mm/rmap.c index 32630f1b1ee1..90f92c53476f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1226,19 +1226,22 @@ void page_add_anon_rmap(struct page *page, } /** - * page_add_new_anon_rmap - add pte mapping to a new anonymous page + * page_add_new_anon_rmap - add mapping to a new anonymous page * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped - * @compound: charge the page as compound or small page + * + * If it's a compound page, it is accounted as a compound page. As the page + * is new, it's assume to get mapped exclusively by a single process. * * Same as page_add_anon_rmap but must only be called on *new* pages. * This means the inc-and-test can be bypassed. * Page does not have to be locked. */ void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, bool compound) + struct vm_area_struct *vma, unsigned long address) { + const bool compound = PageCompound(page); int nr = compound ? 
thp_nr_pages(page) : 1; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); @@ -1251,8 +1254,6 @@ void page_add_new_anon_rmap(struct page *page, __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } else { - /* Anon THP always mapped first with PMD */ - VM_BUG_ON_PAGE(PageTransCompound(page), page); /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 1ba525a2179d..0ad7ed7ded21 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1802,7 +1802,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (page == swapcache) { page_add_anon_rmap(page, vma, addr, RMAP_NONE); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr, false); + page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } set_pte_at(vma->vm_mm, addr, pte, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e9bb6db002aa..dae25d985d15 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -104,7 +104,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, lru_cache_add(page); page_add_file_rmap(page, dst_vma, false); } else { - page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + page_add_new_anon_rmap(page, dst_vma, dst_addr); lru_cache_add_inactive_or_unevictable(page, dst_vma); } -- cgit v1.2.3 From 6c54dc6c74371eebf7eddc16b4f64b8c841c1585 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively We want to mark anonymous pages exclusive, and when using page_move_anon_rmap() we know that we are the exclusive user, as properly documented. This is a preparation for marking anonymous pages exclusive in page_move_anon_rmap(). In both instances, we're holding page lock and are sure that we're the exclusive owner (page_count() == 1). hugetlb already properly uses page_move_anon_rmap() in the write fault handler. Note that in case of a PTE-mapped THP, we'll only end up calling this function if the whole THP is only referenced by the single PTE mapping a single subpage (page_count() == 1); consequently, it's fine to modify the compound page mapping inside page_move_anon_rmap(). Link: https://lkml.kernel.org/r/20220428083441.37290-10-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 ++ mm/memory.c | 1 + 2 files changed, 3 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c0365280b481..04c9446b87a8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1317,6 +1317,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) try_to_free_swap(page); if (page_count(page) == 1) { pmd_t entry; + + page_move_anon_rmap(page, vma); entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) diff --git a/mm/memory.c b/mm/memory.c index 3dedb575baef..d4dba178e130 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3303,6 +3303,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * and the page is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ + page_move_anon_rmap(page, vma); unlock_page(page); wp_page_reuse(vmf); return VM_FAULT_WRITE; -- cgit v1.2.3 From 500539419fae0aeb27189b2d62a238a056ca6742 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:44 -0700 Subject: mm/huge_memory: remove outdated VM_WARN_ON_ONCE_PAGE from unmap_page() We can already theoretically fail to unmap (still having page_mapped()) in case arch_unmap_one() fails, which can happen on sparc. Failures to unmap are handled gracefully, just as if there are other references on the target page: freezing the refcount in split_huge_page_to_list() will fail if still mapped and we'll simply remap. In commit 504e070dc08f ("mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split") we already converted to VM_WARN_ON_ONCE_PAGE, let's get rid of it completely now. This is a preparation for making try_to_migrate() fail on anonymous pages with GUP pins, which will make this VM_WARN_ON_ONCE_PAGE trigger more frequently. Link: https://lkml.kernel.org/r/20220428083441.37290-11-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: Yang Shi Reviewed-by: Yang Shi Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 04c9446b87a8..cb5bcd833d9e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2240,8 +2240,6 @@ static void unmap_page(struct page *page) try_to_migrate(folio, ttu_flags); else try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); - - VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); } static void remap_page(struct folio *folio, unsigned long nr) -- cgit v1.2.3 From 78fbe906cc900b33ce078102e13e0e99b5b8c406 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:44 -0700 Subject: mm/page-flags: reuse PG_mappedtodisk as PG_anon_exclusive for PageAnon() pages The basic question we would like to have a reliable and efficient answer to is: is this anonymous page exclusive to a single process or might it be shared? We need that information for ordinary/single pages, hugetlb pages, and possibly each subpage of a THP. Introduce a way to mark an anonymous page as exclusive, with the ultimate goal of teaching our COW logic to not do "wrong COWs", whereby GUP pins lose consistency with the pages mapped into the page table, resulting in reported memory corruptions. Most pageflags already have semantics for anonymous pages, however, PG_mappedtodisk should never apply to pages in the swapcache, so let's reuse that flag. As PG_has_hwpoisoned also uses that flag on the second tail page of a compound page, convert it to PG_error instead, which is marked as PF_NO_TAIL, so never used for tail pages. Use custom page flag modification functions such that we can do additional sanity checks. The semantics we'll put into some kernel doc in the future are: " PG_anon_exclusive is *usually* only expressive in combination with a page table entry. Depending on the page table entry type it might store the following information: Is what's mapped via this page table entry exclusive to the single process and can be mapped writable without further checks? If not, it might be shared and we might have to COW. For now, we only expect PTE-mapped THPs to make use of PG_anon_exclusive in subpages. For other anonymous compound folios (i.e., hugetlb), only the head page is logically mapped and holds this information. For example, an exclusive, PMD-mapped THP only has PG_anon_exclusive set on the head page. When replacing the PMD by a page table full of PTEs, PG_anon_exclusive, if set on the head page, will be set on all tail pages accordingly. Note that converting from a PTE-mapping to a PMD mapping using the same compound page is currently not possible and consequently doesn't require care. If GUP wants to take a reliable pin (FOLL_PIN) on an anonymous page, it should only pin if the relevant PG_anon_exclusive is set. In that case, the pin will be fully reliable and stay consistent with the pages mapped into the page table, as the bit cannot get cleared (e.g., by fork(), KSM) while the page is pinned. For anonymous pages that are mapped R/W, PG_anon_exclusive can be assumed to always be set because such pages cannot possibly be shared. 
The page table lock protecting the page table entry is the primary synchronization mechanism for PG_anon_exclusive; GUP-fast that does not take the PT lock needs special care when trying to clear the flag. Page table entry types and PG_anon_exclusive: * Present: PG_anon_exclusive applies. * Swap: the information is lost. PG_anon_exclusive was cleared. * Migration: the entry holds this information instead. PG_anon_exclusive was cleared. * Device private: PG_anon_exclusive applies. * Device exclusive: PG_anon_exclusive applies. * HW Poison: PG_anon_exclusive is stale and not changed. If the page may be pinned (FOLL_PIN), clearing PG_anon_exclusive is not allowed and the flag will stick around until the page is freed and folio->mapping is cleared. " We won't be clearing PG_anon_exclusive on destructive unmapping (i.e., zapping) of page table entries, page freeing code will handle that when also invalidate page->mapping to not indicate PageAnon() anymore. Letting information about exclusivity stick around will be an important property when adding sanity checks to unpinning code. Note that we properly clear the flag in free_pages_prepare() via PAGE_FLAGS_CHECK_AT_PREP for each individual subpage of a compound page, so there is no need to manually clear the flag. Link: https://lkml.kernel.org/r/20220428083441.37290-12-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 39 ++++++++++++++++++++++++++++++++++++++- mm/hugetlb.c | 2 ++ mm/memory.c | 11 +++++++++++ mm/memremap.c | 9 +++++++++ mm/swapfile.c | 4 ++++ tools/vm/page-types.c | 8 +++++++- 6 files changed, 71 insertions(+), 2 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1ea896887ee4..b70124b9c7c1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -142,6 +142,15 @@ enum pageflags { PG_readahead = PG_reclaim, + /* + * Depending on the way an anonymous folio can be mapped into a page + * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped + * THP), PG_anon_exclusive may be set only for the head page or for + * tail pages of an anonymous folio. For now, we only expect it to be + * set on tail pages for PTE-mapped THP. + */ + PG_anon_exclusive = PG_mappedtodisk, + /* Filesystems */ PG_checked = PG_owner_priv_1, @@ -176,7 +185,7 @@ enum pageflags { * Indicates that at least one subpage is hwpoisoned in the * THP. 
*/ - PG_has_hwpoisoned = PG_mappedtodisk, + PG_has_hwpoisoned = PG_error, #endif /* non-lru isolated movable page */ @@ -1002,6 +1011,34 @@ extern bool is_free_buddy_page(struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); +static __always_inline int PageAnonExclusive(struct page *page) +{ + VM_BUG_ON_PGFLAGS(!PageAnon(page), page); + VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); + return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); +} + +static __always_inline void SetPageAnonExclusive(struct page *page) +{ + VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page); + VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); + set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); +} + +static __always_inline void ClearPageAnonExclusive(struct page *page) +{ + VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page); + VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); + clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); +} + +static __always_inline void __ClearPageAnonExclusive(struct page *page) +{ + VM_BUG_ON_PGFLAGS(!PageAnon(page), page); + VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); + __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); +} + #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a6052a984e9..03cbb75bcb54 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1677,6 +1677,8 @@ void free_huge_page(struct page *page) VM_BUG_ON_PAGE(page_mapcount(page), page); hugetlb_set_page_subpool(page, NULL); + if (PageAnon(page)) + __ClearPageAnonExclusive(page); page->mapping = NULL; restore_reserve = HPageRestoreReserve(page); ClearHPageRestoreReserve(page); diff --git a/mm/memory.c b/mm/memory.c index d4dba178e130..0b0727758c86 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3667,6 +3667,17 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_nomap; } + /* + * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte + * must never point at an anonymous page in the swapcache that is + * PG_anon_exclusive. Sanity check that this holds and especially, that + * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity + * check after taking the PT lock and making sure that nobody + * concurrently faulted in this page and set PG_anon_exclusive. + */ + BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); + BUG_ON(PageAnon(page) && PageAnonExclusive(page)); + /* * Remove the swap entry and conditionally try to free up the swapcache. * We're already holding a reference on the page but haven't mapped it diff --git a/mm/memremap.c b/mm/memremap.c index c33bcd0398c9..2b92e97cb25b 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -459,6 +459,15 @@ void free_zone_device_page(struct page *page) mem_cgroup_uncharge(page_folio(page)); + /* + * Note: we don't expect anonymous compound pages yet. Once supported + * and we could PTE-map them similar to THP, we'd have to clear + * PG_anon_exclusive on all tail pages. + */ + VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page); + if (PageAnon(page)) + __ClearPageAnonExclusive(page); + /* * When a device managed page is freed, the page->mapping field * may still contain a (stale) mapping value. 
For example, the diff --git a/mm/swapfile.c b/mm/swapfile.c index 0ad7ed7ded21..a7847324d476 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1796,6 +1796,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } + /* See do_swap_page() */ + BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); + BUG_ON(PageAnon(page) && PageAnonExclusive(page)); + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index b1ed76d9a979..381dcc00cb62 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -80,9 +80,10 @@ #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 -/* [48-] take some arbitrary free slots for expanding overloaded flags +/* [47-] take some arbitrary free slots for expanding overloaded flags * not part of kernel API */ +#define KPF_ANON_EXCLUSIVE 47 #define KPF_READAHEAD 48 #define KPF_SLOB_FREE 49 #define KPF_SLUB_FROZEN 50 @@ -138,6 +139,7 @@ static const char * const page_flag_names[] = { [KPF_SOFTDIRTY] = "f:softdirty", [KPF_ARCH_2] = "H:arch_2", + [KPF_ANON_EXCLUSIVE] = "d:anon_exclusive", [KPF_READAHEAD] = "I:readahead", [KPF_SLOB_FREE] = "P:slob_free", [KPF_SLUB_FROZEN] = "A:slub_frozen", @@ -472,6 +474,10 @@ static int bit_mask_ok(uint64_t flags) static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme) { + /* Anonymous pages overload PG_mappedtodisk */ + if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK))) + flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE); + /* SLOB/SLUB overload several page flags */ if (flags & BIT(SLAB)) { if (flags & BIT(PRIVATE)) -- cgit v1.2.3 From 6c287605fd56466e645693eff3ae7c08fba56e0a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:44 -0700 Subject: mm: remember exclusively mapped anonymous pages with PG_anon_exclusive Let's mark exclusively mapped anonymous pages with PG_anon_exclusive as exclusive, and use that information to make GUP pins reliable and stay consistent with the page mapped into the page table even if the page table entry gets write-protected. With that information at hand, we can extend our COW logic to always reuse anonymous pages that are exclusive. For anonymous pages that might be shared, the existing logic applies. As already documented, PG_anon_exclusive is usually only expressive in combination with a page table entry. Especially PTE vs. PMD-mapped anonymous pages require more thought, some examples: due to mremap() we can easily have a single compound page PTE-mapped into multiple page tables exclusively in a single process -- multiple page table locks apply. Further, due to MADV_WIPEONFORK we might not necessarily write-protect all PTEs, and only some subpages might be pinned. Long story short: once PTE-mapped, we have to track information about exclusivity per sub-page, but until then, we can just track it for the compound page in the head page and not having to update a whole bunch of subpages all of the time for a simple PMD mapping of a THP. For simplicity, this commit mostly talks about "anonymous pages", while it's for THP actually "the part of an anonymous folio referenced via a page table entry". To not spill PG_anon_exclusive code all over the mm code-base, we let the anon rmap code to handle all PG_anon_exclusive logic it can easily handle. If a writable, present page table entry points at an anonymous (sub)page, that (sub)page must be PG_anon_exclusive. 
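Stated as a hypothetical sanity check rather than as code added by this patch, the invariant reads:

        /* Sketch of the invariant only; not a check introduced here. */
        if (pte_present(pte) && pte_write(pte) && PageAnon(page))
                VM_BUG_ON_PAGE(!PageAnonExclusive(page), page);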
If GUP wants to take a reliable pin (FOLL_PIN) on an anonymous page referenced via a present page table entry, it must only pin if PG_anon_exclusive is set for the mapped (sub)page. This commit doesn't adjust GUP, so this is only implicitly handled for FOLL_WRITE; follow-up commits will teach GUP to also respect it for FOLL_PIN without FOLL_WRITE, to make all GUP pins of anonymous pages fully reliable. Whenever an anonymous page is to be shared (fork(), KSM), or when temporarily unmapping an anonymous page (swap, migration), the relevant PG_anon_exclusive bit has to be cleared to mark the anonymous page possibly shared. Clearing will fail if there are GUP pins on the page: * For fork(), this means having to copy the page and not being able to share it. fork() protects against concurrent GUP using the PT lock and the src_mm->write_protect_seq. * For KSM, this means sharing will fail. For swap, this means unmapping will fail. For migration, this means migration will fail early. All three cases protect against concurrent GUP using the PT lock and a proper clear/invalidate+flush of the relevant page table entry. This fixes memory corruptions reported for FOLL_PIN | FOLL_WRITE, when a pinned page gets mapped R/O and the successive write fault ends up replacing the page instead of reusing it. It improves the situation for O_DIRECT/vmsplice/... that still use FOLL_GET instead of FOLL_PIN, if fork() is *not* involved; however, swapout and fork() are still problematic. Properly using FOLL_PIN instead of FOLL_GET for these GUP users will fix the issue for them. I. Details about basic handling I.1. Fresh anonymous pages page_add_new_anon_rmap() and hugepage_add_new_anon_rmap() will mark the given page exclusive via __page_set_anon_rmap(exclusive=1). As that is the mechanism by which fresh anonymous pages come into life (besides migration code where we copy the page->mapping), all fresh anonymous pages will start out as exclusive. I.2. COW reuse handling of anonymous pages When a COW handler stumbles over a (sub)page that's marked exclusive, it simply reuses it. Otherwise, the handler tries harder under page lock to detect if the (sub)page is exclusive and can be reused. If exclusive, page_move_anon_rmap() will mark the given (sub)page exclusive. Note that hugetlb code does not yet check for PageAnonExclusive(), as it still uses the old COW logic that is prone to the COW security issue because hugetlb code cannot really tolerate unnecessary/wrong COW as huge pages are a scarce resource. I.3. Migration handling try_to_migrate() has to try marking an exclusive anonymous page shared via page_try_share_anon_rmap(). If it fails because there are GUP pins on the page, unmap fails. migrate_vma_collect_pmd() and __split_huge_pmd_locked() are handled similarly. Writable migration entries implicitly point at shared anonymous pages. For readable migration entries that information is stored via a new "readable-exclusive" migration entry, specific to anonymous pages. When restoring a migration entry in remove_migration_pte(), information about exclusivity is detected via the migration entry type, and RMAP_EXCLUSIVE is set accordingly for page_add_anon_rmap()/hugepage_add_anon_rmap() to restore that information. I.4. Swapout handling try_to_unmap() has to try marking the mapped page possibly shared via page_try_share_anon_rmap(). If it fails because there are GUP pins on the page, unmap fails. For now, information about exclusivity is lost.
In the future, we might want to remember that information in the swap entry in some cases; however, it requires more thought, care, and a way to store that information in swap entries.

I.5. Swapin handling

do_swap_page() will never stumble over exclusive anonymous pages in the swap cache, as try_to_migrate() prohibits that. do_swap_page() always has to detect manually if an anonymous page is exclusive and has to set RMAP_EXCLUSIVE for page_add_anon_rmap() accordingly.

I.6. THP handling

__split_huge_pmd_locked() has to move the information about exclusivity from the PMD to the PTEs.

a) In case we have a readable-exclusive PMD migration entry, simply insert readable-exclusive PTE migration entries.

b) In case we have a present PMD entry and we don't want to freeze ("convert to migration entries"), simply forward PG_anon_exclusive to all sub-pages, no need to temporarily clear the bit.

c) In case we have a present PMD entry and want to freeze, handle it similar to try_to_migrate(): try marking the page shared first. In case we fail, we ignore the "freeze" instruction and simply split ordinarily. try_to_migrate() will properly fail because the THP is still mapped via PTEs.

When splitting a compound anonymous folio (THP), the information about exclusivity is implicitly handled via the migration entries: no need to replicate PG_anon_exclusive manually.

I.7. fork() handling

fork() handling is relatively easy, because PG_anon_exclusive is only expressive for some page table entry types.

a) Present anonymous pages

page_try_dup_anon_rmap() will mark the given subpage shared -- which will fail if the page is pinned. If it failed, we have to copy (or PTE-map a PMD to handle it on the PTE level).

Note that device exclusive entries are just a pointer at a PageAnon() page. fork() will first convert a device exclusive entry to a present page table entry and handle it just like present anonymous pages.

b) Device private entry

Device private entries point at PageAnon() pages that cannot be mapped directly and, therefore, cannot get pinned. page_try_dup_anon_rmap() will mark the given subpage shared, which cannot fail because they cannot get pinned.

c) HW poison entries

PG_anon_exclusive will remain untouched and is stale -- the page table entry is just a placeholder after all.

d) Migration entries

Writable and readable-exclusive entries are converted to readable entries: possibly shared.

I.8. mprotect() handling

mprotect() only has to properly handle the new readable-exclusive migration entry: when write-protecting a migration entry that points at an anonymous page, remember the information about exclusivity via the "readable-exclusive" migration entry type.

II. Migration and GUP-fast

Whenever replacing a present page table entry that maps an exclusive anonymous page by a migration entry, we have to mark the page possibly shared and synchronize against GUP-fast by a proper clear/invalidate+flush to make the following scenario impossible:

1. try_to_migrate() places a migration entry after checking for GUP pins and marks the page possibly shared.

2. GUP-fast pins the page due to lack of synchronization.

3. fork() converts the "writable/readable-exclusive" migration entry into a readable migration entry.

4. Migration fails due to the GUP pin (failing to freeze the refcount).

5. Migration entries are restored. PG_anon_exclusive is lost.

-> We have a pinned page that is not marked exclusive anymore.
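The required ordering, condensed from the migrate_vma_collect_pmd() and set_pmd_migration_entry() hunks below (local variables as in migrate_vma_collect_pmd(); cleanup trimmed):

	/*
	 * Synchronize against GUP-fast: clear and flush the entry *before*
	 * trying to mark the page possibly shared. A racing GUP-fast either
	 * no longer sees a present entry, or its pin is observed by
	 * page_try_share_anon_rmap() and we back off.
	 */
	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	if (page_try_share_anon_rmap(page)) {
		/* The page is pinned: restore the entry and give up. */
		set_pte_at(mm, addr, ptep, pte);
		goto next;
	}
	/* Only now is it safe to install the migration entry. */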
Note that we move information about exclusivity from the page to the migration entry as it otherwise highly overcomplicates fork() and PTE-mapping a THP. III. Swapout and GUP-fast Whenever replacing a present page table entry that maps an exclusive anonymous page by a swap entry, we have to mark the page possibly shared and synchronize against GUP-fast by a proper clear/invalidate+flush to make the following scenario impossible: 1. try_to_unmap() places a swap entry after checking for GUP pins and clears exclusivity information on the page. 2. GUP-fast pins the page due to lack of synchronization. -> We have a pinned page that is not marked exclusive anymore. If we'd ever store information about exclusivity in the swap entry, similar to migration handling, the same considerations as in II would apply. This is future work. Link: https://lkml.kernel.org/r/20220428083441.37290-13-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 40 +++++++++++++++++++++++++ include/linux/swap.h | 15 +++++++--- include/linux/swapops.h | 25 ++++++++++++++++ mm/huge_memory.c | 78 ++++++++++++++++++++++++++++++++++++++++++++----- mm/hugetlb.c | 15 +++++++--- mm/ksm.c | 13 ++++++++- mm/memory.c | 33 ++++++++++++++++----- mm/migrate.c | 14 +++++++-- mm/migrate_device.c | 21 ++++++++++++- mm/mprotect.c | 8 +++-- mm/rmap.c | 61 ++++++++++++++++++++++++++++++++++---- 11 files changed, 289 insertions(+), 34 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e4156921eea9..cbe279a6f0de 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -228,6 +228,13 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, { VM_BUG_ON_PAGE(!PageAnon(page), page); + /* + * No need to check+clear for already shared pages, including KSM + * pages. + */ + if (!PageAnonExclusive(page)) + goto dup; + /* * If this page may have been pinned by the parent process, * don't allow to duplicate the mapping but instead require to e.g., @@ -239,14 +246,47 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, unlikely(page_needs_cow_for_dma(vma, page)))) return -EBUSY; + ClearPageAnonExclusive(page); /* * It's okay to share the anon page between both processes, mapping * the page R/O into both processes. */ +dup: __page_dup_rmap(page, compound); return 0; } +/** + * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly + * shared to prepare for KSM or temporary unmapping + * @page: the exclusive anonymous page to try marking possibly shared + * + * The caller needs to hold the PT lock and has to have the page table entry + * cleared/invalidated+flushed, to properly sync against GUP-fast. + * + * This is similar to page_try_dup_anon_rmap(), however, not used during fork() + * to duplicate a mapping, but instead to prepare for KSM or temporarily + * unmapping a page (swap, migration) via page_remove_rmap(). 
+ * + * Marking the page shared can only fail if the page may be pinned; device + * private pages cannot get pinned and consequently this function cannot fail. + * + * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY + * otherwise. + */ +static inline int page_try_share_anon_rmap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page); + + /* See page_try_dup_anon_rmap(). */ + if (likely(!is_device_private_page(page) && + unlikely(page_maybe_dma_pinned(page)))) + return -EBUSY; + + ClearPageAnonExclusive(page); + return 0; +} + /* * Called from mm/vmscan.c to handle paging out */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 27093b477c5f..e6d70a4156e8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -78,12 +78,19 @@ static inline int current_is_kswapd(void) #endif /* - * NUMA node memory migration support + * Page migration support. + * + * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and + * indicates that the referenced (part of) an anonymous page is exclusive to + * a single process. For SWP_MIGRATION_WRITE, that information is implicit: + * (part of) an anonymous page that are mapped writable are exclusive to a + * single process. */ #ifdef CONFIG_MIGRATION -#define SWP_MIGRATION_NUM 2 -#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) -#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) +#define SWP_MIGRATION_NUM 3 +#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) +#define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) +#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2) #else #define SWP_MIGRATION_NUM 0 #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 5af852b68805..6648b97244e7 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -194,6 +194,7 @@ static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) static inline int is_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ || + swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || swp_type(entry) == SWP_MIGRATION_WRITE); } @@ -202,11 +203,26 @@ static inline int is_writable_migration_entry(swp_entry_t entry) return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); } +static inline int is_readable_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_READ); +} + +static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); +} + static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ, offset); } +static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) +{ + return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset); +} + static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_WRITE, offset); @@ -224,6 +240,11 @@ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) return swp_entry(0, 0); } +static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) +{ + return swp_entry(0, 0); +} + static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); @@ -244,6 +265,10 @@ static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; } +static inline int is_readable_migration_entry(swp_entry_t entry) +{ + return 0; +} #endif diff --git 
a/mm/huge_memory.c b/mm/huge_memory.c index cb5bcd833d9e..231911b7bf9d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1054,7 +1054,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, swp_entry_t entry = pmd_to_swp_entry(pmd); VM_BUG_ON(!is_pmd_migration_entry(pmd)); - if (is_writable_migration_entry(entry)) { + if (!is_readable_migration_entry(entry)) { entry = make_readable_migration_entry( swp_offset(entry)); pmd = swp_entry_to_pmd(entry); @@ -1292,6 +1292,10 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageHead(page), page); + /* Early check when only holding the PT lock. */ + if (PageAnonExclusive(page)) + goto reuse; + if (!trylock_page(page)) { get_page(page); spin_unlock(vmf->ptl); @@ -1306,6 +1310,12 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) put_page(page); } + /* Recheck after temporarily dropping the PT lock. */ + if (PageAnonExclusive(page)) { + unlock_page(page); + goto reuse; + } + /* * See do_wp_page(): we can only map the page writable if there are * no additional references. Note that we always drain the LRU @@ -1319,11 +1329,12 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) pmd_t entry; page_move_anon_rmap(page, vma); + unlock_page(page); +reuse: entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - unlock_page(page); spin_unlock(vmf->ptl); return VM_FAULT_WRITE; } @@ -1708,6 +1719,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); + struct page *page = pfn_swap_entry_to_page(entry); VM_BUG_ON(!is_pmd_migration_entry(*pmd)); if (is_writable_migration_entry(entry)) { @@ -1716,8 +1728,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * A protection check is difficult so * just be safe and disable write */ - entry = make_readable_migration_entry( - swp_offset(entry)); + if (PageAnon(page)) + entry = make_readable_exclusive_migration_entry(swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); @@ -1937,6 +1951,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pgtable_t pgtable; pmd_t old_pmd, _pmd; bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; + bool anon_exclusive = false; unsigned long addr; int i; @@ -2018,6 +2033,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_to_swp_entry(old_pmd); page = pfn_swap_entry_to_page(entry); write = is_writable_migration_entry(entry); + if (PageAnon(page)) + anon_exclusive = is_readable_exclusive_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); @@ -2029,8 +2046,26 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, young = pmd_young(old_pmd); soft_dirty = pmd_soft_dirty(old_pmd); uffd_wp = pmd_uffd_wp(old_pmd); + VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); + + /* + * Without "freeze", we'll simply split the PMD, propagating the + * PageAnonExclusive() flag for each PTE by setting it for + * each subpage -- no need to (temporarily) clear. 
+ * + * With "freeze" we want to replace mapped pages by + * migration entries right away. This is only possible if we + * managed to clear PageAnonExclusive() -- see + * set_pmd_migration_entry(). + * + * In case we cannot clear PageAnonExclusive(), split the PMD + * only and let try_to_migrate_one() fail later. + */ + anon_exclusive = PageAnon(page) && PageAnonExclusive(page); + if (freeze && anon_exclusive && page_try_share_anon_rmap(page)) + freeze = false; } /* @@ -2052,6 +2087,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); + else if (anon_exclusive) + swp_entry = make_readable_exclusive_migration_entry( + page_to_pfn(page + i)); else swp_entry = make_readable_migration_entry( page_to_pfn(page + i)); @@ -2063,6 +2101,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); entry = maybe_mkwrite(entry, vma); + if (anon_exclusive) + SetPageAnonExclusive(page + i); if (!write) entry = pte_wrprotect(entry); if (!young) @@ -2294,6 +2334,13 @@ static void __split_huge_page_tail(struct page *head, int tail, * * After successful get_page_unless_zero() might follow flags change, * for example lock_page() which set PG_waiters. + * + * Note that for mapped sub-pages of an anonymous THP, + * PG_anon_exclusive has been cleared in unmap_page() and is stored in + * the migration entry instead from where remap_page() will restore it. + * We can still have PG_anon_exclusive set on effectively unmapped and + * unreferenced sub-pages of an anonymous THP: we can simply drop + * PG_anon_exclusive (-> PG_mappedtodisk) for these here. */ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -3025,6 +3072,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct vm_area_struct *vma = pvmw->vma; struct mm_struct *mm = vma->vm_mm; unsigned long address = pvmw->address; + bool anon_exclusive; pmd_t pmdval; swp_entry_t entry; pmd_t pmdswp; @@ -3034,10 +3082,19 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = pmdp_invalidate(vma, address, pvmw->pmd); + + anon_exclusive = PageAnon(page) && PageAnonExclusive(page); + if (anon_exclusive && page_try_share_anon_rmap(page)) { + set_pmd_at(mm, address, pvmw->pmd, pmdval); + return; + } + if (pmd_dirty(pmdval)) set_page_dirty(page); if (pmd_write(pmdval)) entry = make_writable_migration_entry(page_to_pfn(page)); + else if (anon_exclusive) + entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); else entry = make_readable_migration_entry(page_to_pfn(page)); pmdswp = swp_entry_to_pmd(entry); @@ -3071,10 +3128,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); - if (PageAnon(new)) - page_add_anon_rmap(new, vma, mmun_start, RMAP_COMPOUND); - else + if (PageAnon(new)) { + rmap_t rmap_flags = RMAP_COMPOUND; + + if (!is_readable_migration_entry(entry)) + rmap_flags |= RMAP_EXCLUSIVE; + + page_add_anon_rmap(new, vma, mmun_start, rmap_flags); + } else { page_add_file_rmap(new, vma, true); + } + VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new)); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); /* No need to invalidate - it was non-present before */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 
03cbb75bcb54..c0012b4a8eec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4790,7 +4790,7 @@ again: is_hugetlb_entry_hwpoisoned(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); - if (is_writable_migration_entry(swp_entry) && cow) { + if (!is_readable_migration_entry(swp_entry) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. @@ -5190,6 +5190,8 @@ retry_avoidcopy: set_huge_ptep_writable(vma, haddr, ptep); return 0; } + VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page), + old_page); /* * If the process that created a MAP_PRIVATE mapping is about to @@ -6187,12 +6189,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, } if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); + struct page *page = pfn_swap_entry_to_page(entry); - if (is_writable_migration_entry(entry)) { + if (!is_readable_migration_entry(entry)) { pte_t newpte; - entry = make_readable_migration_entry( - swp_offset(entry)); + if (PageAnon(page)) + entry = make_readable_exclusive_migration_entry( + swp_offset(entry)); + else + entry = make_readable_migration_entry( + swp_offset(entry)); newpte = swp_entry_to_pte(entry); set_huge_swap_pte_at(mm, address, ptep, newpte, huge_page_size(h)); diff --git a/mm/ksm.c b/mm/ksm.c index 2b6692a7df6a..38360285497a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -872,6 +872,7 @@ static inline struct stable_node *page_stable_node(struct page *page) static inline void set_page_stable_node(struct page *page, struct stable_node *stable_node) { + VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page); page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); } @@ -1044,6 +1045,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, int swapped; int err = -EFAULT; struct mmu_notifier_range range; + bool anon_exclusive; pvmw.address = page_address_in_vma(page, vma); if (pvmw.address == -EFAULT) @@ -1061,9 +1063,10 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; + anon_exclusive = PageAnonExclusive(page); if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || - mm_tlb_flush_pending(mm)) { + anon_exclusive || mm_tlb_flush_pending(mm)) { pte_t entry; swapped = PageSwapCache(page); @@ -1091,6 +1094,12 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } + + if (anon_exclusive && page_try_share_anon_rmap(page)) { + set_pte_at(mm, pvmw.address, pvmw.pte, entry); + goto out_unlock; + } + if (pte_dirty(entry)) set_page_dirty(page); @@ -1149,6 +1158,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pte_unmap_unlock(ptep, ptl); goto out_mn; } + VM_BUG_ON_PAGE(PageAnonExclusive(page), page); + VM_BUG_ON_PAGE(PageAnon(kpage) && PageAnonExclusive(kpage), kpage); /* * No need to check ksm_use_zero_pages here: we can only have a diff --git a/mm/memory.c b/mm/memory.c index 0b0727758c86..454ecc05ad85 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -720,6 +720,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); + VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page))); + /* * No need to take a page reference as one was already * created when the swap entry 
was made. @@ -796,11 +798,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, rss[mm_counter(page)]++; - if (is_writable_migration_entry(entry) && + if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* - * COW mappings require pages in both - * parent and child to be set to read. + * COW mappings require pages in both parent and child + * to be set to read. A previously exclusive entry is + * now shared. */ entry = make_readable_migration_entry( swp_offset(entry)); @@ -951,6 +954,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } + VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page)); /* * If it's a shared mapping, mark it clean in @@ -2949,6 +2953,9 @@ static inline void wp_page_reuse(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; pte_t entry; + + VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page)); + /* * Clear the pages cpupid information as the existing * information potentially belongs to a now completely @@ -3273,6 +3280,13 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (PageAnon(vmf->page)) { struct page *page = vmf->page; + /* + * If the page is exclusive to this process we must reuse the + * page without further checks. + */ + if (PageAnonExclusive(page)) + goto reuse; + /* * We have to verify under page lock: these early checks are * just an optimization to avoid locking the page and freeing @@ -3305,6 +3319,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) */ page_move_anon_rmap(page, vma); unlock_page(page); +reuse: wp_page_reuse(vmf); return VM_FAULT_WRITE; } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == @@ -3696,11 +3711,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * that are certainly not shared because we just allocated them without * exposing them to the swapcache. 
*/ - if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) && - (page != swapcache || page_count(page) == 1)) { - pte = maybe_mkwrite(pte_mkdirty(pte), vma); - vmf->flags &= ~FAULT_FLAG_WRITE; - ret |= VM_FAULT_WRITE; + if (!PageKsm(page) && (page != swapcache || page_count(page) == 1)) { + if (vmf->flags & FAULT_FLAG_WRITE) { + pte = maybe_mkwrite(pte_mkdirty(pte), vma); + vmf->flags &= ~FAULT_FLAG_WRITE; + ret |= VM_FAULT_WRITE; + } rmap_flags |= RMAP_EXCLUSIVE; } flush_icache_page(vma, page); @@ -3720,6 +3736,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page_add_anon_rmap(page, vma, vmf->address, rmap_flags); } + VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page))); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); diff --git a/mm/migrate.c b/mm/migrate.c index 92e932f08be5..b2678279eb43 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -177,6 +177,7 @@ static bool remove_migration_pte(struct folio *folio, DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); while (page_vma_mapped_walk(&pvmw)) { + rmap_t rmap_flags = RMAP_NONE; pte_t pte; swp_entry_t entry; struct page *new; @@ -211,6 +212,9 @@ static bool remove_migration_pte(struct folio *folio, else if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_mkuffd_wp(pte); + if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) + rmap_flags |= RMAP_EXCLUSIVE; + if (unlikely(is_device_private_page(new))) { if (pte_write(pte)) entry = make_writable_device_private_entry( @@ -233,7 +237,7 @@ static bool remove_migration_pte(struct folio *folio, pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (folio_test_anon(folio)) hugepage_add_anon_rmap(new, vma, pvmw.address, - RMAP_NONE); + rmap_flags); else page_dup_file_rmap(new, true); set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); @@ -242,7 +246,7 @@ static bool remove_migration_pte(struct folio *folio, { if (folio_test_anon(folio)) page_add_anon_rmap(new, vma, pvmw.address, - RMAP_NONE); + rmap_flags); else page_add_file_rmap(new, vma, false); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); @@ -514,6 +518,12 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_set_workingset(newfolio); if (folio_test_checked(folio)) folio_set_checked(newfolio); + /* + * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via + * migration entries. We can still have PG_anon_exclusive set on an + * effectively unmapped and unreferenced first sub-pages of an + * anonymous THP: we can simply copy it here via PG_mappedtodisk. + */ if (folio_test_mappedtodisk(folio)) folio_set_mappedtodisk(newfolio); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index fb6d7d5499f5..5052093d0262 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -184,15 +184,34 @@ again: * set up a special migration page table entry now. 
*/ if (trylock_page(page)) { + bool anon_exclusive; pte_t swp_pte; + anon_exclusive = PageAnon(page) && PageAnonExclusive(page); + if (anon_exclusive) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush(vma, addr, ptep); + + if (page_try_share_anon_rmap(page)) { + set_pte_at(mm, addr, ptep, pte); + unlock_page(page); + put_page(page); + mpfn = 0; + goto next; + } + } else { + ptep_get_and_clear(mm, addr, ptep); + } + migrate->cpages++; - ptep_get_and_clear(mm, addr, ptep); /* Setup special migration page table entry */ if (mpfn & MIGRATE_PFN_WRITE) entry = make_writable_migration_entry( page_to_pfn(page)); + else if (anon_exclusive) + entry = make_readable_exclusive_migration_entry( + page_to_pfn(page)); else entry = make_readable_migration_entry( page_to_pfn(page)); diff --git a/mm/mprotect.c b/mm/mprotect.c index b69ce7a7b2b7..56060acdabd3 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -152,6 +152,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pages++; } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); + struct page *page = pfn_swap_entry_to_page(entry); pte_t newpte; if (is_writable_migration_entry(entry)) { @@ -159,8 +160,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * A protection check is difficult so * just be safe and disable write */ - entry = make_readable_migration_entry( - swp_offset(entry)); + if (PageAnon(page)) + entry = make_readable_exclusive_migration_entry( + swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); diff --git a/mm/rmap.c b/mm/rmap.c index 90f92c53476f..0d63e7ce35cc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1088,6 +1088,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; + struct page *subpage = page; page = compound_head(page); @@ -1101,6 +1102,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) * folio_test_anon()) will not see one without the other. */ WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); + SetPageAnonExclusive(subpage); } /** @@ -1118,7 +1120,7 @@ static void __page_set_anon_rmap(struct page *page, BUG_ON(!anon_vma); if (PageAnon(page)) - return; + goto out; /* * If the page isn't exclusively mapped into this vma, @@ -1137,6 +1139,9 @@ static void __page_set_anon_rmap(struct page *page, anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); page->index = linear_page_index(vma, address); +out: + if (exclusive) + SetPageAnonExclusive(page); } /** @@ -1198,6 +1203,8 @@ void page_add_anon_rmap(struct page *page, } else { first = atomic_inc_and_test(&page->_mapcount); } + VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); + VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) { int nr = compound ? 
thp_nr_pages(page) : 1; @@ -1459,7 +1466,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; struct page *subpage; - bool ret = true; + bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; @@ -1515,6 +1522,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, subpage = folio_page(folio, pte_pfn(*pvmw.pte) - folio_pfn(folio)); address = pvmw.address; + anon_exclusive = folio_test_anon(folio) && + PageAnonExclusive(subpage); if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* @@ -1550,9 +1559,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } } - /* Nuke the page table entry. */ + /* + * Nuke the page table entry. When having to clear + * PageAnonExclusive(), we always have to flush. + */ flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); - if (should_defer_flush(mm, flags)) { + if (should_defer_flush(mm, flags) && !anon_exclusive) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. @@ -1677,6 +1689,24 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } + if (anon_exclusive && + page_try_share_anon_rmap(subpage)) { + swap_free(entry); + set_pte_at(mm, address, pvmw.pte, pteval); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + /* + * Note: We *don't* remember yet if the page was mapped + * exclusively in the swap entry, so swapin code has + * to re-determine that manually and might detect the + * page as possibly shared, for example, if there are + * other references on the page or if the page is under + * writeback. We made sure that there are no GUP pins + * on the page that would rely on it, so for GUP pins + * this is fine. + */ if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) @@ -1776,7 +1806,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; struct page *subpage; - bool ret = true; + bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; @@ -1837,6 +1867,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, subpage = folio_page(folio, pte_pfn(*pvmw.pte) - folio_pfn(folio)); address = pvmw.address; + anon_exclusive = folio_test_anon(folio) && + PageAnonExclusive(subpage); if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* @@ -1888,6 +1920,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, swp_entry_t entry; pte_t swp_pte; + if (anon_exclusive) + BUG_ON(page_try_share_anon_rmap(subpage)); + /* * Store the pfn of the page in a special migration * pte. 
do_swap_page() will wait until the migration @@ -1896,6 +1931,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, entry = pte_to_swp_entry(pteval); if (is_writable_device_private_entry(entry)) entry = make_writable_migration_entry(pfn); + else if (anon_exclusive) + entry = make_readable_exclusive_migration_entry(pfn); else entry = make_readable_migration_entry(pfn); swp_pte = swp_entry_to_pte(entry); @@ -1960,6 +1997,15 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } + VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) && + !anon_exclusive, subpage); + if (anon_exclusive && + page_try_share_anon_rmap(subpage)) { + set_pte_at(mm, address, pvmw.pte, pteval); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } /* * Store the pfn of the page in a special migration @@ -1969,6 +2015,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (pte_write(pteval)) entry = make_writable_migration_entry( page_to_pfn(subpage)); + else if (anon_exclusive) + entry = make_readable_exclusive_migration_entry( + page_to_pfn(subpage)); else entry = make_readable_migration_entry( page_to_pfn(subpage)); @@ -2405,6 +2454,8 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, BUG_ON(!anon_vma); /* address might be in next vma when migration races vma_adjust */ first = atomic_inc_and_test(compound_mapcount_ptr(page)); + VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); + VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) __page_set_anon_rmap(page, vma, address, !!(flags & RMAP_EXCLUSIVE)); -- cgit v1.2.3 From 7f5abe609b3dcbc62a36e18b1437f4f3521ecb75 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:44 -0700 Subject: mm/rmap: fail try_to_migrate() early when setting a PMD migration entry fails Let's fail right away in case we cannot clear PG_anon_exclusive because the anon THP may be pinned. Right now, we continue trying to install migration entries and the caller of try_to_migrate() will realize that the page is still mapped and has to restore the migration entries. Let's just fail fast just like for PTE migration entries. Link: https://lkml.kernel.org/r/20220428083441.37290-14-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/swapops.h | 4 ++-- mm/huge_memory.c | 8 +++++--- mm/rmap.c | 6 +++++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6648b97244e7..e476a8fed537 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -299,7 +299,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry) struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION -extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, +extern int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page); extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, @@ -332,7 +332,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd) return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } #else -static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, +static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { BUILD_BUG(); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 231911b7bf9d..14d7fa6dc793 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3066,7 +3066,7 @@ late_initcall(split_huge_pages_debugfs); #endif #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION -void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, +int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { struct vm_area_struct *vma = pvmw->vma; @@ -3078,7 +3078,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, pmd_t pmdswp; if (!(pvmw->pmd && !pvmw->pte)) - return; + return 0; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = pmdp_invalidate(vma, address, pvmw->pmd); @@ -3086,7 +3086,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive && page_try_share_anon_rmap(page)) { set_pmd_at(mm, address, pvmw->pmd, pmdval); - return; + return -EBUSY; } if (pmd_dirty(pmdval)) @@ -3104,6 +3104,8 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, page_remove_rmap(page, vma, true); put_page(page); trace_set_migration_pmd(address, pmd_val(pmdswp)); + + return 0; } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) diff --git a/mm/rmap.c b/mm/rmap.c index 0d63e7ce35cc..f96cc7eb23ec 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1856,7 +1856,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); - set_pmd_migration_entry(&pvmw, subpage); + if (set_pmd_migration_entry(&pvmw, subpage)) { + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } continue; } #endif -- cgit v1.2.3 From 8909691b6c5a84b67573b23ee8bb917b005628f0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:44 -0700 Subject: mm/gup: disallow follow_page(FOLL_PIN) We want to change the way we handle R/O pins on anonymous pages that might be shared: if we detect a possibly shared anonymous page -- mapped R/O and not !PageAnonExclusive() -- we want to trigger unsharing via a page fault, resulting in an exclusive anonymous page that can be pinned reliably 
without getting replaced via COW on the next write fault. However, the required page fault will be problematic for follow_page(): in contrast to ordinary GUP, follow_page() doesn't trigger faults internally. So we would have to end up failing a R/O pin via follow_page(), although there is something mapped R/O into the page table, which might be rather surprising. We don't seem to have follow_page(FOLL_PIN) users, and it's a purely internal MM function. Let's just make our life easier and the semantics of follow_page() clearer by just disallowing FOLL_PIN for follow_page() completely. Link: https://lkml.kernel.org/r/20220428083441.37290-15-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/gup.c | 3 +++ mm/hugetlb.c | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index f598a037eb04..f424abf5e792 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -787,6 +787,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, if (vma_is_secretmem(vma)) return NULL; + if (foll_flags & FOLL_PIN) + return NULL; + page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c0012b4a8eec..e0308a44ba16 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6719,9 +6719,11 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, spinlock_t *ptl; pte_t pte; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) + /* + * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via + * follow_hugetlb_page(). + */ + if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; retry: -- cgit v1.2.3 From c89357e27f20dda3fff6791d27bb6c91eae99f4a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:45 -0700 Subject: mm: support GUP-triggered unsharing of anonymous pages Whenever GUP currently ends up taking a R/O pin on an anonymous page that might be shared -- mapped R/O and !PageAnonExclusive() -- any write fault on the page table entry will end up replacing the mapped anonymous page due to COW, resulting in the GUP pin no longer being consistent with the page actually mapped into the page table. The possible ways to deal with this situation are: (1) Ignore and pin -- what we do right now. (2) Fail to pin -- which would be rather surprising to callers and could break user space. (3) Trigger unsharing and pin the now exclusive page -- reliable R/O pins. We want to implement 3) because it provides the clearest semantics and allows for checking in unpin_user_pages() and friends for possible BUGs: when trying to unpin a page that's no longer exclusive, clearly something went very wrong and might result in memory corruptions that might be hard to debug. So we better have a nice way to spot such issues. To implement 3), we need a way for GUP to trigger unsharing: FAULT_FLAG_UNSHARE. 
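In a nutshell, the fault entry points route FAULT_FLAG_UNSHARE through the existing write-fault handlers; condensed from the handle_pte_fault() hunk below:

	if (vmf->flags & (FAULT_FLAG_WRITE | FAULT_FLAG_UNSHARE)) {
		if (!pte_write(entry))
			return do_wp_page(vmf);		/* COW or unshare */
		else if (likely(vmf->flags & FAULT_FLAG_WRITE))
			entry = pte_mkdirty(entry);	/* only writes dirty */
	}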
FAULT_FLAG_UNSHARE is only applicable to R/O mapped anonymous pages and resembles COW logic during a write fault. However, in contrast to a write fault, GUP-triggered unsharing will, for example, still maintain the write protection. Let's implement FAULT_FLAG_UNSHARE by hooking into the existing write fault handlers for all applicable anonymous page types: ordinary pages, THP and hugetlb. * If FAULT_FLAG_UNSHARE finds a R/O-mapped anonymous page that has been marked exclusive in the meantime by someone else, there is nothing to do. * If FAULT_FLAG_UNSHARE finds a R/O-mapped anonymous page that's not marked exclusive, it will try detecting if the process is the exclusive owner. If exclusive, it can be set exclusive similar to reuse logic during write faults via page_move_anon_rmap() and there is nothing else to do; otherwise, we either have to copy and map a fresh, anonymous exclusive page R/O (ordinary pages, hugetlb), or split the THP. This commit is heavily based on patches by Andrea. Link: https://lkml.kernel.org/r/20220428083441.37290-16-david@redhat.com Signed-off-by: Andrea Arcangeli Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Co-developed-by: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 ++++ mm/huge_memory.c | 10 ++++- mm/hugetlb.c | 56 +++++++++++++++---------- mm/memory.c | 107 ++++++++++++++++++++++++++++++++--------------- 4 files changed, 126 insertions(+), 55 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 87eddd509de2..7216d77a5884 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -819,6 +819,9 @@ typedef struct { * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to unshare (and mark + * exclusive) a possibly shared anonymous page that is + * mapped R/O. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two @@ -838,6 +841,10 @@ typedef struct { * continuous faults with flags (b). We should always try to detect pending * signals before a retry to make sure the continuous page faults can still be * interrupted if necessary. + * + * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. + * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when + * no existing R/O-mapped anonymous page is encountered. 
*/ enum fault_flag { FAULT_FLAG_WRITE = 1 << 0, @@ -850,6 +857,7 @@ enum fault_flag { FAULT_FLAG_REMOTE = 1 << 7, FAULT_FLAG_INSTRUCTION = 1 << 8, FAULT_FLAG_INTERRUPTIBLE = 1 << 9, + FAULT_FLAG_UNSHARE = 1 << 10, }; #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 14d7fa6dc793..17ec6c939e7f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1271,6 +1271,7 @@ unlock: vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -1279,6 +1280,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); + VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); + VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); + if (is_huge_zero_pmd(orig_pmd)) goto fallback; @@ -1317,7 +1321,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } /* - * See do_wp_page(): we can only map the page writable if there are + * See do_wp_page(): we can only reuse the page exclusively if there are * no additional references. Note that we always drain the LRU * pagevecs immediately after adding a THP. */ @@ -1331,6 +1335,10 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) page_move_anon_rmap(page, vma); unlock_page(page); reuse: + if (unlikely(unshare)) { + spin_unlock(vmf->ptl); + return 0; + } entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e0308a44ba16..9cefa3dd9af3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5162,15 +5162,16 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Hugetlb_cow() should be called with page lock of the original hugepage held. + * hugetlb_wp() should be called with page lock of the original hugepage held. * Called with hugetlb_fault_mutex_table held and pte_page locked so we * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, +static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int flags, struct page *pagecache_page, spinlock_t *ptl) { + const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; @@ -5179,15 +5180,22 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; + VM_BUG_ON(unshare && (flags & FOLL_WRITE)); + VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); + pte = huge_ptep_get(ptep); old_page = pte_page(pte); retry_avoidcopy: - /* If no-one else is actually using this page, avoid the copy - * and just make the page writable */ + /* + * If no-one else is actually using this page, we're the exclusive + * owner and can reuse this page. 
+ */ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { - page_move_anon_rmap(old_page, vma); - set_huge_ptep_writable(vma, haddr, ptep); + if (!PageAnonExclusive(old_page)) + page_move_anon_rmap(old_page, vma); + if (likely(!unshare)) + set_huge_ptep_writable(vma, haddr, ptep); return 0; } VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page), @@ -5290,13 +5298,13 @@ retry_avoidcopy: if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearHPageRestoreReserve(new_page); - /* Break COW */ + /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(old_page, vma, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, 1)); + make_huge_pte(vma, new_page, !unshare)); SetHPageMigratable(new_page); /* Make the old page be freed below */ new_page = old_page; @@ -5304,7 +5312,10 @@ retry_avoidcopy: spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: - /* No restore in case of successful pagetable update (Break COW) */ + /* + * No restore in case of successful pagetable update (Break COW or + * unshare) + */ if (new_page != old_page) restore_reserve_on_error(h, vma, haddr, new_page); put_page(new_page); @@ -5429,7 +5440,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, /* * Currently, we are forced to kill the process in the event the * original mapper has unmapped pages from the child due to a failed - * COW. Warn that such a situation has occurred as it may not be obvious + * COW/unsharing. Warn that such a situation has occurred as it may not + * be obvious. */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", @@ -5555,7 +5567,7 @@ retry: hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); } spin_unlock(ptl); @@ -5685,14 +5697,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_mutex; /* - * If we are going to COW the mapping later, we examine the pending - * reservations for this page now. This will ensure that any + * If we are going to COW/unshare the mapping later, we examine the + * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the * spinlock. For private mappings, we also lookup the pagecache * page now as it is used to determine if a reservation has been * consumed. */ - if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && + !huge_pte_write(entry)) { if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto out_mutex; @@ -5707,12 +5720,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptl = huge_pte_lock(h, mm, ptep); - /* Check for a racing update before calling hugetlb_cow */ + /* Check for a racing update before calling hugetlb_wp() */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) goto out_ptl; /* - * hugetlb_cow() requires page locks of pte_page(entry) and + * hugetlb_wp() requires page locks of pte_page(entry) and * pagecache_page, so here we need take the former one * when page != pagecache_page or !pagecache_page. 
*/ @@ -5725,13 +5738,14 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, get_page(page); - if (flags & FAULT_FLAG_WRITE) { + if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, - pagecache_page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, + pagecache_page, ptl); goto out_put_page; + } else if (likely(flags & FAULT_FLAG_WRITE)) { + entry = huge_pte_mkdirty(entry); } - entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, diff --git a/mm/memory.c b/mm/memory.c index 454ecc05ad85..a75040a47fcc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2745,8 +2745,8 @@ static inline int pte_unmap_same(struct vm_fault *vmf) return same; } -static inline bool cow_user_page(struct page *dst, struct page *src, - struct vm_fault *vmf) +static inline bool __wp_page_copy_user(struct page *dst, struct page *src, + struct vm_fault *vmf) { bool ret; void *kaddr; @@ -2954,6 +2954,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) struct page *page = vmf->page; pte_t entry; + VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page)); /* @@ -2974,7 +2975,8 @@ static inline void wp_page_reuse(struct vm_fault *vmf) } /* - * Handle the case of a page which we actually need to copy to a new page. + * Handle the case of a page which we actually need to copy to a new page, + * either due to COW or unsharing. * * Called with mmap_lock locked and the old page referenced, but * without the ptl held. @@ -2991,6 +2993,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) */ static vm_fault_t wp_page_copy(struct vm_fault *vmf) { + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct page *old_page = vmf->page; @@ -3013,7 +3016,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (!new_page) goto oom; - if (!cow_user_page(new_page, old_page, vmf)) { + if (!__wp_page_copy_user(new_page, old_page, vmf)) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on @@ -3055,7 +3058,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (unlikely(unshare)) { + if (pte_soft_dirty(vmf->orig_pte)) + entry = pte_mksoft_dirty(entry); + if (pte_uffd_wp(vmf->orig_pte)) + entry = pte_mkuffd_wp(entry); + } else { + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + } /* * Clear the pte entry and flush it first, before updating the @@ -3072,6 +3082,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ + BUG_ON(unshare && pte_write(entry)); set_pte_at_notify(mm, vmf->address, vmf->pte, entry); update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { @@ -3121,7 +3132,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) free_swap_cache(old_page); put_page(old_page); } - return page_copied ? VM_FAULT_WRITE : 0; + return (page_copied && !unshare) ? 
VM_FAULT_WRITE : 0; oom_free_new: put_page(new_page); oom: @@ -3221,18 +3232,22 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) } /* - * This routine handles present pages, when users try to write - * to a shared page. It is done by copying the page to a new address - * and decrementing the shared-page counter for the old page. + * This routine handles present pages, when + * * users try to write to a shared page (FAULT_FLAG_WRITE) + * * GUP wants to take a R/O pin on a possibly shared anonymous page + * (FAULT_FLAG_UNSHARE) + * + * It is done by copying the page to a new address and decrementing the + * shared-page counter for the old page. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). - * Thus we can safely just mark it writable once we've done any necessary - * COW. + * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've + * done any necessary COW. * - * We also mark the page dirty at this point even though the page will - * change only once the write actually happens. This avoids a few races, - * and potentially makes it more efficient. + * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even + * though the page will change only once the write actually happens. This + * avoids a few races, and potentially makes it more efficient. * * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), with pte both mapped and locked. @@ -3241,23 +3256,35 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; - if (userfaultfd_pte_wp(vma, *vmf->pte)) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return handle_userfault(vmf, VM_UFFD_WP); - } + VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); + VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); - /* - * Userfaultfd write-protect can defer flushes. Ensure the TLB - * is flushed in this case before copying. - */ - if (unlikely(userfaultfd_wp(vmf->vma) && - mm_tlb_flush_pending(vmf->vma->vm_mm))) - flush_tlb_page(vmf->vma, vmf->address); + if (likely(!unshare)) { + if (userfaultfd_pte_wp(vma, *vmf->pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_WP); + } + + /* + * Userfaultfd write-protect can defer flushes. Ensure the TLB + * is flushed in this case before copying. + */ + if (unlikely(userfaultfd_wp(vmf->vma) && + mm_tlb_flush_pending(vmf->vma->vm_mm))) + flush_tlb_page(vmf->vma, vmf->address); + } vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (!vmf->page) { + if (unlikely(unshare)) { + /* No anonymous page -> nothing to do. */ + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; + } + /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. @@ -3320,8 +3347,16 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) page_move_anon_rmap(page, vma); unlock_page(page); reuse: + if (unlikely(unshare)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; + } wp_page_reuse(vmf); return VM_FAULT_WRITE; + } else if (unshare) { + /* No anonymous page -> nothing to do. 
*/ + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { return wp_page_shared(vmf); @@ -4523,8 +4558,11 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; + if (vma_is_anonymous(vmf->vma)) { - if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) + if (likely(!unshare) && + userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) return handle_userfault(vmf, VM_UFFD_WP); return do_huge_pmd_wp_page(vmf); } @@ -4659,10 +4697,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } - if (vmf->flags & FAULT_FLAG_WRITE) { + if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!pte_write(entry)) return do_wp_page(vmf); - entry = pte_mkdirty(entry); + else if (likely(vmf->flags & FAULT_FLAG_WRITE)) + entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, @@ -4703,7 +4742,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), }; - unsigned int dirty = flags & FAULT_FLAG_WRITE; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; @@ -4728,9 +4766,11 @@ retry_pud: barrier(); if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { - /* NUMA case for anonymous PUDs would go here */ - - if (dirty && !pud_write(orig_pud)) { + /* + * TODO once we support anonymous PUDs: NUMA case and + * FAULT_FLAG_UNSHARE handling. + */ + if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4768,7 +4808,8 @@ retry_pud: if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf); - if (dirty && !pmd_write(vmf.orig_pmd)) { + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && + !pmd_write(vmf.orig_pmd)) { ret = wp_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; -- cgit v1.2.3 From a7f226604170acd6b142b76472c1a49c12ebb83d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:45 -0700 Subject: mm/gup: trigger FAULT_FLAG_UNSHARE when R/O-pinning a possibly shared anonymous page Whenever GUP currently ends up taking a R/O pin on an anonymous page that might be shared -- mapped R/O and !PageAnonExclusive() -- any write fault on the page table entry will end up replacing the mapped anonymous page due to COW, resulting in the GUP pin no longer being consistent with the page actually mapped into the page table. The possible ways to deal with this situation are: (1) Ignore and pin -- what we do right now. (2) Fail to pin -- which would be rather surprising to callers and could break user space. (3) Trigger unsharing and pin the now exclusive page -- reliable R/O pins. Let's implement 3) because it provides the clearest semantics and allows for checking in unpin_user_pages() and friends for possible BUGs: when trying to unpin a page that's no longer exclusive, clearly something went very wrong and might result in memory corruptions that might be hard to debug. So we better have a nice way to spot such issues. 
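For illustration, a minimal sketch of approach 3) from a pinning user's point of view (not part of this patch; the function name and the address "addr" are made up). With FAULT_FLAG_UNSHARE wired up, an anonymous page returned by an R/O pin is exclusive, so a later write fault in the process reuses that same page rather than replacing it behind the pin via COW:

#include <linux/mm.h>

/*
 * Minimal sketch, not part of this patch: take a short-term R/O pin on
 * one user page at a hypothetical address "addr".
 */
static int ro_pin_one_page(unsigned long addr)
{
        struct page *page;
        int nr;

        /* gup_flags == 0: read-only; FOLL_PIN is implied by the API. */
        nr = pin_user_pages_fast(addr & PAGE_MASK, 1, 0, &page);
        if (nr != 1)
                return nr < 0 ? nr : -EFAULT;

        /* ... read the page contents, e.g. via kmap_local_page() ... */

        unpin_user_page(page);
        return 0;
}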
This change implies that whenever user space *wrote* to a private mapping (IOW, we have an anonymous page mapped), that GUP pins will always remain consistent: reliable R/O GUP pins of anonymous pages. As a side note, this commit fixes the COW security issue for hugetlb with FOLL_PIN as documented in: https://lore.kernel.org/r/3ae33b08-d9ef-f846-56fb-645e3b9b4c66@redhat.com The vmsplice reproducer still applies, because vmsplice uses FOLL_GET instead of FOLL_PIN. Note that follow_huge_pmd() doesn't apply because we cannot end up in there with FOLL_PIN. This commit is heavily based on prototype patches by Andrea. Link: https://lkml.kernel.org/r/20220428083441.37290-17-david@redhat.com Signed-off-by: Andrea Arcangeli Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Co-developed-by: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 39 +++++++++++++++++++++++++++++++++++++++ mm/gup.c | 42 +++++++++++++++++++++++++++++++++++++++--- mm/huge_memory.c | 3 +++ mm/hugetlb.c | 27 ++++++++++++++++++++++++--- 4 files changed, 105 insertions(+), 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9ee3ae51e8e3..5dc5005ccc9b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3002,6 +3002,45 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) return 0; } +/* + * Indicates for which pages that are write-protected in the page table, + * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the + * GUP pin will remain consistent with the pages mapped into the page tables + * of the MM. + * + * Temporary unmapping of PageAnonExclusive() pages or clearing of + * PageAnonExclusive() has to protect against concurrent GUP: + * * Ordinary GUP: Using the PT lock + * * GUP-fast and fork(): mm->write_protect_seq + * * GUP-fast and KSM or temporary unmapping (swap, migration): + * clear/invalidate+flush of the page table entry + * + * Must be called with the (sub)page that's actually referenced via the + * page table entry, which might not necessarily be the head page for a + * PTE-mapped THP. + */ +static inline bool gup_must_unshare(unsigned int flags, struct page *page) +{ + /* + * FOLL_WRITE is implicitly handled correctly as the page table entry + * has to be writable -- and if it references (part of) an anonymous + * folio, that part is required to be marked exclusive. + */ + if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) + return false; + /* + * Note: PageAnon(page) is stable until the page is actually getting + * freed. + */ + if (!PageAnon(page)) + return false; + /* + * Note that PageKsm() pages cannot be exclusive, and consequently, + * cannot get pinned. 
+ */ + return !PageAnonExclusive(page); +} + typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); diff --git a/mm/gup.c b/mm/gup.c index f424abf5e792..4d089d467e58 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -506,6 +506,10 @@ retry: } } + if (!pte_write(pte) && gup_must_unshare(flags, page)) { + page = ERR_PTR(-EMLINK); + goto out; + } /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ if (unlikely(!try_grab_page(page, flags))) { page = ERR_PTR(-ENOMEM); @@ -732,6 +736,11 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches * the device's dev_pagemap metadata to avoid repeating expensive lookups. * + * When getting an anonymous page and the caller has to trigger unsharing + * of a shared anonymous page first, -EMLINK is returned. The caller should + * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only + * relevant with FOLL_PIN and !FOLL_WRITE. + * * On output, the @ctx->page_mask is set according to the size of the page. * * Return: the mapped (struct page *), %NULL if no mapping exists, or @@ -855,7 +864,8 @@ unmap: * is, *@locked will be set to 0 and -EBUSY returned. */ static int faultin_page(struct vm_area_struct *vma, - unsigned long address, unsigned int *flags, int *locked) + unsigned long address, unsigned int *flags, bool unshare, + int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; @@ -877,6 +887,11 @@ static int faultin_page(struct vm_area_struct *vma, */ fault_flags |= FAULT_FLAG_TRIED; } + if (unshare) { + fault_flags |= FAULT_FLAG_UNSHARE; + /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ + VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); + } ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_ERROR) { @@ -1098,8 +1113,9 @@ retry: cond_resched(); page = follow_page_mask(vma, start, foll_flags, &ctx); - if (!page) { - ret = faultin_page(vma, start, &foll_flags, locked); + if (!page || PTR_ERR(page) == -EMLINK) { + ret = faultin_page(vma, start, &foll_flags, + PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: goto retry; @@ -2201,6 +2217,11 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, goto pte_unmap; } + if (!pte_write(pte) && gup_must_unshare(flags, page)) { + gup_put_folio(folio, 1, flags); + goto pte_unmap; + } + /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). 
Please @@ -2381,6 +2402,11 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } + if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) { + gup_put_folio(folio, refs, flags); + return 0; + } + *nr += refs; folio_set_referenced(folio); return 1; @@ -2442,6 +2468,11 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } + if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) { + gup_put_folio(folio, refs, flags); + return 0; + } + *nr += refs; folio_set_referenced(folio); return 1; @@ -2477,6 +2508,11 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } + if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) { + gup_put_folio(folio, refs, flags); + return 0; + } + *nr += refs; folio_set_referenced(folio); return 1; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 17ec6c939e7f..4414111998df 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1389,6 +1389,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) + return ERR_PTR(-EMLINK); + if (!try_grab_page(page, flags)) return ERR_PTR(-ENOMEM); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9cefa3dd9af3..5f8d3a77a2ec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5982,6 +5982,25 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, } } +static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, + bool *unshare) +{ + pte_t pteval = huge_ptep_get(pte); + + *unshare = false; + if (is_swap_pte(pteval)) + return true; + if (huge_pte_write(pteval)) + return false; + if (flags & FOLL_WRITE) + return true; + if (gup_must_unshare(flags, pte_page(pteval))) { + *unshare = true; + return true; + } + return false; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, @@ -5996,6 +6015,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, while (vaddr < vma->vm_end && remainder) { pte_t *pte; spinlock_t *ptl = NULL; + bool unshare = false; int absent; struct page *page; @@ -6046,9 +6066,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * both cases, and because we can't follow correct pages * directly from any kind of swap entries. 
*/ - if (absent || is_swap_pte(huge_ptep_get(pte)) || - ((flags & FOLL_WRITE) && - !huge_pte_write(huge_ptep_get(pte)))) { + if (absent || + __follow_hugetlb_must_fault(flags, pte, &unshare)) { vm_fault_t ret; unsigned int fault_flags = 0; @@ -6056,6 +6075,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; + else if (unshare) + fault_flags |= FAULT_FLAG_UNSHARE; if (locked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; -- cgit v1.2.3 From b6a2619c60b41a929bbb9c09f193d690d707b1af Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:45 -0700 Subject: mm/gup: sanity-check with CONFIG_DEBUG_VM that anonymous pages are exclusive when (un)pinning Let's verify when (un)pinning anonymous pages that we always deal with exclusive anonymous pages, which guarantees that we'll have a reliable PIN, meaning that we cannot end up with the GUP pin being inconsistent with he pages mapped into the page tables due to a COW triggered by a write fault. When pinning pages, after conditionally triggering GUP unsharing of possibly shared anonymous pages, we should always only see exclusive anonymous pages. Note that anonymous pages that are mapped writable must be marked exclusive, otherwise we'd have a BUG. When pinning during ordinary GUP, simply add a check after our conditional GUP-triggered unsharing checks. As we know exactly how the page is mapped, we know exactly in which page we have to check for PageAnonExclusive(). When pinning via GUP-fast we have to be careful, because we can race with fork(): verify only after we made sure via the seqcount that we didn't race with concurrent fork() that we didn't end up pinning a possibly shared anonymous page. Similarly, when unpinning, verify that the pages are still marked as exclusive: otherwise something turned the pages possibly shared, which can result in random memory corruptions, which we really want to catch. With only the pinned pages at hand and not the actual page table entries we have to be a bit careful: hugetlb pages are always mapped via a single logical page table entry referencing the head page and PG_anon_exclusive of the head page applies. Anon THP are a bit more complicated, because we might have obtained the page reference either via a PMD or a PTE -- depending on the mapping type we either have to check PageAnonExclusive of the head page (PMD-mapped THP) or the tail page (PTE-mapped THP) applies: as we don't know and to make our life easier, check that either is set. Take care to not verify in case we're unpinning during GUP-fast because we detected concurrent fork(): we might stumble over an anonymous page that is now shared. Link: https://lkml.kernel.org/r/20220428083441.37290-18-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/gup.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- mm/huge_memory.c | 3 +++ mm/hugetlb.c | 3 +++ 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 4d089d467e58..46ffd8c51c6e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -29,6 +29,39 @@ struct follow_page_context { unsigned int page_mask; }; +static inline void sanity_check_pinned_pages(struct page **pages, + unsigned long npages) +{ + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + + /* + * We only pin anonymous pages if they are exclusive. Once pinned, we + * can no longer turn them possibly shared and PageAnonExclusive() will + * stick around until the page is freed. + * + * We'd like to verify that our pinned anonymous pages are still mapped + * exclusively. The issue with anon THP is that we don't know how + * they are/were mapped when pinning them. However, for anon + * THP we can assume that either the given page (PTE-mapped THP) or + * the head page (PMD-mapped THP) should be PageAnonExclusive(). If + * neither is the case, there is certainly something wrong. + */ + for (; npages; npages--, pages++) { + struct page *page = *pages; + struct folio *folio = page_folio(page); + + if (!folio_test_anon(folio)) + continue; + if (!folio_test_large(folio) || folio_test_hugetlb(folio)) + VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page); + else + /* Either a PTE-mapped or a PMD-mapped THP. */ + VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) && + !PageAnonExclusive(page), page); + } +} + /* * Return the folio with ref appropriately incremented, * or NULL if that failed. @@ -204,6 +237,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags) */ void unpin_user_page(struct page *page) { + sanity_check_pinned_pages(&page, 1); gup_put_folio(page_folio(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page); @@ -272,6 +306,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, return; } + sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); /* @@ -344,6 +379,23 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, } EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); +static void unpin_user_pages_lockless(struct page **pages, unsigned long npages) +{ + unsigned long i; + struct folio *folio; + unsigned int nr; + + /* + * Don't perform any sanity checks because we might have raced with + * fork() and some anonymous pages might now actually be shared -- + * which is why we're unpinning after all. + */ + for (i = 0; i < npages; i += nr) { + folio = gup_folio_next(pages, npages, i, &nr); + gup_put_folio(folio, nr, FOLL_PIN); + } +} + /** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. 
@@ -367,6 +419,7 @@ void unpin_user_pages(struct page **pages, unsigned long npages) if (WARN_ON(IS_ERR_VALUE(npages))) return; + sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); @@ -510,6 +563,10 @@ retry: page = ERR_PTR(-EMLINK); goto out; } + + VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); + /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ if (unlikely(!try_grab_page(page, flags))) { page = ERR_PTR(-ENOMEM); @@ -2750,8 +2807,10 @@ static unsigned long lockless_pages_from_mm(unsigned long start, */ if (gup_flags & FOLL_PIN) { if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) { - unpin_user_pages(pages, nr_pinned); + unpin_user_pages_lockless(pages, nr_pinned); return 0; + } else { + sanity_check_pinned_pages(pages, nr_pinned); } } return nr_pinned; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4414111998df..a2f44d8d3d47 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1392,6 +1392,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) return ERR_PTR(-EMLINK); + VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); + if (!try_grab_page(page, flags)) return ERR_PTR(-ENOMEM); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5f8d3a77a2ec..4454c42a82f0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6118,6 +6118,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); + VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); + /* * If subpage information not requested, update counters * and skip the same_page loop below. -- cgit v1.2.3 From 1493a1913e34b0ac366e33f9ebad721e69fd06ac Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:45 -0700 Subject: mm/swap: remember PG_anon_exclusive via a swp pte bit Patch series "mm: COW fixes part 3: reliable GUP R/W FOLL_GET of anonymous pages", v2. This series fixes memory corruptions when a GUP R/W reference (FOLL_WRITE | FOLL_GET) was taken on an anonymous page and COW logic fails to detect exclusivity of the page to then replacing the anonymous page by a copy in the page table: The GUP reference lost synchronicity with the pages mapped into the page tables. This series focuses on x86, arm64, s390x and ppc64/book3s -- other architectures are fairly easy to support by implementing __HAVE_ARCH_PTE_SWP_EXCLUSIVE. This primarily fixes the O_DIRECT memory corruptions that can happen on concurrent swapout, whereby we lose DMA reads to a page (modifying the user page by writing to it). O_DIRECT currently uses FOLL_GET for short-term (!FOLL_LONGTERM) DMA from/to a user page. In the long run, we want to convert it to properly use FOLL_PIN, and John is working on it, but that might take a while and might not be easy to backport. In the meantime, let's restore what used to work before we started modifying our COW logic: make R/W FOLL_GET references reliable as long as there is no fork() after GUP involved. This is just the natural follow-up of part 2, that will also further reduce "wrong COW" on the swapin path, for example, when we cannot remove a page from the swapcache due to concurrent writeback, or if we have two threads faulting on the same swapped-out page. 
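For background on the FOLL_GET vs. FOLL_PIN distinction used throughout this cover letter, a minimal sketch (not from this series; the function name, the address "uaddr" and the batch size are illustrative) contrasting the two reference flavours a DMA user can take on user memory -- O_DIRECT effectively takes the first today, and the long-term plan mentioned above is to convert such users to the second:

#include <linux/mm.h>

/* Sketch only, not from this series. */
static void gup_flavours_sketch(unsigned long uaddr)
{
        struct page *pages[8];  /* illustrative batch size */
        int i, nr;

        /*
         * FOLL_GET flavour (what O_DIRECT effectively takes today):
         * references are dropped with put_page().
         */
        nr = get_user_pages_fast(uaddr, 8, FOLL_WRITE, pages);
        for (i = 0; i < nr; i++)
                put_page(pages[i]);

        /*
         * FOLL_PIN flavour (what DMA users are meant to migrate to):
         * pins are dropped with unpin_user_pages().
         */
        nr = pin_user_pages_fast(uaddr, 8, FOLL_WRITE, pages);
        if (nr > 0)
                unpin_user_pages(pages, nr);
}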
Fixing O_DIRECT is just a nice side-product This issue, including other related COW issues, has been summarized in [3] under 2): " 2. Intra Process Memory Corruptions due to Wrong COW (FOLL_GET) It was discovered that we can create a memory corruption by reading a file via O_DIRECT to a part (e.g., first 512 bytes) of a page, concurrently writing to an unrelated part (e.g., last byte) of the same page, and concurrently write-protecting the page via clear_refs SOFTDIRTY tracking [6]. For the reproducer, the issue is that O_DIRECT grabs a reference of the target page (via FOLL_GET) and clear_refs write-protects the relevant page table entry. On successive write access to the page from the process itself, we wrongly COW the page when resolving the write fault, resulting in a loss of synchronicity and consequently a memory corruption. While some people might think that using clear_refs in this combination is a corner cases, it turns out to be a more generic problem unfortunately. For example, it was just recently discovered that we can similarly create a memory corruption without clear_refs, simply by concurrently swapping out the buffer pages [7]. Note that we nowadays even use the swap infrastructure in Linux without an actual swap disk/partition: the prime example is zram which is enabled as default under Fedora [10]. The root issue is that a write-fault on a page that has additional references results in a COW and thereby a loss of synchronicity and consequently a memory corruption if two parties believe they are referencing the same page. " We don't particularly care about R/O FOLL_GET references: they were never reliable and O_DIRECT doesn't expect to observe modifications from a page after DMA was started. Note that: * this only fixes the issue on x86, arm64, s390x and ppc64/book3s ("enterprise architectures"). Other architectures have to implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE to achieve the same. * this does *not * consider any kind of fork() after taking the reference: fork() after GUP never worked reliably with FOLL_GET. * Not losing PG_anon_exclusive during swapout was the last remaining piece. KSM already makes sure that there are no other references on a page before considering it for sharing. Page migration maintains PG_anon_exclusive and simply fails when there are additional references (freezing the refcount fails). Only swapout code dropped the PG_anon_exclusive flag because it requires more work to remember + restore it. With this series in place, most COW issues of [3] are fixed on said architectures. Other architectures can implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE fairly easily. [1] https://lkml.kernel.org/r/20220329160440.193848-1-david@redhat.com [2] https://lkml.kernel.org/r/20211217113049.23850-1-david@redhat.com [3] https://lore.kernel.org/r/3ae33b08-d9ef-f846-56fb-645e3b9b4c66@redhat.com This patch (of 8): Currently, we clear PG_anon_exclusive in try_to_unmap() and forget about it. We do this, to keep fork() logic on swap entries easy and efficient: for example, if we wouldn't clear it when unmapping, we'd have to lookup the page in the swapcache for each and every swap entry during fork() and clear PG_anon_exclusive if set. Instead, we want to store that information directly in the swap pte, protected by the page table lock, similarly to how we handle SWP_MIGRATION_READ_EXCLUSIVE for migration entries. However, for actual swap entries, we don't want to mess with the swap type (e.g., still one bit) because it overcomplicates swap code. 
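Concretely, the per-architecture opt-in this approach relies on boils down to one define plus three helpers. A generic sketch of that contract (illustrative only: _PAGE_SWP_EXCLUSIVE stands for whatever spare software bit an architecture can afford in its swap ptes; the real x86, arm64, s390 and powerpc definitions follow later in this series):

/*
 * Sketch of the per-architecture contract, to be provided in the
 * architecture's asm/pgtable.h; illustrative only.
 */
#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
static inline pte_t pte_swp_mkexclusive(pte_t pte)
{
        return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE);
}

static inline int pte_swp_exclusive(pte_t pte)
{
        return !!(pte_val(pte) & _PAGE_SWP_EXCLUSIVE);
}

static inline pte_t pte_swp_clear_exclusive(pte_t pte)
{
        return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE);
}

Architectures that don't opt in keep using the no-op stubs added to include/linux/pgtable.h below, in which case the exclusive marker is simply never remembered across swapout.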
In try_to_unmap(), we already reject to unmap in case the page might be pinned, because we must not lose PG_anon_exclusive on pinned pages ever. Checking if there are other unexpected references reliably *before* completely unmapping a page is unfortunately not really possible: THP heavily overcomplicate the situation. Once fully unmapped it's easier -- we, for example, make sure that there are no unexpected references *after* unmapping a page before starting writeback on that page. So, we currently might end up unmapping a page and clearing PG_anon_exclusive if that page has additional references, for example, due to a FOLL_GET. do_swap_page() has to re-determine if a page is exclusive, which will easily fail if there are other references on a page, most prominently GUP references via FOLL_GET. This can currently result in memory corruptions when taking a FOLL_GET | FOLL_WRITE reference on a page even when fork() is never involved: try_to_unmap() will succeed, and when refaulting the page, it cannot be marked exclusive and will get replaced by a copy in the page tables on the next write access, resulting in writes via the GUP reference to the page being lost. In an ideal world, everybody that uses GUP and wants to modify page content, such as O_DIRECT, would properly use FOLL_PIN. However, that conversion will take a while. It's easier to fix what used to work in the past (FOLL_GET | FOLL_WRITE) remembering PG_anon_exclusive. In addition, by remembering PG_anon_exclusive we can further reduce unnecessary COW in some cases, so it's the natural thing to do. So let's transfer the PG_anon_exclusive information to the swap pte and store it via an architecture-dependant pte bit; use that information when restoring the swap pte in do_swap_page() and unuse_pte(). During fork(), we simply have to clear the pte bit and are done. Of course, there is one corner case to handle: swap backends that don't support concurrent page modifications while the page is under writeback. Special case these, and drop the exclusive marker. Add a comment why that is just fine (also, reuse_swap_page() would have done the same in the past). In the future, we'll hopefully have all architectures support __HAVE_ARCH_PTE_SWP_EXCLUSIVE, such that we can get rid of the empty stubs and the define completely. Then, we can also convert SWP_MIGRATION_READ_EXCLUSIVE. For architectures it's fairly easy to support: either simply use a yet unused pte bit that can be used for swap entries, steal one from the arch type bits if they exceed 5, or steal one from the offset bits. Note: R/O FOLL_GET references were never really reliable, especially when taking one on a shared page and then writing to the page (e.g., GUP after fork()). FOLL_GET, including R/W references, were never really reliable once fork was involved (e.g., GUP before fork(), GUP during fork()). KSM steps back in case it stumbles over unexpected references and is, therefore, fine. [david@redhat.com: fix SWP_STABLE_WRITES test] Link: https://lkml.kernel.org/r/ac725bcb-313a-4fff-250a-68ba9a8f85fb@redhat.comLink: https://lkml.kernel.org/r/20220329164329.208407-1-david@redhat.com Link: https://lkml.kernel.org/r/20220329164329.208407-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Shakeel Butt Cc: John Hubbard Cc: Jason Gunthorpe Cc: Mike Kravetz Cc: Mike Rapoport Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox (Oracle) Cc: Jann Horn Cc: Michal Hocko Cc: Nadav Amit Cc: Rik van Riel Cc: Roman Gushchin Cc: Andrea Arcangeli Cc: Peter Xu Cc: Don Dutile Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: Jan Kara Cc: Liang Zhang Cc: Pedro Demarchi Gomes Cc: Oded Gabbay Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Gerald Schaefer Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 29 ++++++++++++++++++++++++++ include/linux/swapops.h | 2 ++ mm/memory.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---- mm/rmap.c | 19 ++++++++++------- mm/swapfile.c | 13 +++++++++++- 5 files changed, 105 insertions(+), 13 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f4f4077b97aa..53750224e176 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1003,6 +1003,35 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif +/* + * When replacing an anonymous page by a real (!non) swap entry, we clear + * PG_anon_exclusive from the page and instead remember whether the flag was + * set in the swp pte. During fork(), we have to mark the entry as !exclusive + * (possibly shared). On swapin, we use that information to restore + * PG_anon_exclusive, which is very helpful in cases where we might have + * additional (e.g., FOLL_GET) references on a page and wouldn't be able to + * detect exclusivity. + * + * These functions don't apply to non-swap entries (e.g., migration, hwpoison, + * ...). + */ +#ifndef __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return pte; +} + +static inline int pte_swp_exclusive(pte_t pte) +{ + return false; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return pte; +} +#endif + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index e476a8fed537..d1b728904d4e 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -26,6 +26,8 @@ /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { + if (pte_swp_exclusive(pte)) + pte = pte_swp_clear_exclusive(pte); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); if (pte_swp_uffd_wp(pte)) diff --git a/mm/memory.c b/mm/memory.c index a75040a47fcc..bca60092b4d5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -792,6 +792,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } + /* Mark the swap entry as shared. 
*/ + if (pte_swp_exclusive(*src_pte)) { + pte = pte_swp_clear_exclusive(*src_pte); + set_pte_at(src_mm, addr, src_pte, pte); + } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { page = pfn_swap_entry_to_page(entry); @@ -3563,6 +3568,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct page *page = NULL, *swapcache; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; + bool exclusive = false; swp_entry_t entry; pte_t pte; int locked; @@ -3728,6 +3734,46 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); BUG_ON(PageAnon(page) && PageAnonExclusive(page)); + /* + * Check under PT lock (to protect against concurrent fork() sharing + * the swap entry concurrently) for certainly exclusive pages. + */ + if (!PageKsm(page)) { + /* + * Note that pte_swp_exclusive() == false for architectures + * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. + */ + exclusive = pte_swp_exclusive(vmf->orig_pte); + if (page != swapcache) { + /* + * We have a fresh page that is not exposed to the + * swapcache -> certainly exclusive. + */ + exclusive = true; + } else if (exclusive && PageWriteback(page) && + (swp_swap_info(entry)->flags & SWP_STABLE_WRITES)) { + /* + * This is tricky: not all swap backends support + * concurrent page modifications while under writeback. + * + * So if we stumble over such a page in the swapcache + * we must not set the page exclusive, otherwise we can + * map it writable without further checks and modify it + * while still under writeback. + * + * For these problematic swap backends, simply drop the + * exclusive marker: this is perfectly fine as we start + * writeback only if we fully unmapped the page and + * there are no unexpected references on the page after + * unmapping succeeded. After fully unmapped, no + * further GUP references (FOLL_GET and FOLL_PIN) can + * appear, so dropping the exclusive marker and mapping + * it only R/O is fine. + */ + exclusive = false; + } + } + /* * Remove the swap entry and conditionally try to free up the swapcache. * We're already holding a reference on the page but haven't mapped it @@ -3742,11 +3788,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte = mk_pte(page, vma->vm_page_prot); /* - * Same logic as in do_wp_page(); however, optimize for fresh pages - * that are certainly not shared because we just allocated them without - * exposing them to the swapcache. + * Same logic as in do_wp_page(); however, optimize for pages that are + * certainly not shared either because we just allocated them without + * exposing them to the swapcache or because the swap entry indicates + * exclusivity. */ - if (!PageKsm(page) && (page != swapcache || page_count(page) == 1)) { + if (!PageKsm(page) && (exclusive || page_count(page) == 1)) { if (vmf->flags & FAULT_FLAG_WRITE) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; diff --git a/mm/rmap.c b/mm/rmap.c index f96cc7eb23ec..86ed2865210d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1698,14 +1698,15 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } /* - * Note: We *don't* remember yet if the page was mapped - * exclusively in the swap entry, so swapin code has - * to re-determine that manually and might detect the - * page as possibly shared, for example, if there are - * other references on the page or if the page is under - * writeback. We made sure that there are no GUP pins - * on the page that would rely on it, so for GUP pins - * this is fine. 
+ * Note: We *don't* remember if the page was mapped + * exclusively in the swap pte if the architecture + * doesn't support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. In + * that case, swapin code has to re-determine that + * manually and might detect the page as possibly + * shared, for example, if there are other references on + * the page or if the page is under writeback. We made + * sure that there are no GUP pins on the page that + * would rely on it, so for GUP pins this is fine. */ if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); @@ -1716,6 +1717,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, dec_mm_counter(mm, MM_ANONPAGES); inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); + if (anon_exclusive) + swp_pte = pte_swp_mkexclusive(swp_pte); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) diff --git a/mm/swapfile.c b/mm/swapfile.c index a7847324d476..7279b2d2d71d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1804,7 +1804,18 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr, RMAP_NONE); + rmap_t rmap_flags = RMAP_NONE; + + /* + * See do_swap_page(): PageWriteback() would be problematic. + * However, we do a wait_on_page_writeback() just before this + * call and have the page locked. + */ + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (pte_swp_exclusive(*pte)) + rmap_flags |= RMAP_EXCLUSIVE; + + page_add_anon_rmap(page, vma, addr, rmap_flags); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); -- cgit v1.2.3 From 210d1e8af42df0fc35aee953124798f743432fab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:45 -0700 Subject: mm/debug_vm_pgtable: add tests for __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's test that __HAVE_ARCH_PTE_SWP_EXCLUSIVE works as expected. Link: https://lkml.kernel.org/r/20220329164329.208407-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 30fd11a2ed32..1ab091f49fc0 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -837,6 +837,19 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { } static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) +{ +#ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + + pr_debug("Validating PTE swap exclusive\n"); + pte = pte_swp_mkexclusive(pte); + WARN_ON(!pte_swp_exclusive(pte)); + pte = pte_swp_clear_exclusive(pte); + WARN_ON(pte_swp_exclusive(pte)); +#endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */ +} + static void __init pte_swap_tests(struct pgtable_debug_args *args) { swp_entry_t swp; @@ -1295,6 +1308,8 @@ static int __init debug_vm_pgtable(void) pte_swap_soft_dirty_tests(&args); pmd_swap_soft_dirty_tests(&args); + pte_swap_exclusive_tests(&args); + pte_swap_tests(&args); pmd_swap_tests(&args); -- cgit v1.2.3 From 3e20889cfbee1e9716544a14c5d23be598412ddf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:46 -0700 Subject: x86/pgtable: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's use bit 3 to remember PG_anon_exclusive in swap ptes. [david@redhat.com: fix 32-bit swap layout] Link: https://lkml.kernel.org/r/d875c292-46b3-f281-65ae-71d0b0c6f592@redhat.com Link: https://lkml.kernel.org/r/20220329164329.208407-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 17 +++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 4 +++- arch/x86/include/asm/pgtable_64_types.h | 5 +++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3563f4645fa1..1e09519af143 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1286,6 +1286,23 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } +#ifdef _PAGE_SWP_EXCLUSIVE +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); +} + +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE); +} +#endif /* _PAGE_SWP_EXCLUSIVE */ #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 56d0399a0cd1..e479491da8d5 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -186,7 +186,7 @@ static inline void native_pgd_clear(pgd_t *pgd) * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| E|F|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -203,6 +203,8 @@ static inline void native_pgd_clear(pgd_t *pgd) * F (2) in swp entry is used to record when a pagetable is * writeprotected by userfaultfd WP support. * + * E (3) in swp entry is used to rememeber PG_anon_exclusive. + * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. * diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 91ac10654570..70e360a2e5fb 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -163,4 +163,9 @@ extern unsigned int ptrs_per_p4d; #define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) +/* + * We borrow bit 3 to remember PG_anon_exclusive. + */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PWT + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ -- cgit v1.2.3 From 570ef363509b031966ed669fa002c8441dff273c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:46 -0700 Subject: arm64/pgtable: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's use one of the type bits: core-mm only supports 5, so there is no need to consume 6. Note that we might be able to reuse bit 1, but reusing bit 1 turned out problematic in the past for PROT_NONE handling; so let's play safe and use another bit. 
Link: https://lkml.kernel.org/r/20220329164329.208407-5-david@redhat.com Reviewed-by: Catalin Marinas Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable-prot.h | 1 + arch/arm64/include/asm/pgtable.h | 23 ++++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index b1e1b74d993c..62e0ebeed720 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -14,6 +14,7 @@ * Software defined PTE bits definition. */ #define PTE_WRITE (PTE_DBM) /* same as DBM (51) */ +#define PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 2) /* only for swp ptes */ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) #define PTE_DEVMAP (_AT(pteval_t, 1) << 57) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index dff2b483ea50..3278c6c0d873 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -402,6 +402,22 @@ static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot) return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT); } +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); +} + +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & PTE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); +} + #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/linux/pgtable.h @@ -909,12 +925,13 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, /* * Encode and decode a swap entry: * bits 0-1: present (must be zero) - * bits 2-7: swap type + * bits 2: remember PG_anon_exclusive + * bits 3-7: swap type * bits 8-57: swap offset * bit 58: PTE_PROT_NONE (must be zero) */ -#define __SWP_TYPE_SHIFT 2 -#define __SWP_TYPE_BITS 6 +#define __SWP_TYPE_SHIFT 3 +#define __SWP_TYPE_BITS 5 #define __SWP_OFFSET_BITS 50 #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) #define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) -- cgit v1.2.3 From 8043d26c46596ef2e7875e48a9536fb6df020813 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:46 -0700 Subject: s390/pgtable: cleanup description of swp pte layout Bit 52 and bit 55 don't have to be zero: they only trigger a translation-specifiation exception if the PTE is marked as valid, which is not the case for swap ptes. Document which bits are used for what, and which ones are unused. 
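A note on reading the bit pattern in the updated comment: s390 numbers pte bits from the most significant end, so mask 0x200 corresponds to bit 54 and mask 0x001 to bit 63. "(pte & 0x201) == 0x200" therefore means that bit 54 is set while the present bit 63 is clear, the combination reserved for swap ptes (assuming the usual s390 masks, where 0x200 is _PAGE_PROTECT and 0x001 is _PAGE_PRESENT).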
Link: https://lkml.kernel.org/r/20220329164329.208407-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgtable.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 9df679152620..3982575bb586 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1712,18 +1712,17 @@ static inline int has_transparent_hugepage(void) /* * 64 bit swap entry format: * A page-table entry has some bits we have to treat in a special way. - * Bits 52 and bit 55 have to be zero, otherwise a specification - * exception will occur instead of a page translation exception. The - * specification exception has the bad habit not to store necessary - * information in the lowcore. - * Bits 54 and 63 are used to indicate the page type. + * Bits 54 and 63 are used to indicate the page type. Bit 53 marks the pte + * as invalid. * A swap pte is indicated by bit pattern (pte & 0x201) == 0x200 - * This leaves the bits 0-51 and bits 56-62 to store type and offset. - * We use the 5 bits from 57-61 for the type and the 52 bits from 0-51 - * for the offset. - * | offset |01100|type |00| + * | offset |X11XX|type |S0| * |0000000000111111111122222222223333333333444444444455|55555|55566|66| * |0123456789012345678901234567890123456789012345678901|23456|78901|23| + * + * Bits 0-51 store the offset. + * Bits 57-61 store the type. + * Bit 62 (S) is used for softdirty tracking. + * Bits 52, 55 and 56 (X) are unused. */ #define __SWP_OFFSET_MASK ((1UL << 52) - 1) -- cgit v1.2.3 From 92cd58bd2566de905b347025b9408a55134956b7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:46 -0700 Subject: s390/pgtable: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's use bit 52, which is unused. Link: https://lkml.kernel.org/r/20220329164329.208407-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgtable.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 3982575bb586..a397b072a580 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -181,6 +181,8 @@ static inline int is_module_addr(void *addr) #define _PAGE_SOFT_DIRTY 0x000 #endif +#define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE /* SW pte exclusive swap bit */ + /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \ _PAGE_YOUNG | _PAGE_SOFT_DIRTY) @@ -826,6 +828,22 @@ static inline int pmd_protnone(pmd_t pmd) } #endif +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE)); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE)); +} + static inline int pte_soft_dirty(pte_t pte) { return pte_val(pte) & _PAGE_SOFT_DIRTY; @@ -1715,14 +1733,15 @@ static inline int has_transparent_hugepage(void) * Bits 54 and 63 are used to indicate the page type. Bit 53 marks the pte * as invalid. * A swap pte is indicated by bit pattern (pte & 0x201) == 0x200 - * | offset |X11XX|type |S0| + * | offset |E11XX|type |S0| * |0000000000111111111122222222223333333333444444444455|55555|55566|66| * |0123456789012345678901234567890123456789012345678901|23456|78901|23| * * Bits 0-51 store the offset. + * Bit 52 (E) is used to remember PG_anon_exclusive. * Bits 57-61 store the type. * Bit 62 (S) is used for softdirty tracking. - * Bits 52, 55 and 56 (X) are unused. + * Bits 55 and 56 (X) are unused. */ #define __SWP_OFFSET_MASK ((1UL << 52) - 1) -- cgit v1.2.3 From 03ac1b71fca171315dbbc3f9318e3cd2a210541e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:46 -0700 Subject: powerpc/pgtable: remove _PAGE_BIT_SWAP_TYPE for book3s The swap type is simply stored in bits 0x1f of the swap pte. Let's simplify by just getting rid of _PAGE_BIT_SWAP_TYPE. It's not like that we can simply change it: _PAGE_SWP_SOFT_DIRTY would suddenly fall into _RPAGE_RSV1, which isn't possible and would make the BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY) angry. While at it, make it clearer which bit we're actually using for _PAGE_SWP_SOFT_DIRTY by just using the proper define and introduce and use SWP_TYPE_MASK. Link: https://lkml.kernel.org/r/20220329164329.208407-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/pgtable.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 875730d5af40..8e98375d5c4a 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -13,7 +13,6 @@ /* * Common bits between hash and Radix page table */ -#define _PAGE_BIT_SWAP_TYPE 0 #define _PAGE_EXEC 0x00001 /* execute permission */ #define _PAGE_WRITE 0x00002 /* write access allowed */ @@ -751,17 +750,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * Don't have overlapping bits with _PAGE_HPTEFLAGS \ * We filter HPTEFLAGS on set_pte. \ */ \ - BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & SWP_TYPE_MASK); \ BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ } while (0) #define SWP_TYPE_BITS 5 -#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ - & ((1UL << SWP_TYPE_BITS) - 1)) +#define SWP_TYPE_MASK ((1UL << SWP_TYPE_BITS) - 1) +#define __swp_type(x) ((x).val & SWP_TYPE_MASK) #define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << _PAGE_BIT_SWAP_TYPE) \ - | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)}) + (type) | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)}) /* * swp_entry_t must be independent of pte bits. We build a swp_entry_t from * swap type and offset we get from swap and convert that to pte to find a @@ -774,7 +772,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_entry_to_pmd(x) (pte_pmd(__swp_entry_to_pte(x))) #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) +#define _PAGE_SWP_SOFT_DIRTY _PAGE_NON_IDEMPOTENT #else #define _PAGE_SWP_SOFT_DIRTY 0UL #endif /* CONFIG_MEM_SOFT_DIRTY */ -- cgit v1.2.3 From bff9beaa2e8039d42dd60c27d0f0e2c586a6cb6b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:47 -0700 Subject: powerpc/pgtable: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE for book3s Right now, the last 5 bits (0x1f) of the swap entry are used for the type and the bit before that (0x20) is used for _PAGE_SWP_SOFT_DIRTY. We cannot use 0x40, as that collides with _RPAGE_RSV1 -- contained in _PAGE_HPTEFLAGS. The next candidate would be _RPAGE_SW3 (0x200) -- which is used for _PAGE_SOFT_DIRTY for !swp ptes. So let's just use _PAGE_SOFT_DIRTY for _PAGE_SWP_SOFT_DIRTY (to make it easier to grasp) and use 0x20 now for _PAGE_SWP_EXCLUSIVE. Link: https://lkml.kernel.org/r/20220329164329.208407-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/pgtable.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8e98375d5c4a..eecff2036869 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -752,6 +752,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ \ BUILD_BUG_ON(_PAGE_HPTEFLAGS & SWP_TYPE_MASK); \ BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_EXCLUSIVE); \ } while (0) #define SWP_TYPE_BITS 5 @@ -772,11 +773,13 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_entry_to_pmd(x) (pte_pmd(__swp_entry_to_pte(x))) #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SWP_SOFT_DIRTY _PAGE_NON_IDEMPOTENT +#define _PAGE_SWP_SOFT_DIRTY _PAGE_SOFT_DIRTY #else #define _PAGE_SWP_SOFT_DIRTY 0UL #endif /* CONFIG_MEM_SOFT_DIRTY */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_NON_IDEMPOTENT + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { @@ -794,6 +797,22 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_EXCLUSIVE)); +} + +static inline int pte_swp_exclusive(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_SWP_EXCLUSIVE)); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SWP_EXCLUSIVE)); +} + static inline bool check_pte_access(unsigned long access, unsigned long ptev) { /* -- cgit v1.2.3 From 0768c8de1b74b0f177d5f16c00b1459e92837d26 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 9 May 2022 18:20:47 -0700 Subject: mm/gup: fix comments to pin_user_pages_*() pin_user_pages API forces FOLL_PIN in gup_flags, which means that the API requires struct page **pages to be provided (not NULL). However, the comment to pin_user_pages() clearly allows for passing in a NULL @pages argument. Remove the incorrect comments, and add WARN_ON_ONCE(!pages) calls to enforce the API. 
It has been independently spotted by Minchan Kim and confirmed with John Hubbard: https://lore.kernel.org/all/YgWA0ghrrzHONehH@google.com/ Link: https://lkml.kernel.org/r/20220422015839.1274328-1-yury.norov@gmail.com Signed-off-by: Yury Norov (NVIDIA) Reviewed-by: John Hubbard Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/gup.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 46ffd8c51c6e..2e07cff3b31b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2969,6 +2969,9 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; + if (WARN_ON_ONCE(!pages)) + return -EINVAL; + gup_flags |= FOLL_PIN; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } @@ -2991,6 +2994,9 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages, */ if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return 0; + + if (WARN_ON_ONCE(!pages)) + return 0; /* * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. @@ -3018,8 +3024,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. + * Should be at least nr_pages long. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * @locked: pointer to lock flag indicating whether lock is held and @@ -3042,6 +3047,9 @@ long pin_user_pages_remote(struct mm_struct *mm, if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; + if (WARN_ON_ONCE(!pages)) + return -EINVAL; + gup_flags |= FOLL_PIN; return __get_user_pages_remote(mm, start, nr_pages, gup_flags, pages, vmas, locked); @@ -3055,8 +3063,7 @@ EXPORT_SYMBOL(pin_user_pages_remote); * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. + * Should be at least nr_pages long. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * @@ -3074,6 +3081,9 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; + if (WARN_ON_ONCE(!pages)) + return -EINVAL; + gup_flags |= FOLL_PIN; return __gup_longterm_locked(current->mm, start, nr_pages, pages, vmas, gup_flags); @@ -3092,6 +3102,9 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; + if (WARN_ON_ONCE(!pages)) + return -EINVAL; + gup_flags |= FOLL_PIN; return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } -- cgit v1.2.3 From 17de1e559cf1eb01d5d90afd3064d5a280060f6f Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Mon, 9 May 2022 18:20:47 -0700 Subject: selftests: clarify common error when running gup_test The gup_test binary will fail showing only the output of perror("open") in the case that /sys/kernel/debug/gup_test is not found. This will almost always be due to CONFIG_GUP_TEST not being set, which enables compilation of a kernel that provides this file. Add a short error message to clarify this failure and point the user to the solution. 
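For illustration, the guard described here could look roughly like the sketch below (a sketch only, not the literal hunk from the patch; GUP_TEST_FILE and gup_fd are the selftest's existing symbols, while the exact message text and the use of fprintf()/exit(1) are assumptions, and the snippet relies on the headers the selftest already includes):

	gup_fd = open(GUP_TEST_FILE, O_RDWR);
	if (gup_fd == -1) {
		perror("open");
		/* most common cause: kernel built without CONFIG_GUP_TEST */
		fprintf(stderr, "%s not found: was the kernel built with CONFIG_GUP_TEST?\n",
			GUP_TEST_FILE);
		exit(1);
	}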
Link: https://lkml.kernel.org/r/20220502224942.995427-1-jsavitz@redhat.com Signed-off-by: Joel Savitz Cc: Shuah Khan Cc: Nico Pache Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/gup_test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 593262555e18..6bb36ca71cb5 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -21,6 +21,8 @@ #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ +#define GUP_TEST_FILE "/sys/kernel/debug/gup_test" + static unsigned long cmd = GUP_FAST_BENCHMARK; static int gup_fd, repeats = 1; static unsigned long size = 128 * MB; -- cgit v1.2.3 From 014bb1de4fc17d54907d54418126a9a9736f4aff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:47 -0700 Subject: mm: create new mm/swap.h header file Patch series "MM changes to improve swap-over-NFS support". Assorted improvements for swap-via-filesystem. This is a resend of these patches, rebased on current HEAD. The only substantial change is that swap_dirty_folio has replaced swap_set_page_dirty. Currently swap-via-fs (SWP_FS_OPS) doesn't work for any filesystem. It previously worked for NFS, but that broke a few releases back. This series changes swap-over-filesystem to use a new ->swap_rw method rather than ->readpage and ->direct_IO. It also makes other improvements. There is a companion series already in linux-next which fixes various issues with NFS. Once both series land, a final patch is needed which changes NFS over to use ->swap_rw. This patch (of 10): Many functions declared in include/linux/swap.h are only used within mm/. Create a new "mm/swap.h" and move some of these declarations there. Remove the redundant 'extern' from the function declarations.
[akpm@linux-foundation.org: mm/memory-failure.c needs mm/swap.h] Link: https://lkml.kernel.org/r/164859751830.29473.5309689752169286816.stgit@noble.brown Link: https://lkml.kernel.org/r/164859778120.29473.11725907882296224053.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Trond Myklebust Cc: Hugh Dickins Cc: Mel Gorman Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/swap.h | 121 ---------------------------------------------- mm/huge_memory.c | 1 + mm/madvise.c | 1 + mm/memcontrol.c | 1 + mm/memory-failure.c | 1 + mm/memory.c | 1 + mm/mincore.c | 1 + mm/page_alloc.c | 1 + mm/page_io.c | 1 + mm/shmem.c | 1 + mm/swap.h | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/swap_state.c | 1 + mm/swapfile.c | 1 + mm/util.c | 1 + mm/vmscan.c | 1 + mm/zswap.c | 2 + 16 files changed, 148 insertions(+), 121 deletions(-) create mode 100644 mm/swap.h diff --git a/include/linux/swap.h b/include/linux/swap.h index e6d70a4156e8..def67112dce4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -427,62 +427,19 @@ extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP -#include /* for bio_end_io_t */ - -/* linux/mm/page_io.c */ -extern int swap_readpage(struct page *page, bool do_poll); -extern int swap_writepage(struct page *page, struct writeback_control *wbc); -extern void end_swap_bio_write(struct bio *bio); -extern int __swap_writepage(struct page *page, struct writeback_control *wbc, - bio_end_io_t end_write_func); bool swap_dirty_folio(struct address_space *mapping, struct folio *folio); - int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); -/* linux/mm/swap_state.c */ -/* One swap address space for each 64M swap space */ -#define SWAP_ADDRESS_SPACE_SHIFT 14 -#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) -extern struct address_space *swapper_spaces[]; -#define swap_address_space(entry) \ - (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ - >> SWAP_ADDRESS_SPACE_SHIFT]) static inline unsigned long total_swapcache_pages(void) { return global_node_page_state(NR_SWAPCACHE); } -extern void show_swap_cache_info(void); -extern int add_to_swap(struct page *page); -extern void *get_shadow_from_swap_cache(swp_entry_t entry); -extern int add_to_swap_cache(struct page *page, swp_entry_t entry, - gfp_t gfp, void **shadowp); -extern void __delete_from_swap_cache(struct page *page, - swp_entry_t entry, void *shadow); -extern void delete_from_swap_cache(struct page *); -extern void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end); -extern void free_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); -extern struct page *lookup_swap_cache(swp_entry_t entry, - struct vm_area_struct *vma, - unsigned long addr); -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); -extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, - struct vm_area_struct *vma, unsigned long addr, - bool do_poll); -extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, - struct vm_area_struct *vma, unsigned long addr, - bool *new_page_allocated); -extern struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); -extern struct page *swapin_readahead(swp_entry_t 
entry, gfp_t flag, - struct vm_fault *vmf); - /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; @@ -535,12 +492,6 @@ static inline void put_swap_device(struct swap_info_struct *si) } #else /* CONFIG_SWAP */ - -static inline int swap_readpage(struct page *page, bool do_poll) -{ - return 0; -} - static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return NULL; @@ -555,11 +506,6 @@ static inline void put_swap_device(struct swap_info_struct *si) { } -static inline struct address_space *swap_address_space(swp_entry_t entry) -{ - return NULL; -} - #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL @@ -574,14 +520,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); -static inline void free_swap_cache(struct page *page) -{ -} - -static inline void show_swap_cache_info(void) -{ -} - /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */ #define free_swap_and_cache(e) is_pfn_swap_entry(e) @@ -607,65 +545,6 @@ static inline void put_swap_page(struct page *page, swp_entry_t swp) { } -static inline struct page *swap_cluster_readahead(swp_entry_t entry, - gfp_t gfp_mask, struct vm_fault *vmf) -{ - return NULL; -} - -static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, - struct vm_fault *vmf) -{ - return NULL; -} - -static inline int swap_writepage(struct page *p, struct writeback_control *wbc) -{ - return 0; -} - -static inline struct page *lookup_swap_cache(swp_entry_t swp, - struct vm_area_struct *vma, - unsigned long addr) -{ - return NULL; -} - -static inline -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) -{ - return find_get_page(mapping, index); -} - -static inline int add_to_swap(struct page *page) -{ - return 0; -} - -static inline void *get_shadow_from_swap_cache(swp_entry_t entry) -{ - return NULL; -} - -static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, - gfp_t gfp_mask, void **shadowp) -{ - return -1; -} - -static inline void __delete_from_swap_cache(struct page *page, - swp_entry_t entry, void *shadow) -{ -} - -static inline void delete_from_swap_cache(struct page *page) -{ -} - -static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end) -{ -} static inline int page_swapcount(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a2f44d8d3d47..86eae6834db7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -39,6 +39,7 @@ #include #include #include "internal.h" +#include "swap.h" #define CREATE_TRACE_POINTS #include diff --git a/mm/madvise.c b/mm/madvise.c index c965fac7b13a..b19daedde0ea 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -35,6 +35,7 @@ #include #include "internal.h" +#include "swap.h" struct madvise_walk_private { struct mmu_gather *tlb; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d180ef985b17..235c85e17c2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -67,6 +67,7 @@ #include #include #include "slab.h" +#include "swap.h" #include diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9711c0eb70b0..1d117190c350 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -59,6 +59,7 @@ #include #include #include +#include "swap.h" #include "internal.h" #include "ras/ras_event.h" diff --git a/mm/memory.c b/mm/memory.c index bca60092b4d5..6e9428ae6d66 100644 --- a/mm/memory.c +++ 
b/mm/memory.c @@ -86,6 +86,7 @@ #include "pgalloc-track.h" #include "internal.h" +#include "swap.h" #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. diff --git a/mm/mincore.c b/mm/mincore.c index 9122676b54d6..f4f627325e12 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -20,6 +20,7 @@ #include #include +#include "swap.h" static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f01c71e41bcf..e85a0dc22761 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -81,6 +81,7 @@ #include "internal.h" #include "shuffle.h" #include "page_reporting.h" +#include "swap.h" /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ typedef int __bitwise fpi_t; diff --git a/mm/page_io.c b/mm/page_io.c index 89fbf3cae30f..4b359e59bfdc 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -26,6 +26,7 @@ #include #include #include +#include "swap.h" void end_swap_bio_write(struct bio *bio) { diff --git a/mm/shmem.c b/mm/shmem.c index e504d734177b..913c0b039333 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -38,6 +38,7 @@ #include #include #include +#include "swap.h" static struct vfsmount *shm_mnt; diff --git a/mm/swap.h b/mm/swap.h new file mode 100644 index 000000000000..f8265bf0ce00 --- /dev/null +++ b/mm/swap.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_SWAP_H +#define _MM_SWAP_H + +#ifdef CONFIG_SWAP +#include /* for bio_end_io_t */ + +/* linux/mm/page_io.c */ +int swap_readpage(struct page *page, bool do_poll); +int swap_writepage(struct page *page, struct writeback_control *wbc); +void end_swap_bio_write(struct bio *bio); +int __swap_writepage(struct page *page, struct writeback_control *wbc, + bio_end_io_t end_write_func); + +/* linux/mm/swap_state.c */ +/* One swap address space for each 64M swap space */ +#define SWAP_ADDRESS_SPACE_SHIFT 14 +#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) +extern struct address_space *swapper_spaces[]; +#define swap_address_space(entry) \ + (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ + >> SWAP_ADDRESS_SPACE_SHIFT]) + +void show_swap_cache_info(void); +int add_to_swap(struct page *page); +void *get_shadow_from_swap_cache(swp_entry_t entry); +int add_to_swap_cache(struct page *page, swp_entry_t entry, + gfp_t gfp, void **shadowp); +void __delete_from_swap_cache(struct page *page, + swp_entry_t entry, void *shadow); +void delete_from_swap_cache(struct page *page); +void clear_shadow_from_swap_cache(int type, unsigned long begin, + unsigned long end); +void free_swap_cache(struct page *page); +struct page *lookup_swap_cache(swp_entry_t entry, + struct vm_area_struct *vma, + unsigned long addr); +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); + +struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, + unsigned long addr, + bool do_poll); +struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, + unsigned long addr, + bool *new_page_allocated); +struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); +struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); + +#else /* CONFIG_SWAP */ +static inline int swap_readpage(struct page *page, bool do_poll) +{ + return 0; +} + +static inline struct 
address_space *swap_address_space(swp_entry_t entry) +{ + return NULL; +} + +static inline void free_swap_cache(struct page *page) +{ +} + +static inline void show_swap_cache_info(void) +{ +} + +static inline struct page *swap_cluster_readahead(swp_entry_t entry, + gfp_t gfp_mask, struct vm_fault *vmf) +{ + return NULL; +} + +static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, + struct vm_fault *vmf) +{ + return NULL; +} + +static inline int swap_writepage(struct page *p, struct writeback_control *wbc) +{ + return 0; +} + +static inline struct page *lookup_swap_cache(swp_entry_t swp, + struct vm_area_struct *vma, + unsigned long addr) +{ + return NULL; +} + +static inline +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +{ + return find_get_page(mapping, index); +} + +static inline int add_to_swap(struct page *page) +{ + return 0; +} + +static inline void *get_shadow_from_swap_cache(swp_entry_t entry) +{ + return NULL; +} + +static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, + gfp_t gfp_mask, void **shadowp) +{ + return -1; +} + +static inline void __delete_from_swap_cache(struct page *page, + swp_entry_t entry, void *shadow) +{ +} + +static inline void delete_from_swap_cache(struct page *page) +{ +} + +static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, + unsigned long end) +{ +} + +#endif /* CONFIG_SWAP */ +#endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 013856004825..5437dd317cf3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -23,6 +23,7 @@ #include #include #include "internal.h" +#include "swap.h" /* * swapper_space is a fiction, retained to simplify the path through diff --git a/mm/swapfile.c b/mm/swapfile.c index 7279b2d2d71d..f08bf675a613 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -44,6 +44,7 @@ #include #include #include +#include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); diff --git a/mm/util.c b/mm/util.c index 3492a9e81aa3..f27c796239b1 100644 --- a/mm/util.c +++ b/mm/util.c @@ -27,6 +27,7 @@ #include #include "internal.h" +#include "swap.h" /** * kfree_const - conditionally free memory diff --git a/mm/vmscan.c b/mm/vmscan.c index 6e255e2867d2..b1b91d5b9f49 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -59,6 +59,7 @@ #include #include "internal.h" +#include "swap.h" #define CREATE_TRACE_POINTS #include diff --git a/mm/zswap.c b/mm/zswap.c index 3efd8cae315e..2c5db4cbedea 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -36,6 +36,8 @@ #include #include +#include "swap.h" + /********************************* * statistics **********************************/ -- cgit v1.2.3 From 4c4a763406ef903b78334bd2ccea168d2f7a741a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:47 -0700 Subject: mm: drop swap_dirty_folio folios that are written to swap are owned by the MM subsystem - not any filesystem. When such a folio is passed to a filesystem to be written out to a swap-file, the filesystem handles the data, but the folio itself does not belong to the filesystem. So calling the filesystem's ->dirty_folio() address_space operation makes no sense. This is for folios in the given address space, and a folio to be written to swap does not exist in the given address space. So drop swap_dirty_folio() which calls the address-space's ->dirty_folio(), and always use noop_dirty_folio(), which is appropriate for folios being swapped out. 
Link: https://lkml.kernel.org/r/164859778123.29473.6900942583784889976.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - mm/page_io.c | 17 ----------------- mm/swap_state.c | 2 +- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index def67112dce4..8170254c56b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -427,7 +427,6 @@ extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP -bool swap_dirty_folio(struct address_space *mapping, struct folio *folio); int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); int generic_swapfile_activate(struct swap_info_struct *, struct file *, diff --git a/mm/page_io.c b/mm/page_io.c index 4b359e59bfdc..77fa0fb263f4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -385,20 +385,3 @@ out: delayacct_swapin_end(); return ret; } - -bool swap_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - struct swap_info_struct *sis = swp_swap_info(folio_swap_entry(folio)); - - if (data_race(sis->flags & SWP_FS_OPS)) { - const struct address_space_operations *aops; - - mapping = sis->swap_file->f_mapping; - aops = mapping->a_ops; - - VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); - return aops->dirty_folio(mapping, folio); - } else { - return noop_dirty_folio(mapping, folio); - } -} diff --git a/mm/swap_state.c b/mm/swap_state.c index 5437dd317cf3..f3ab01801629 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -31,7 +31,7 @@ */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .dirty_folio = swap_dirty_folio, + .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif -- cgit v1.2.3 From 4b60c0ff2f2021ab99b7fb9da63b7ed1579ef1d8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: mm: move responsibility for setting SWP_FS_OPS to ->swap_activate If a filesystem wishes to handle all swap IO itself (via ->direct_IO and ->readpage), rather than just providing device addresses for submit_bio(), SWP_FS_OPS must be set. Currently the protocol for setting this is to have ->swap_activate return zero. In that case SWP_FS_OPS is set, and add_swap_extent() is called for the entire file. This is a little clumsy as different return values for ->swap_activate have quite different meanings, and it makes it hard to search for which filesystems require SWP_FS_OPS to be set. So remove the special meaning of a zero return, and require the filesystem to set SWP_FS_OPS if it so desires, and to always call add_swap_extent() as required. Currently only NFS and CIFS return zero from ->swap_activate(). 
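To make the new contract concrete, a minimal sketch of a ->swap_activate() implementation under this scheme is shown below (myfs_swap_activate is hypothetical; the cifs and nfs conversions in the diff that follows use the same pattern):

static int myfs_swap_activate(struct swap_info_struct *sis,
			      struct file *file, sector_t *span)
{
	int ret;

	/* filesystem-specific validation/preparation would go here */

	/* describe the whole file as a single extent */
	ret = add_swap_extent(sis, 0, sis->max, 0);
	if (ret < 0)
		return ret;
	*span = sis->pages;
	/* the filesystem, not the block layer, will handle swap IO */
	sis->flags |= SWP_FS_OPS;
	return ret;	/* number of extents added */
}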
Link: https://lkml.kernel.org/r/164859778123.29473.17908205846599043598.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- fs/cifs/file.c | 3 ++- fs/nfs/file.c | 14 ++++++++++++-- include/linux/swap.h | 6 ++++++ mm/swapfile.c | 10 +++------- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d511a78383c3..14d29404c834 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4933,7 +4933,8 @@ static int cifs_swap_activate(struct swap_info_struct *sis, * from reading or writing the file */ - return 0; + sis->flags |= SWP_FS_OPS; + return add_swap_extent(sis, 0, sis->max, 0); } static void cifs_swap_deactivate(struct file *file) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 150b7fa8f0a7..a3a073604070 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -483,6 +483,7 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, { unsigned long blocks; long long isize; + int ret; struct inode *inode = file_inode(file); struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; @@ -496,13 +497,22 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, return -EINVAL; } - *span = sis->pages; + ret = rpc_clnt_swap_activate(clnt); + if (ret) + return ret; + ret = add_swap_extent(sis, 0, sis->max, 0); + if (ret < 0) { + rpc_clnt_swap_deactivate(clnt); + return ret; + } + *span = sis->pages; if (cl->rpc_ops->enable_swap) cl->rpc_ops->enable_swap(inode); - return rpc_clnt_swap_activate(clnt); + sis->flags |= SWP_FS_OPS; + return ret; } static void nfs_swap_deactivate(struct file *file) diff --git a/include/linux/swap.h b/include/linux/swap.h index 8170254c56b0..7daae5a4b3e1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -577,6 +577,12 @@ static inline swp_entry_t get_swap_page(struct page *page) return entry; } +static inline int add_swap_extent(struct swap_info_struct *sis, + unsigned long start_page, + unsigned long nr_pages, sector_t start_block) +{ + return -EINVAL; +} #endif /* CONFIG_SWAP */ #ifdef CONFIG_THP_SWAP diff --git a/mm/swapfile.c b/mm/swapfile.c index f08bf675a613..d811458c0a88 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2259,13 +2259,9 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); - if (ret >= 0) - sis->flags |= SWP_ACTIVATED; - if (!ret) { - sis->flags |= SWP_FS_OPS; - ret = add_swap_extent(sis, 0, sis->max, 0); - *span = sis->pages; - } + if (ret < 0) + return ret; + sis->flags |= SWP_ACTIVATED; return ret; } -- cgit v1.2.3 From d791ea676b66489ef6dabd04cd655f5c77426e40 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: mm: reclaim mustn't enter FS for SWP_FS_OPS swap-space If swap-out is using filesystem operations (SWP_FS_OPS), then it is not safe to enter the FS for reclaim. So only down-grade the requirement for swap pages to __GFP_IO after checking that SWP_FS_OPS are not being used. This makes the calculation of "may_enter_fs" slightly more complex, so move it into a separate function. with that done, there is little value in maintaining the bool variable any more. So replace the may_enter_fs variable with a may_enter_fs() function. 
This removes any risk for the variable becoming out-of-date. Link: https://lkml.kernel.org/r/164859778124.29473.16176717935781721855.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/swap.h | 8 ++++++++ mm/vmscan.c | 29 ++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index f8265bf0ce00..e19f185df5e2 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -50,6 +50,10 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); +static inline unsigned int page_swap_flags(struct page *page) +{ + return page_swap_info(page)->flags; +} #else /* CONFIG_SWAP */ static inline int swap_readpage(struct page *page, bool do_poll) { @@ -129,5 +133,9 @@ static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, { } +static inline unsigned int page_swap_flags(struct page *page) +{ + return 0; +} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ diff --git a/mm/vmscan.c b/mm/vmscan.c index b1b91d5b9f49..95ebcb5b3e12 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1502,6 +1502,22 @@ static unsigned int demote_page_list(struct list_head *demote_pages, return nr_succeeded; } +static bool may_enter_fs(struct page *page, gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_FS) + return true; + if (!PageSwapCache(page) || !(gfp_mask & __GFP_IO)) + return false; + /* + * We can "enter_fs" for swap-cache with only __GFP_IO + * providing this isn't SWP_FS_OPS. + * ->flags can be updated non-atomicially (scan_swap_map_slots), + * but that will never affect SWP_FS_OPS, so the data_race + * is safe. + */ + return !data_race(page_swap_flags(page) & SWP_FS_OPS); +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1528,7 +1544,7 @@ retry: struct page *page; struct folio *folio; enum page_references references = PAGEREF_RECLAIM; - bool dirty, writeback, may_enter_fs; + bool dirty, writeback; unsigned int nr_pages; cond_resched(); @@ -1553,9 +1569,6 @@ retry: if (!sc->may_unmap && page_mapped(page)) goto keep_locked; - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); - /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -1598,7 +1611,7 @@ retry: * not to fs). In this case mark the page for immediate * reclaim and continue scanning. * - * Require may_enter_fs because we would wait on fs, which + * Require may_enter_fs() because we would wait on fs, which * may not have submitted IO yet. 
And the loop driver might enter reclaim, and deadlock if it waits on a page for which it is needed to do the write (loop masks off @@ -1630,7 +1643,7 @@ retry: /* Case 2 above */ } else if (writeback_throttling_sane(sc) || - !PageReclaim(page) || !may_enter_fs) { + !PageReclaim(page) || !may_enter_fs(page, sc->gfp_mask)) { /* * This is slightly racy - end_page_writeback() * might have just cleared PageReclaim, then @@ -1720,8 +1733,6 @@ retry: goto activate_locked_split; } - may_enter_fs = true; - /* Adding to swap updated mapping */ mapping = page_mapping(page); } @@ -1792,7 +1803,7 @@ retry: if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; - if (!may_enter_fs) + if (!may_enter_fs(page, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked; -- cgit v1.2.3 From e1209d3a7a67c281260ba9989621060ba7328b8c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: mm: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space Swap currently uses ->readpage to read swap pages. This can only request one page at a time from the filesystem, which is not the most efficient. Swap uses ->direct_IO for writes, which, while adequate, is an inappropriate overloading: ->direct_IO may need to handle allocating space for holes or other details that are not relevant for swap. So this patch introduces a new address_space operation: ->swap_rw. In this patch it is used for reads, and a subsequent patch will switch writes to use it. No filesystem yet supports ->swap_rw, but that is not a problem because no filesystem actually works with filesystem-based swap. Only two filesystems set SWP_FS_OPS: - cifs sets the flag, but ->direct_IO always fails so swap cannot work. - nfs sets the flag, but ->direct_IO calls generic_write_checks() which has failed on swap files for several releases. To ensure that a NULL ->swap_rw isn't called, ->swap_activate() for both NFS and cifs is changed to fail if ->swap_rw is not set. This can be removed if/when the function is added. Future patches will restore swap-over-NFS functionality. To submit an async read with ->swap_rw() we need to allocate a structure to hold the kiocb and other details. swap_readpage() cannot handle transient failure, so we create a mempool to provide the structures. 
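Although no filesystem implements ->swap_rw yet at this point in the series, the intended shape of an implementation is roughly the sketch below (myfs_swap_rw, myfs_direct_read and myfs_direct_write are hypothetical stand-ins for a filesystem's existing direct-IO paths; the nfs conversion later in the series follows this pattern, returning zero on success rather than a byte count):

static int myfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t ret;

	if (iov_iter_rw(iter) == READ)
		ret = myfs_direct_read(iocb, iter);	/* hypothetical helper */
	else
		ret = myfs_direct_write(iocb, iter);	/* hypothetical helper */

	return ret < 0 ? ret : 0;	/* 0 on success, negative errno on error */
}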
Link: https://lkml.kernel.org/r/164859778125.29473.13430559328221330589.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- fs/cifs/file.c | 4 ++++ fs/nfs/file.c | 4 ++++ include/linux/fs.h | 1 + mm/page_io.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++----- mm/swap.h | 1 + mm/swapfile.c | 5 ++++ 6 files changed, 77 insertions(+), 6 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 14d29404c834..794e2cc23e3f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4905,6 +4905,10 @@ static int cifs_swap_activate(struct swap_info_struct *sis, cifs_dbg(FYI, "swap activate\n"); + if (!swap_file->f_mapping->a_ops->swap_rw) + /* Cannot support swap */ + return -EINVAL; + spin_lock(&inode->i_lock); blocks = inode->i_blocks; isize = inode->i_size; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a3a073604070..55bc8c8ca71a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -488,6 +488,10 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; + if (!file->f_mapping->a_ops->swap_rw) + /* Cannot support swap */ + return -EINVAL; + spin_lock(&inode->i_lock); blocks = inode->i_blocks; isize = inode->i_size; diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..dbc5d10328df 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -380,6 +380,7 @@ struct address_space_operations { int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; diff --git a/mm/page_io.c b/mm/page_io.c index 77fa0fb263f4..2724d6389104 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -235,6 +235,25 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) #define bio_associate_blkg_from_page(bio, page) do { } while (0) #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */ +struct swap_iocb { + struct kiocb iocb; + struct bio_vec bvec; +}; +static mempool_t *sio_pool; + +int sio_pool_init(void) +{ + if (!sio_pool) { + mempool_t *pool = mempool_create_kmalloc_pool( + SWAP_CLUSTER_MAX, sizeof(struct swap_iocb)); + if (cmpxchg(&sio_pool, NULL, pool)) + mempool_destroy(pool); + } + if (!sio_pool) + return -ENOMEM; + return 0; +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { @@ -306,6 +325,48 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } +static void sio_read_complete(struct kiocb *iocb, long ret) +{ + struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); + struct page *page = sio->bvec.bv_page; + + if (ret != 0 && ret != PAGE_SIZE) { + SetPageError(page); + ClearPageUptodate(page); + pr_alert_ratelimited("Read-error on swap-device\n"); + } else { + SetPageUptodate(page); + count_vm_event(PSWPIN); + } + unlock_page(page); + mempool_free(sio, sio_pool); +} + +static int swap_readpage_fs(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct iov_iter from; + struct swap_iocb *sio; + loff_t pos = page_file_offset(page); + int ret; + + sio = 
mempool_alloc(sio_pool, GFP_KERNEL); + init_sync_kiocb(&sio->iocb, swap_file); + sio->iocb.ki_pos = pos; + sio->iocb.ki_complete = sio_read_complete; + sio->bvec.bv_page = page; + sio->bvec.bv_len = PAGE_SIZE; + sio->bvec.bv_offset = 0; + + iov_iter_bvec(&from, READ, &sio->bvec, 1, PAGE_SIZE); + ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + if (ret != -EIOCBQUEUED) + sio_read_complete(&sio->iocb, ret); + return ret; +} + int swap_readpage(struct page *page, bool synchronous) { struct bio *bio; @@ -334,12 +395,7 @@ int swap_readpage(struct page *page, bool synchronous) } if (data_race(sis->flags & SWP_FS_OPS)) { - struct file *swap_file = sis->swap_file; - struct address_space *mapping = swap_file->f_mapping; - - ret = mapping->a_ops->readpage(swap_file, page); - if (!ret) - count_vm_event(PSWPIN); + ret = swap_readpage_fs(page); goto out; } diff --git a/mm/swap.h b/mm/swap.h index e19f185df5e2..eafac80b18d9 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -6,6 +6,7 @@ #include /* for bio_end_io_t */ /* linux/mm/page_io.c */ +int sio_pool_init(void); int swap_readpage(struct page *page, bool do_poll); int swap_writepage(struct page *page, struct writeback_control *wbc); void end_swap_bio_write(struct bio *bio); diff --git a/mm/swapfile.c b/mm/swapfile.c index d811458c0a88..9398e915b36b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2262,6 +2262,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; + if ((sis->flags & SWP_FS_OPS) && + sio_pool_init() != 0) { + destroy_swap_extents(sis); + return -ENOMEM; + } return ret; } -- cgit v1.2.3 From eb79f3af9395fbe448e91b0940a6c395b7d06be4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: nfs: rename nfs_direct_IO and use as ->swap_rw The nfs_direct_IO() exists to support SWAP IO, but hasn't worked for a while. We now need a ->swap_rw function which behaves slightly differently, returning zero for success rather than a byte count. So modify nfs_direct_IO accordingly, rename it, and use it as the ->swap_rw function. Link: https://lkml.kernel.org/r/165119301493.15698.7491285551903597618.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: Geert Uytterhoeven (on Renesas RSK+RZA1 with 32 MiB of SDRAM) Cc: David Howells Cc: Hugh Dickins Cc: Mel Gorman Cc: Miaohe Lin Cc: Trond Myklebust Signed-off-by: Andrew Morton --- fs/nfs/direct.c | 23 ++++++++++------------- fs/nfs/file.c | 5 +---- include/linux/nfs_fs.h | 2 +- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 11c566d8769f..4eb2a8380a28 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -153,28 +153,25 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, } /** - * nfs_direct_IO - NFS address space operation for direct I/O + * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block * @iter: I/O buffer * - * The presence of this routine in the address space ops vector means - * the NFS client supports direct I/O. However, for most direct IO, we - * shunt off direct read and write requests before the VFS gets them, - * so this method is only ever called for swap. + * Perform IO to the swap-file. This is much like direct IO. 
*/ -ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - - /* we only support swap file calling nfs_direct_IO */ - if (!IS_SWAPFILE(inode)) - return 0; + ssize_t ret; VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter, true); - return nfs_file_direct_write(iocb, iter, true); + ret = nfs_file_direct_read(iocb, iter, true); + else + ret = nfs_file_direct_write(iocb, iter, true); + if (ret < 0) + return ret; + return 0; } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 55bc8c8ca71a..0e7686a5d0de 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -488,10 +488,6 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; - if (!file->f_mapping->a_ops->swap_rw) - /* Cannot support swap */ - return -EINVAL; - spin_lock(&inode->i_lock); blocks = inode->i_blocks; isize = inode->i_size; @@ -549,6 +545,7 @@ const struct address_space_operations nfs_file_aops = { .error_remove_page = generic_error_remove_page, .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, + .swap_rw = nfs_swap_rw, }; /* diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b48b9259e02c..fd5543486a3f 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -507,7 +507,7 @@ static inline const struct cred *nfs_file_cred(struct file *file) /* * linux/fs/nfs/direct.c */ -extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *); +int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter); ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, bool swap); ssize_t nfs_file_direct_write(struct kiocb *iocb, -- cgit v1.2.3 From 7eadabc05d45ecedc0e8906d1db46bc8cfeb02af Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: mm: perform async writes to SWP_FS_OPS swap-space using ->swap_rw This patch switches swap-out to SWP_FS_OPS swap-spaces to use ->swap_rw and makes the writes asynchronous, like they are for other swap spaces. To make it async we need to allocate the kiocb struct from a mempool. This may block, but won't block as long as waiting for the write to complete. At most it will wait for some previous swap IO to complete. Link: https://lkml.kernel.org/r/164859778126.29473.12399585304843922231.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/page_io.c | 98 +++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 2724d6389104..fafbda338f72 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -254,6 +254,57 @@ int sio_pool_init(void) return 0; } +static void sio_write_complete(struct kiocb *iocb, long ret) +{ + struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); + struct page *page = sio->bvec.bv_page; + + if (ret != PAGE_SIZE) { + /* + * In the case of swap-over-nfs, this can be a + * temporary failure if the system has limited + * memory for allocating transmit buffers. 
+ * Mark the page dirty and avoid + * folio_rotate_reclaimable but rate-limit the + * messages but do not flag PageError like + * the normal direct-to-bio case as it could + * be temporary. + */ + set_page_dirty(page); + ClearPageReclaim(page); + pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", + ret, page_file_offset(page)); + } else + count_vm_event(PSWPOUT); + end_page_writeback(page); + mempool_free(sio, sio_pool); +} + +static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) +{ + struct swap_iocb *sio; + struct swap_info_struct *sis = page_swap_info(page); + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct iov_iter from; + int ret; + + set_page_writeback(page); + unlock_page(page); + sio = mempool_alloc(sio_pool, GFP_NOIO); + init_sync_kiocb(&sio->iocb, swap_file); + sio->iocb.ki_complete = sio_write_complete; + sio->iocb.ki_pos = page_file_offset(page); + sio->bvec.bv_page = page; + sio->bvec.bv_len = PAGE_SIZE; + sio->bvec.bv_offset = 0; + iov_iter_bvec(&from, WRITE, &sio->bvec, 1, PAGE_SIZE); + ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + if (ret != -EIOCBQUEUED) + sio_write_complete(&sio->iocb, ret); + return ret; +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { @@ -262,46 +313,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (data_race(sis->flags & SWP_FS_OPS)) { - struct kiocb kiocb; - struct file *swap_file = sis->swap_file; - struct address_space *mapping = swap_file->f_mapping; - struct bio_vec bv = { - .bv_page = page, - .bv_len = PAGE_SIZE, - .bv_offset = 0 - }; - struct iov_iter from; - - iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE); - init_sync_kiocb(&kiocb, swap_file); - kiocb.ki_pos = page_file_offset(page); - - set_page_writeback(page); - unlock_page(page); - ret = mapping->a_ops->direct_IO(&kiocb, &from); - if (ret == PAGE_SIZE) { - count_vm_event(PSWPOUT); - ret = 0; - } else { - /* - * In the case of swap-over-nfs, this can be a - * temporary failure if the system has limited - * memory for allocating transmit buffers. - * Mark the page dirty and avoid - * folio_rotate_reclaimable but rate-limit the - * messages but do not flag PageError like - * the normal direct-to-bio case as it could - * be temporary. - */ - set_page_dirty(page); - ClearPageReclaim(page); - pr_err_ratelimited("Write error on dio swapfile (%llu)\n", - page_file_offset(page)); - } - end_page_writeback(page); - return ret; - } + /* + * ->flags can be updated non-atomicially (scan_swap_map_slots), + * but that will never affect SWP_FS_OPS, so the data_race + * is safe. + */ + if (data_race(sis->flags & SWP_FS_OPS)) + return swap_writepage_fs(page, wbc); ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); if (!ret) { -- cgit v1.2.3 From cba738f6490953e154d163b2980ad6d7f06307aa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: doc: update documentation for swap_activate and swap_rw This documentation for ->swap_activate() has been out-of-date for a long time. 
This patch updates it to match recent changes, and adds documentation for the associated ->swap_rw() Link: https://lkml.kernel.org/r/164859778126.29473.6778751233552859461.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- Documentation/filesystems/locking.rst | 18 ++++++++++++------ Documentation/filesystems/vfs.rst | 17 ++++++++++++----- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index c26d854275a0..ef2efa1cb4ec 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -258,8 +258,9 @@ prototypes:: int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_page)(struct address_space *, struct page *); - int (*swap_activate)(struct file *); + int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) int (*swap_deactivate)(struct file *); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); locking rules: All except dirty_folio and freepage may block @@ -287,6 +288,7 @@ is_partially_uptodate: yes error_remove_page: yes swap_activate: no swap_deactivate: no +swap_rw: yes, unlocks ====================== ======================== ========= =============== ->write_begin(), ->write_end() and ->readpage() may be called from @@ -386,15 +388,19 @@ cleaned, or an error value if not. Note that in order to prevent the folio getting mapped back in and redirtied, it needs to be kept locked across the entire operation. -->swap_activate will be called with a non-zero argument on -files backing (non block device backed) swapfiles. A return value -of zero indicates success, in which case this file can be used for -backing swapspace. The swapspace operations will be proxied to the -address space operations. +->swap_activate() will be called to prepare the given file for swap. It +should perform any validation and preparation necessary to ensure that +writes can be performed with minimal memory allocation. It should call +add_swap_extent(), or the helper iomap_swapfile_activate(), and return +the number of extents added. If IO should be submitted through +->swap_rw(), it should set SWP_FS_OPS, otherwise IO will be submitted +directly to the block device ``sis->bdev``. ->swap_deactivate() will be called in the sys_swapoff() path after ->swap_activate() returned success. +->swap_rw will be called for swap IO if SWP_FS_OPS was set by ->swap_activate(). + file_lock_operations ==================== diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 794bd1a66bfb..d4bd213d215b 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -749,8 +749,9 @@ cache in your filesystem. The following members are defined: size_t count); void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page) (struct mapping *mapping, struct page *page); - int (*swap_activate)(struct file *); + int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) int (*swap_deactivate)(struct file *); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; ``writepage`` @@ -952,15 +953,21 @@ cache in your filesystem. The following members are defined: unless you have them locked or reference counts increased. 
``swap_activate`` - Called when swapon is used on a file to allocate space if - necessary and pin the block lookup information in memory. A - return value of zero indicates success, in which case this file - can be used to back swapspace. + + Called to prepare the given file for swap. It should perform + any validation and preparation necessary to ensure that writes + can be performed with minimal memory allocation. It should call + add_swap_extent(), or the helper iomap_swapfile_activate(), and + return the number of extents added. If IO should be submitted + through ->swap_rw(), it should set SWP_FS_OPS, otherwise IO will + be submitted directly to the block device ``sis->bdev``. ``swap_deactivate`` Called during swapoff on files where swap_activate was successful. +``swap_rw`` + Called to read or write swap pages when SWP_FS_OPS is set. The File Object =============== -- cgit v1.2.3 From 5169b844b7dd5934cd4f22ab66de0cc669abf0b0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:49 -0700 Subject: mm: submit multipage reads for SWP_FS_OPS swap-space swap_readpage() is given one page at a time, but may be called repeatedly in succession. For block-device swap-space, the blk_plug functionality allows the multiple pages to be combined together at lower layers. That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is only active when CONFIG_BLOCK=y. Consequently all swap reads over NFS are single page reads. With this patch we pass in a pointer-to-pointer when swap_readpage can store state between calls - much like the effect of blk_plug. After calling swap_readpage() some number of times, the state will be passed to swap_read_unplug() which can submit the combined request. Link: https://lkml.kernel.org/r/164859778127.29473.14059420492644907783.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/madvise.c | 8 +++-- mm/memory.c | 2 +- mm/page_io.c | 104 +++++++++++++++++++++++++++++++++++++------------------- mm/swap.h | 17 +++++++-- mm/swap_state.c | 20 +++++++---- 5 files changed, 104 insertions(+), 47 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index b19daedde0ea..4d6592488b51 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -198,6 +198,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, pte_t *orig_pte; struct vm_area_struct *vma = walk->private; unsigned long index; + struct swap_iocb *splug = NULL; if (pmd_none_or_trans_huge_or_clear_bad(pmd)) return 0; @@ -219,10 +220,11 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, continue; page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, - vma, index, false); + vma, index, false, &splug); if (page) put_page(page); } + swap_read_unplug(splug); return 0; } @@ -238,6 +240,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1); struct page *page; + struct swap_iocb *splug = NULL; rcu_read_lock(); xas_for_each(&xas, page, end_index) { @@ -250,13 +253,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, - NULL, 0, false); + NULL, 0, false, &splug); if (page) put_page(page); rcu_read_lock(); } rcu_read_unlock(); + swap_read_unplug(splug); 
lru_add_drain(); /* Push any new pages onto the LRU now */ } diff --git a/mm/memory.c b/mm/memory.c index 6e9428ae6d66..90212057a546 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3633,7 +3633,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* To provide entry to swap_readpage() */ set_page_private(page, entry.val); - swap_readpage(page, true); + swap_readpage(page, true, NULL); set_page_private(page, 0); } } else { diff --git a/mm/page_io.c b/mm/page_io.c index fafbda338f72..a63510fd6611 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -237,7 +237,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) struct swap_iocb { struct kiocb iocb; - struct bio_vec bvec; + struct bio_vec bvec[SWAP_CLUSTER_MAX]; + int pages; }; static mempool_t *sio_pool; @@ -257,7 +258,7 @@ int sio_pool_init(void) static void sio_write_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); - struct page *page = sio->bvec.bv_page; + struct page *page = sio->bvec[0].bv_page; if (ret != PAGE_SIZE) { /* @@ -295,10 +296,10 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) init_sync_kiocb(&sio->iocb, swap_file); sio->iocb.ki_complete = sio_write_complete; sio->iocb.ki_pos = page_file_offset(page); - sio->bvec.bv_page = page; - sio->bvec.bv_len = PAGE_SIZE; - sio->bvec.bv_offset = 0; - iov_iter_bvec(&from, WRITE, &sio->bvec, 1, PAGE_SIZE); + sio->bvec[0].bv_page = page; + sio->bvec[0].bv_len = PAGE_SIZE; + sio->bvec[0].bv_offset = 0; + iov_iter_bvec(&from, WRITE, &sio->bvec[0], 1, PAGE_SIZE); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); @@ -346,46 +347,66 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, static void sio_read_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); - struct page *page = sio->bvec.bv_page; + int p; - if (ret != 0 && ret != PAGE_SIZE) { - SetPageError(page); - ClearPageUptodate(page); - pr_alert_ratelimited("Read-error on swap-device\n"); + if (ret == PAGE_SIZE * sio->pages) { + for (p = 0; p < sio->pages; p++) { + struct page *page = sio->bvec[p].bv_page; + + SetPageUptodate(page); + unlock_page(page); + } + count_vm_events(PSWPIN, sio->pages); } else { - SetPageUptodate(page); - count_vm_event(PSWPIN); + for (p = 0; p < sio->pages; p++) { + struct page *page = sio->bvec[p].bv_page; + + SetPageError(page); + ClearPageUptodate(page); + unlock_page(page); + } + pr_alert_ratelimited("Read-error on swap-device\n"); } - unlock_page(page); mempool_free(sio, sio_pool); } -static int swap_readpage_fs(struct page *page) +static void swap_readpage_fs(struct page *page, + struct swap_iocb **plug) { struct swap_info_struct *sis = page_swap_info(page); - struct file *swap_file = sis->swap_file; - struct address_space *mapping = swap_file->f_mapping; - struct iov_iter from; - struct swap_iocb *sio; + struct swap_iocb *sio = NULL; loff_t pos = page_file_offset(page); - int ret; - - sio = mempool_alloc(sio_pool, GFP_KERNEL); - init_sync_kiocb(&sio->iocb, swap_file); - sio->iocb.ki_pos = pos; - sio->iocb.ki_complete = sio_read_complete; - sio->bvec.bv_page = page; - sio->bvec.bv_len = PAGE_SIZE; - sio->bvec.bv_offset = 0; - iov_iter_bvec(&from, READ, &sio->bvec, 1, PAGE_SIZE); - ret = mapping->a_ops->swap_rw(&sio->iocb, &from); - if (ret != -EIOCBQUEUED) - sio_read_complete(&sio->iocb, ret); - return ret; + if (plug) + sio = *plug; + if (sio) { + if 
(sio->iocb.ki_filp != sis->swap_file || + sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) { + swap_read_unplug(sio); + sio = NULL; + } + } + if (!sio) { + sio = mempool_alloc(sio_pool, GFP_KERNEL); + init_sync_kiocb(&sio->iocb, sis->swap_file); + sio->iocb.ki_pos = pos; + sio->iocb.ki_complete = sio_read_complete; + sio->pages = 0; + } + sio->bvec[sio->pages].bv_page = page; + sio->bvec[sio->pages].bv_len = PAGE_SIZE; + sio->bvec[sio->pages].bv_offset = 0; + sio->pages += 1; + if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) { + swap_read_unplug(sio); + sio = NULL; + } + if (plug) + *plug = sio; } -int swap_readpage(struct page *page, bool synchronous) +int swap_readpage(struct page *page, bool synchronous, + struct swap_iocb **plug) { struct bio *bio; int ret = 0; @@ -413,7 +434,7 @@ int swap_readpage(struct page *page, bool synchronous) } if (data_race(sis->flags & SWP_FS_OPS)) { - ret = swap_readpage_fs(page); + swap_readpage_fs(page, plug); goto out; } @@ -459,3 +480,16 @@ out: delayacct_swapin_end(); return ret; } + +void __swap_read_unplug(struct swap_iocb *sio) +{ + struct iov_iter from; + struct address_space *mapping = sio->iocb.ki_filp->f_mapping; + int ret; + + iov_iter_bvec(&from, READ, sio->bvec, sio->pages, + PAGE_SIZE * sio->pages); + ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + if (ret != -EIOCBQUEUED) + sio_read_complete(&sio->iocb, ret); +} diff --git a/mm/swap.h b/mm/swap.h index eafac80b18d9..0389ab147837 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -7,7 +7,15 @@ /* linux/mm/page_io.c */ int sio_pool_init(void); -int swap_readpage(struct page *page, bool do_poll); +struct swap_iocb; +int swap_readpage(struct page *page, bool do_poll, + struct swap_iocb **plug); +void __swap_read_unplug(struct swap_iocb *plug); +static inline void swap_read_unplug(struct swap_iocb *plug) +{ + if (unlikely(plug)) + __swap_read_unplug(plug); +} int swap_writepage(struct page *page, struct writeback_control *wbc); void end_swap_bio_write(struct bio *bio); int __swap_writepage(struct page *page, struct writeback_control *wbc, @@ -41,7 +49,8 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, - bool do_poll); + bool do_poll, + struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, @@ -56,7 +65,9 @@ static inline unsigned int page_swap_flags(struct page *page) return page_swap_info(page)->flags; } #else /* CONFIG_SWAP */ -static inline int swap_readpage(struct page *page, bool do_poll) +struct swap_iocb; +static inline int swap_readpage(struct page *page, bool do_poll, + struct swap_iocb **plug) { return 0; } diff --git a/mm/swap_state.c b/mm/swap_state.c index f3ab01801629..d41746a572a2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -520,14 +520,16 @@ fail_unlock: * the swap entry is no longer in use. 
*/ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr, bool do_poll) + struct vm_area_struct *vma, + unsigned long addr, bool do_poll, + struct swap_iocb **plug) { bool page_was_allocated; struct page *retpage = __read_swap_cache_async(entry, gfp_mask, vma, addr, &page_was_allocated); if (page_was_allocated) - swap_readpage(retpage, do_poll); + swap_readpage(retpage, do_poll, plug); return retpage; } @@ -621,6 +623,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long mask; struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; + struct swap_iocb *splug = NULL; bool do_poll = true, page_allocated; struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; @@ -647,7 +650,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!page) continue; if (page_allocated) { - swap_readpage(page, false); + swap_readpage(page, false, &splug); if (offset != entry_offset) { SetPageReadahead(page); count_vm_event(SWAP_RA); @@ -656,10 +659,12 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, put_page(page); } blk_finish_plug(&plug); + swap_read_unplug(splug); lru_add_drain(); /* Push any new pages onto the LRU now */ skip: - return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll); + /* The page was likely read above, so no need for plugging here */ + return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL); } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -790,6 +795,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct vm_fault *vmf) { struct blk_plug plug; + struct swap_iocb *splug = NULL; struct vm_area_struct *vma = vmf->vma; struct page *page; pte_t *pte, pentry; @@ -820,7 +826,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, if (!page) continue; if (page_allocated) { - swap_readpage(page, false); + swap_readpage(page, false, &splug); if (i != ra_info.offset) { SetPageReadahead(page); count_vm_event(SWAP_RA); @@ -829,10 +835,12 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, put_page(page); } blk_finish_plug(&plug); + swap_read_unplug(splug); lru_add_drain(); skip: + /* The page was likely read above, so no need for plugging here */ return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - ra_info.win == 1); + ra_info.win == 1, NULL); } /** -- cgit v1.2.3 From 2282679fb20bf036a714ed49fadd0230c278a203 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:49 -0700 Subject: mm: submit multipage write for SWP_FS_OPS swap-space swap_writepage() is given one page at a time, but may be called repeatedly in succession. For block-device swapspace, the blk_plug functionality allows the multiple pages to be combined together at lower layers. That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is only active when CONFIG_BLOCK=y. Consequently all swap reads over NFS are single page reads. With this patch we pass a pointer-to-pointer via the wbc. swap_writepage can store state between calls - much like the pointer passed explicitly to swap_readpage. After calling swap_writepage() some number of times, the state will be passed to swap_write_unplug() which can submit the combined request. 
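For illustration, the intended calling pattern is sketched below; it mirrors the mm/vmscan.c hunk later in this patch, but the helper name write_out_pages() is invented for the example. The caller keeps a NULL-initialised plug, hands its address to every swap_writepage() call through the wbc, and finally submits whatever is still batched.

	/* Sketch only, not part of the patch. */
	static void write_out_pages(struct list_head *pages)
	{
		struct swap_iocb *plug = NULL;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.for_reclaim = 1,
			.swap_plug = &plug,		/* field added by this patch */
		};
		struct page *page, *next;

		list_for_each_entry_safe(page, next, pages, lru)
			swap_writepage(page, &wbc);	/* may only queue into the plug */

		if (plug)
			swap_write_unplug(plug);	/* submit the combined request */
	}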
Link: https://lkml.kernel.org/r/164859778128.29473.5191868522654408537.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/writeback.h | 7 +++++ mm/page_io.c | 78 ++++++++++++++++++++++++++++++++++------------- mm/swap.h | 4 +++ mm/vmscan.c | 9 ++++-- 4 files changed, 74 insertions(+), 24 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fec248ab1fec..32b35f21cb97 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -80,6 +80,13 @@ struct writeback_control { unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */ + /* To enable batching of swap writes to non-block-device backends, + * "plug" can be set point to a 'struct swap_iocb *'. When all swap + * writes have been submitted, if with swap_iocb is not NULL, + * swap_write_unplug() should be called. + */ + struct swap_iocb **swap_plug; + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ diff --git a/mm/page_io.c b/mm/page_io.c index a63510fd6611..c132511f521c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -259,8 +259,9 @@ static void sio_write_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); struct page *page = sio->bvec[0].bv_page; + int p; - if (ret != PAGE_SIZE) { + if (ret != PAGE_SIZE * sio->pages) { /* * In the case of swap-over-nfs, this can be a * temporary failure if the system has limited @@ -271,43 +272,63 @@ static void sio_write_complete(struct kiocb *iocb, long ret) * the normal direct-to-bio case as it could * be temporary. 
*/ - set_page_dirty(page); - ClearPageReclaim(page); pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", ret, page_file_offset(page)); + for (p = 0; p < sio->pages; p++) { + page = sio->bvec[p].bv_page; + set_page_dirty(page); + ClearPageReclaim(page); + } } else - count_vm_event(PSWPOUT); - end_page_writeback(page); + count_vm_events(PSWPOUT, sio->pages); + + for (p = 0; p < sio->pages; p++) + end_page_writeback(sio->bvec[p].bv_page); + mempool_free(sio, sio_pool); } static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) { - struct swap_iocb *sio; + struct swap_iocb *sio = NULL; struct swap_info_struct *sis = page_swap_info(page); struct file *swap_file = sis->swap_file; - struct address_space *mapping = swap_file->f_mapping; - struct iov_iter from; - int ret; + loff_t pos = page_file_offset(page); set_page_writeback(page); unlock_page(page); - sio = mempool_alloc(sio_pool, GFP_NOIO); - init_sync_kiocb(&sio->iocb, swap_file); - sio->iocb.ki_complete = sio_write_complete; - sio->iocb.ki_pos = page_file_offset(page); - sio->bvec[0].bv_page = page; - sio->bvec[0].bv_len = PAGE_SIZE; - sio->bvec[0].bv_offset = 0; - iov_iter_bvec(&from, WRITE, &sio->bvec[0], 1, PAGE_SIZE); - ret = mapping->a_ops->swap_rw(&sio->iocb, &from); - if (ret != -EIOCBQUEUED) - sio_write_complete(&sio->iocb, ret); - return ret; + if (wbc->swap_plug) + sio = *wbc->swap_plug; + if (sio) { + if (sio->iocb.ki_filp != swap_file || + sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) { + swap_write_unplug(sio); + sio = NULL; + } + } + if (!sio) { + sio = mempool_alloc(sio_pool, GFP_NOIO); + init_sync_kiocb(&sio->iocb, swap_file); + sio->iocb.ki_complete = sio_write_complete; + sio->iocb.ki_pos = pos; + sio->pages = 0; + } + sio->bvec[sio->pages].bv_page = page; + sio->bvec[sio->pages].bv_len = PAGE_SIZE; + sio->bvec[sio->pages].bv_offset = 0; + sio->pages += 1; + if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) { + swap_write_unplug(sio); + sio = NULL; + } + if (wbc->swap_plug) + *wbc->swap_plug = sio; + + return 0; } int __swap_writepage(struct page *page, struct writeback_control *wbc, - bio_end_io_t end_write_func) + bio_end_io_t end_write_func) { struct bio *bio; int ret; @@ -344,6 +365,19 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } +void swap_write_unplug(struct swap_iocb *sio) +{ + struct iov_iter from; + struct address_space *mapping = sio->iocb.ki_filp->f_mapping; + int ret; + + iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, + PAGE_SIZE * sio->pages); + ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + if (ret != -EIOCBQUEUED) + sio_write_complete(&sio->iocb, ret); +} + static void sio_read_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); diff --git a/mm/swap.h b/mm/swap.h index 0389ab147837..a6da8f612904 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -16,6 +16,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug) if (unlikely(plug)) __swap_read_unplug(plug); } +void swap_write_unplug(struct swap_iocb *sio); int swap_writepage(struct page *page, struct writeback_control *wbc); void end_swap_bio_write(struct bio *bio); int __swap_writepage(struct page *page, struct writeback_control *wbc, @@ -71,6 +72,9 @@ static inline int swap_readpage(struct page *page, bool do_poll, { return 0; } +static inline void swap_write_unplug(struct swap_iocb *sio) +{ +} static inline struct address_space *swap_address_space(swp_entry_t entry) { diff --git a/mm/vmscan.c b/mm/vmscan.c 
index 95ebcb5b3e12..a9761b04564c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1156,7 +1156,8 @@ typedef enum { * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -static pageout_t pageout(struct folio *folio, struct address_space *mapping) +static pageout_t pageout(struct folio *folio, struct address_space *mapping, + struct swap_iocb **plug) { /* * If the folio is dirty, only perform writeback if that write @@ -1201,6 +1202,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping) .range_start = 0, .range_end = LLONG_MAX, .for_reclaim = 1, + .swap_plug = plug, }; folio_set_reclaim(folio); @@ -1533,6 +1535,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; + struct swap_iocb *plug = NULL; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1814,7 +1817,7 @@ retry: * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(folio, mapping)) { + switch (pageout(folio, mapping, &plug)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: @@ -1968,6 +1971,8 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); + if (plug) + swap_write_unplug(plug); return nr_reclaimed; } -- cgit v1.2.3 From a1a0dfd56f97738c1974976309bbf38bb5a21132 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:49 -0700 Subject: mm: handle THP in swap_*page_fs() Pages passed to swap_readpage()/swap_writepage() are not necessarily all the same size - there may be transparent-huge-pages involves. The BIO paths of swap_*page() handle this correctly, but the SWP_FS_OPS path does not. So we need to use thp_size() to find the size, not just assume PAGE_SIZE, and we need to track the total length of the request, not just assume it is "page * PAGE_SIZE". 
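As an illustrative condensation of the hunks below (the helper name sio_add_page() is invented for the example), adding one page, which may be a transparent huge page, to a batched swap_iocb now accounts for its real size:

	/* Sketch only: append a page of whatever size to the current sio. */
	static void sio_add_page(struct swap_iocb *sio, struct page *page)
	{
		unsigned int len = thp_size(page);	/* PAGE_SIZE, or HPAGE_PMD_SIZE for a THP */

		sio->bvec[sio->pages].bv_page = page;
		sio->bvec[sio->pages].bv_len = len;
		sio->bvec[sio->pages].bv_offset = 0;
		sio->len += len;	/* total request length, passed to iov_iter_bvec() */
		sio->pages += 1;
	}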
Link: https://lkml.kernel.org/r/165119301488.15698.9457662928942765453.stgit@noble.brown Signed-off-by: NeilBrown Reported-by: Miaohe Lin Cc: Christoph Hellwig Cc: David Howells Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Signed-off-by: Andrew Morton --- mm/page_io.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index c132511f521c..d636a3531cad 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -239,6 +239,7 @@ struct swap_iocb { struct kiocb iocb; struct bio_vec bvec[SWAP_CLUSTER_MAX]; int pages; + int len; }; static mempool_t *sio_pool; @@ -261,7 +262,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) struct page *page = sio->bvec[0].bv_page; int p; - if (ret != PAGE_SIZE * sio->pages) { + if (ret != sio->len) { /* * In the case of swap-over-nfs, this can be a * temporary failure if the system has limited @@ -301,7 +302,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) sio = *wbc->swap_plug; if (sio) { if (sio->iocb.ki_filp != swap_file || - sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) { + sio->iocb.ki_pos + sio->len != pos) { swap_write_unplug(sio); sio = NULL; } @@ -312,10 +313,12 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) sio->iocb.ki_complete = sio_write_complete; sio->iocb.ki_pos = pos; sio->pages = 0; + sio->len = 0; } sio->bvec[sio->pages].bv_page = page; - sio->bvec[sio->pages].bv_len = PAGE_SIZE; + sio->bvec[sio->pages].bv_len = thp_size(page); sio->bvec[sio->pages].bv_offset = 0; + sio->len += thp_size(page); sio->pages += 1; if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) { swap_write_unplug(sio); @@ -371,8 +374,7 @@ void swap_write_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, - PAGE_SIZE * sio->pages); + iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); @@ -383,7 +385,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); int p; - if (ret == PAGE_SIZE * sio->pages) { + if (ret == sio->len) { for (p = 0; p < sio->pages; p++) { struct page *page = sio->bvec[p].bv_page; @@ -415,7 +417,7 @@ static void swap_readpage_fs(struct page *page, sio = *plug; if (sio) { if (sio->iocb.ki_filp != sis->swap_file || - sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) { + sio->iocb.ki_pos + sio->len != pos) { swap_read_unplug(sio); sio = NULL; } @@ -426,10 +428,12 @@ static void swap_readpage_fs(struct page *page, sio->iocb.ki_pos = pos; sio->iocb.ki_complete = sio_read_complete; sio->pages = 0; + sio->len = 0; } sio->bvec[sio->pages].bv_page = page; - sio->bvec[sio->pages].bv_len = PAGE_SIZE; + sio->bvec[sio->pages].bv_len = thp_size(page); sio->bvec[sio->pages].bv_offset = 0; + sio->len += thp_size(page); sio->pages += 1; if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) { swap_read_unplug(sio); @@ -521,8 +525,7 @@ void __swap_read_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, READ, sio->bvec, sio->pages, - PAGE_SIZE * sio->pages); + iov_iter_bvec(&from, READ, sio->bvec, sio->pages, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_read_complete(&sio->iocb, ret); -- 
cgit v1.2.3 From 6341a446a0e66355d729b663d7c8ca28ad6d1442 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:49 -0700 Subject: MM: handle THP in swap_*page_fs() - count_vm_events() We need to use count_swpout_vm_event() for sio_write_complete() to get correct counting. Note that THP swap-in (if it ever happens) is currently accounted as 1 for each page, whether HUGE or normal. This is different from swap-out accounting. This patch should be squashed into "MM: handle THP in swap_*page_fs()". Link: https://lkml.kernel.org/r/165146948934.24404.5909750610552745025@noble.neil.brown.name Signed-off-by: NeilBrown Reported-by: Miaohe Lin Reviewed-by: Miaohe Lin Cc: Geert Uytterhoeven Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Yang Shi Cc: Huang Ying Signed-off-by: Andrew Morton --- mm/page_io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index d636a3531cad..1b8075ef3418 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -280,8 +280,10 @@ static void sio_write_complete(struct kiocb *iocb, long ret) set_page_dirty(page); ClearPageReclaim(page); } - } else - count_vm_events(PSWPOUT, sio->pages); + } else { + for (p = 0; p < sio->pages; p++) + count_swpout_vm_event(sio->bvec[p].bv_page); + } for (p = 0; p < sio->pages; p++) end_page_writeback(sio->bvec[p].bv_page); -- cgit v1.2.3 From a2ad63daa88b9d6846976fd2a0b5e4f5cfc58377 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:49 -0700 Subject: VFS: add FMODE_CAN_ODIRECT file flag Currently various places test if direct IO is possible on a file by checking for the existence of the direct_IO address space operation. This is a poor choice, as the direct_IO operation may not be used - it is only used if the generic_file_*_iter functions are called for direct IO and some filesystems - particularly NFS - don't do this. Instead, introduce a new f_mode flag: FMODE_CAN_ODIRECT and change the various places to check this (avoiding pointer dereferences). do_dentry_open() will set this flag if ->direct_IO is present, so filesystems do not need to be changed. NFS *is* changed, to set the flag explicitly and discard the direct_IO entry in the address_space_operations for files. Other filesystems which currently use noop_direct_IO could usefully be changed to set this flag instead.
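For illustration, the checks collapse to a single flag test; the two fragments below are taken from the fs/open.c and fs/fcntl.c hunks that follow.

	/* do_dentry_open() sets the flag once, based on ->direct_IO ... */
	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
		f->f_mode |= FMODE_CAN_ODIRECT;

	/* ... and later O_DIRECT checks become a cheap mode test. */
	if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
		return -EINVAL;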
Link: https://lkml.kernel.org/r/164859778128.29473.15189737957277399416.stgit@noble.brown Reviewed-by: Christoph Hellwig Signed-off-by: NeilBrown Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- drivers/block/loop.c | 4 ++-- fs/fcntl.c | 9 ++++----- fs/nfs/file.c | 3 ++- fs/open.c | 9 ++++----- fs/overlayfs/file.c | 13 ++++--------- include/linux/fs.h | 3 +++ 6 files changed, 19 insertions(+), 22 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a58595f5ee2c..75ef2e177e1b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -186,8 +186,8 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) */ if (dio) { if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && - !(lo->lo_offset & dio_align) && - mapping->a_ops->direct_IO) + !(lo->lo_offset & dio_align) && + (file->f_mode & FMODE_CAN_ODIRECT)) use_dio = true; else use_dio = false; diff --git a/fs/fcntl.c b/fs/fcntl.c index f15d885b9796..34a3faa4886d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -56,11 +56,10 @@ static int setfl(int fd, struct file * filp, unsigned long arg) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ - if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) { - if (!filp->f_mapping || !filp->f_mapping->a_ops || - !filp->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if (!S_ISFIFO(inode->i_mode) && + (arg & O_DIRECT) && + !(filp->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0e7686a5d0de..bfb4b707b07e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -69,6 +69,8 @@ nfs_file_open(struct inode *inode, struct file *filp) return res; res = nfs_open(inode, filp); + if (res == 0) + filp->f_mode |= FMODE_CAN_ODIRECT; return res; } @@ -536,7 +538,6 @@ const struct address_space_operations nfs_file_aops = { .write_end = nfs_write_end, .invalidate_folio = nfs_invalidate_folio, .releasepage = nfs_release_page, - .direct_IO = nfs_direct_IO, #ifdef CONFIG_MIGRATION .migratepage = nfs_migrate_page, #endif diff --git a/fs/open.c b/fs/open.c index 1315253e0247..7b50d7a2f51d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -834,16 +834,15 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) + f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. 
Drop all page diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index fa125feed0ff..9d69b4dbb8c4 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -82,11 +82,8 @@ static int ovl_change_flags(struct file *file, unsigned int flags) if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; - if (flags & O_DIRECT) { - if (!file->f_mapping->a_ops || - !file->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; if (file->f_op->check_flags) { err = file->f_op->check_flags(flags); @@ -306,8 +303,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) ret = -EINVAL; if (iocb->ki_flags & IOCB_DIRECT && - (!real.file->f_mapping->a_ops || - !real.file->f_mapping->a_ops->direct_IO)) + !(real.file->f_mode & FMODE_CAN_ODIRECT)) goto out_fdput; old_cred = ovl_override_creds(file_inode(file)->i_sb); @@ -367,8 +363,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) ret = -EINVAL; if (iocb->ki_flags & IOCB_DIRECT && - (!real.file->f_mapping->a_ops || - !real.file->f_mapping->a_ops->direct_IO)) + !(real.file->f_mode & FMODE_CAN_ODIRECT)) goto out_fdput; if (!ovl_should_sync(OVL_FS(inode->i_sb))) diff --git a/include/linux/fs.h b/include/linux/fs.h index dbc5d10328df..b81cacc51d2f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -162,6 +162,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)0x200000) +/* File supports DIRECT IO */ +#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) -- cgit v1.2.3 From 4a18419f71cdf9155d2d2a6c79546f720978b990 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 9 May 2022 18:20:50 -0700 Subject: mm/mprotect: use mmu_gather Patch series "mm/mprotect: avoid unnecessary TLB flushes", v6. This patchset is intended to remove unnecessary TLB flushes during mprotect() syscalls. Once this patch-set make it through, similar and further optimizations for MADV_COLD and userfaultfd would be possible. Basically, there are 3 optimizations in this patch-set: 1. Use TLB batching infrastructure to batch flushes across VMAs and do better/fewer flushes. This would also be handy for later userfaultfd enhancements. 2. Avoid unnecessary TLB flushes. This optimization is the one that provides most of the performance benefits. Unlike previous versions, we now only avoid flushes that would not result in spurious page-faults. 3. Avoiding TLB flushes on change_huge_pmd() that are only needed to prevent the A/D bits from changing. Andrew asked for some benchmark numbers. I do not have an easy determinate macrobenchmark in which it is easy to show benefit. I therefore ran a microbenchmark: a loop that does the following on anonymous memory, just as a sanity check to see that time is saved by avoiding TLB flushes. The loop goes: mprotect(p, PAGE_SIZE, PROT_READ) mprotect(p, PAGE_SIZE, PROT_READ|PROT_WRITE) *p = 0; // make the page writable The test was run in KVM guest with 1 or 2 threads (the second thread was busy-looping). I measured the time (cycles) of each operation: 1 thread 2 threads mmots +patch mmots +patch PROT_READ 3494 2725 (-22%) 8630 7788 (-10%) PROT_READ|WRITE 3952 2724 (-31%) 9075 2865 (-68%) [ mmots = v5.17-rc6-mmots-2022-03-06-20-38 ] The exact numbers are really meaningless, but the benefit is clear. 
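A minimal userspace sketch of the measured loop is shown below; it is illustrative only and is not the exact harness used to produce the numbers above.

	#include <stdio.h>
	#include <sys/mman.h>
	#include <time.h>

	int main(void)
	{
		const long iters = 1000000;
		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		struct timespec t0, t1;
		long i;

		if (p == MAP_FAILED)
			return 1;
		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (i = 0; i < iters; i++) {
			mprotect(p, 4096, PROT_READ);
			mprotect(p, 4096, PROT_READ | PROT_WRITE);
			*p = 0;			/* make the page writable */
		}
		clock_gettime(CLOCK_MONOTONIC, &t1);
		printf("%.0f ns per iteration\n",
		       ((t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec)) / iters);
		return 0;
	}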
There are 2 interesting results though. (1) PROT_READ is cheaper, while one can expect it not to be affected. This is presumably due to TLB miss that is saved (2) Without memory access (*p = 0), the speedup of the patch is even greater. In that scenario mprotect(PROT_READ) also avoids the TLB flush. As a result both operations on the patched kernel take roughly ~1500 cycles (with either 1 or 2 threads), whereas on mmotm their cost is as high as presented in the table. This patch (of 3): change_pXX_range() currently does not use mmu_gather, but instead implements its own deferred TLB flushes scheme. This both complicates the code, as developers need to be aware of different invalidation schemes, and prevents opportunities to avoid TLB flushes or perform them in finer granularity. The use of mmu_gather for modified PTEs has benefits in various scenarios even if pages are not released. For instance, if only a single page needs to be flushed out of a range of many pages, only that page would be flushed. If a THP page is flushed, on x86 a single TLB invlpg instruction can be used instead of 512 instructions (or a full TLB flush, which would Linux would actually use by default). mprotect() over multiple VMAs requires a single flush. Use mmu_gather in change_pXX_range(). As the pages are not released, only record the flushed range using tlb_flush_pXX_range(). Handle THP similarly and get rid of flush_cache_range() which becomes redundant since tlb_start_vma() calls it when needed. Link: https://lkml.kernel.org/r/20220401180821.1986781-1-namit@vmware.com Link: https://lkml.kernel.org/r/20220401180821.1986781-2-namit@vmware.com Signed-off-by: Nadav Amit Acked-by: Peter Zijlstra (Intel) Cc: Andrea Arcangeli Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Dave Hansen Cc: Peter Xu Cc: Thomas Gleixner Cc: Will Deacon Cc: Yu Zhao Cc: Nick Piggin Signed-off-by: Andrew Morton --- fs/exec.c | 6 +++- include/linux/huge_mm.h | 5 +-- include/linux/mm.h | 5 +-- mm/huge_memory.c | 10 ++++-- mm/mempolicy.c | 9 ++++- mm/mprotect.c | 92 +++++++++++++++++++++++++++---------------------- mm/userfaultfd.c | 6 +++- 7 files changed, 82 insertions(+), 51 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index e3e55d5e0be1..14b4b3755580 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -758,6 +758,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; + struct mmu_gather tlb; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -812,8 +813,11 @@ int setup_arg_pages(struct linux_binprm *bprm, vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; - ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + tlb_gather_mmu(&tlb, mm); + ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); + tlb_finish_mmu(&tlb); + if (ret) goto out_unlock; BUG_ON(prev != vma); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2999190adc22..9a26bd10e083 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -36,8 +36,9 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); -int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, - pgprot_t newprot, unsigned long cp_flags); +int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, pgprot_t newprot, + unsigned long 
cp_flags); vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); diff --git a/include/linux/mm.h b/include/linux/mm.h index 5dc5005ccc9b..d63ba0e0e068 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1967,10 +1967,11 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) -extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, +extern unsigned long change_protection(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, unsigned long cp_flags); -extern int mprotect_fixup(struct vm_area_struct *vma, +extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86eae6834db7..8db17c042aed 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1709,8 +1709,9 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, * or if prot_numa but THP migration is not supported * - HPAGE_PMD_NR if protections changed and TLB flush necessary */ -int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, unsigned long cp_flags) +int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; @@ -1721,6 +1722,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + if (prot_numa && !thp_migration_supported()) return 1; @@ -1819,6 +1822,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); + + tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); + BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); unlock: spin_unlock(ptl); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0288ffaea064..3934476fb708 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -104,6 +104,7 @@ #include #include +#include #include #include "internal.h" @@ -630,12 +631,18 @@ unlock: unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { + struct mmu_gather tlb; int nr_updated; - nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA); + tlb_gather_mmu(&tlb, vma->vm_mm); + + nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE, + MM_CP_PROT_NUMA); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + tlb_finish_mmu(&tlb); + return nr_updated; } #else diff --git a/mm/mprotect.c b/mm/mprotect.c index 56060acdabd3..420be0201118 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -33,12 +33,13 @@ #include #include #include +#include #include "internal.h" -static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t newprot, - unsigned long cp_flags) +static unsigned long change_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pte_t *pte, oldpte; spinlock_t *ptl; @@ -49,6 +50,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, bool 
uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + tlb_change_page_size(tlb, PAGE_SIZE); + /* * Can be called with only the mmap_lock for reading by * prot_numa so we must check the pmd isn't constantly @@ -149,6 +152,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptent = pte_mkwrite(ptent); } ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); + tlb_flush_pte_range(tlb, addr, PAGE_SIZE); pages++; } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -234,9 +238,9 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) return 0; } -static inline unsigned long change_pmd_range(struct vm_area_struct *vma, - pud_t *pud, unsigned long addr, unsigned long end, - pgprot_t newprot, unsigned long cp_flags) +static inline unsigned long change_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pmd_t *pmd; unsigned long next; @@ -276,8 +280,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (next - addr != HPAGE_PMD_SIZE) { __split_huge_pmd(vma, pmd, addr, false, NULL); } else { - int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot, cp_flags); + /* + * change_huge_pmd() does not defer TLB flushes, + * so no need to propagate the tlb argument. + */ + int nr_ptes = change_huge_pmd(tlb, vma, pmd, + addr, newprot, cp_flags); if (nr_ptes) { if (nr_ptes == HPAGE_PMD_NR) { @@ -291,8 +299,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, } /* fall through, the trans huge pmd just split */ } - this_pages = change_pte_range(vma, pmd, addr, next, newprot, - cp_flags); + this_pages = change_pte_range(tlb, vma, pmd, addr, next, + newprot, cp_flags); pages += this_pages; next: cond_resched(); @@ -306,9 +314,9 @@ next: return pages; } -static inline unsigned long change_pud_range(struct vm_area_struct *vma, - p4d_t *p4d, unsigned long addr, unsigned long end, - pgprot_t newprot, unsigned long cp_flags) +static inline unsigned long change_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pud_t *pud; unsigned long next; @@ -319,16 +327,16 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - pages += change_pmd_range(vma, pud, addr, next, newprot, + pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, cp_flags); } while (pud++, addr = next, addr != end); return pages; } -static inline unsigned long change_p4d_range(struct vm_area_struct *vma, - pgd_t *pgd, unsigned long addr, unsigned long end, - pgprot_t newprot, unsigned long cp_flags) +static inline unsigned long change_p4d_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) { p4d_t *p4d; unsigned long next; @@ -339,44 +347,40 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma, next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; - pages += change_pud_range(vma, p4d, addr, next, newprot, + pages += change_pud_range(tlb, vma, p4d, addr, next, newprot, cp_flags); } while (p4d++, addr = next, addr != end); return pages; } -static unsigned long change_protection_range(struct vm_area_struct *vma, - unsigned long 
addr, unsigned long end, pgprot_t newprot, - unsigned long cp_flags) +static unsigned long change_protection_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; - unsigned long start = addr; unsigned long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); - flush_cache_range(vma, addr, end); - inc_tlb_flush_pending(mm); + tlb_start_vma(tlb, vma); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - pages += change_p4d_range(vma, pgd, addr, next, newprot, + pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot, cp_flags); } while (pgd++, addr = next, addr != end); - /* Only flush the TLB if we actually modified any entries: */ - if (pages) - flush_tlb_range(vma, start, end); - dec_tlb_flush_pending(mm); + tlb_end_vma(tlb, vma); return pages; } -unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, +unsigned long change_protection(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { @@ -387,7 +391,7 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else - pages = change_protection_range(vma, start, end, newprot, + pages = change_protection_range(tlb, vma, start, end, newprot, cp_flags); return pages; @@ -421,8 +425,9 @@ static const struct mm_walk_ops prot_none_walk_ops = { }; int -mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, - unsigned long start, unsigned long end, unsigned long newflags) +mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, + struct vm_area_struct **pprev, unsigned long start, + unsigned long end, unsigned long newflags) { struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; @@ -509,7 +514,7 @@ success: dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); vma_set_page_prot(vma); - change_protection(vma, start, end, vma->vm_page_prot, + change_protection(tlb, vma, start, end, vma->vm_page_prot, dirty_accountable ? 
MM_CP_DIRTY_ACCT : 0); /* @@ -543,6 +548,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); + struct mmu_gather tlb; start = untagged_addr(start); @@ -602,6 +608,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, else prev = vma->vm_prev; + tlb_gather_mmu(&tlb, current->mm); for (nstart = start ; ; ) { unsigned long mask_off_old_flags; unsigned long newflags; @@ -628,18 +635,18 @@ static int do_mprotect_pkey(unsigned long start, size_t len, /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) { error = -EACCES; - goto out; + break; } /* Allow architectures to sanity-check the new flags */ if (!arch_validate_flags(newflags)) { error = -EINVAL; - goto out; + break; } error = security_file_mprotect(vma, reqprot, prot); if (error) - goto out; + break; tmp = vma->vm_end; if (tmp > end) @@ -648,27 +655,28 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (vma->vm_ops && vma->vm_ops->mprotect) { error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags); if (error) - goto out; + break; } - error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); + error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags); if (error) - goto out; + break; nstart = tmp; if (nstart < prev->vm_end) nstart = prev->vm_end; if (nstart >= end) - goto out; + break; vma = prev->vm_next; if (!vma || vma->vm_start != nstart) { error = -ENOMEM; - goto out; + break; } prot = reqprot; } + tlb_finish_mmu(&tlb); out: mmap_write_unlock(current->mm); return error; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index dae25d985d15..9cdaed939203 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" static __always_inline @@ -687,6 +688,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, atomic_t *mmap_changing) { struct vm_area_struct *dst_vma; + struct mmu_gather tlb; pgprot_t newprot; int err; @@ -728,8 +730,10 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, else newprot = vm_get_page_prot(dst_vma->vm_flags); - change_protection(dst_vma, start, start + len, newprot, + tlb_gather_mmu(&tlb, dst_mm); + change_protection(&tlb, dst_vma, start, start + len, newprot, enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); + tlb_finish_mmu(&tlb); err = 0; out_unlock: -- cgit v1.2.3 From c9fe66560bf2dc7d109754414e309888cb8c9ba9 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 9 May 2022 18:20:50 -0700 Subject: mm/mprotect: do not flush when not required architecturally Currently, using mprotect() to unprotect a memory region or uffd to unprotect a memory region causes a TLB flush. However, in such cases the PTE is often not modified (i.e., remain RO) and therefore not TLB flush is needed. Add an arch-specific pte_needs_flush() which tells whether a TLB flush is needed based on the old PTE and the new one. Implement an x86 pte_needs_flush(). Always flush the TLB when it is architecturally needed even when skipping a TLB flush might only result in a spurious page-faults by skipping the flush. Even with such conservative manner, we can in the future further refine the checks to test whether a PTE is present by only considering the architectural _PAGE_PRESENT flag instead of {pte|pmd}_preesnt(). For not be careful and use the latter. 
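For illustration, the effect on change_pte_range() (see the mm/mprotect.c hunk at the end of this patch) is that a flush is queued only when the architecture reports that the old-to-new PTE transition requires one:

	ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
	if (pte_needs_flush(oldpte, ptent))
		tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
	pages++;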
Link: https://lkml.kernel.org/r/20220401180821.1986781-3-namit@vmware.com Signed-off-by: Nadav Amit Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: Yu Zhao Cc: Nick Piggin Cc: Andrew Cooper Cc: Peter Xu Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable_types.h | 2 + arch/x86/include/asm/tlbflush.h | 97 ++++++++++++++++++++++++++++++++++++ include/asm-generic/tlb.h | 14 ++++++ mm/huge_memory.c | 9 ++-- mm/mprotect.c | 3 +- 5 files changed, 120 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 40497a9020c6..8668bc661026 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -110,9 +110,11 @@ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) +#define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4) #else #define _PAGE_NX (_AT(pteval_t, 0)) #define _PAGE_DEVMAP (_AT(pteval_t, 0)) +#define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 98fa0a114074..4af5579c7ef7 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -259,6 +259,103 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); +static inline bool pte_flags_need_flush(unsigned long oldflags, + unsigned long newflags, + bool ignore_access) +{ + /* + * Flags that require a flush when cleared but not when they are set. + * Only include flags that would not trigger spurious page-faults. + * Non-present entries are not cached. Hardware would set the + * dirty/access bit if needed without a fault. + */ + const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT | + _PAGE_ACCESSED; + const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 | + _PAGE_SOFTW3 | _PAGE_SOFTW4; + const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT | + _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT | + _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 | + _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX; + unsigned long diff = oldflags ^ newflags; + + BUILD_BUG_ON(flush_on_clear & software_flags); + BUILD_BUG_ON(flush_on_clear & flush_on_change); + BUILD_BUG_ON(flush_on_change & software_flags); + + /* Ignore software flags */ + diff &= ~software_flags; + + if (ignore_access) + diff &= ~_PAGE_ACCESSED; + + /* + * Did any of the 'flush_on_clear' flags was clleared set from between + * 'oldflags' and 'newflags'? + */ + if (diff & oldflags & flush_on_clear) + return true; + + /* Flush on modified flags. */ + if (diff & flush_on_change) + return true; + + /* Ensure there are no flags that were left behind */ + if (IS_ENABLED(CONFIG_DEBUG_VM) && + (diff & ~(flush_on_clear | software_flags | flush_on_change))) { + VM_WARN_ON_ONCE(1); + return true; + } + + return false; +} + +/* + * pte_needs_flush() checks whether permissions were demoted and require a + * flush. It should only be used for userspace PTEs. 
+ */ +static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) +{ + /* !PRESENT -> * ; no need for flush */ + if (!(pte_flags(oldpte) & _PAGE_PRESENT)) + return false; + + /* PFN changed ; needs flush */ + if (pte_pfn(oldpte) != pte_pfn(newpte)) + return true; + + /* + * check PTE flags; ignore access-bit; see comment in + * ptep_clear_flush_young(). + */ + return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte), + true); +} +#define pte_needs_flush pte_needs_flush + +/* + * huge_pmd_needs_flush() checks whether permissions were demoted and require a + * flush. It should only be used for userspace huge PMDs. + */ +static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) +{ + /* !PRESENT -> * ; no need for flush */ + if (!(pmd_flags(oldpmd) & _PAGE_PRESENT)) + return false; + + /* PFN changed ; needs flush */ + if (pmd_pfn(oldpmd) != pmd_pfn(newpmd)) + return true; + + /* + * check PMD flags; do not ignore access-bit; see + * pmdp_clear_flush_young(). + */ + return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd), + false); +} +#define huge_pmd_needs_flush huge_pmd_needs_flush + #endif /* !MODULE */ static inline void __native_tlb_flush_global(unsigned long cr4) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index eee6f7763a39..ff3e82553a76 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -658,6 +658,20 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, } while (0) #endif +#ifndef pte_needs_flush +static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) +{ + return true; +} +#endif + +#ifndef huge_pmd_needs_flush +static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) +{ + return true; +} +#endif + #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8db17c042aed..2befa9cfb46e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1715,7 +1715,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - pmd_t entry; + pmd_t oldpmd, entry; bool preserve_write; int ret; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; @@ -1804,9 +1804,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * pmdp_invalidate() is required to make sure we don't miss * dirty/young flags set by hardware. 
*/ - entry = pmdp_invalidate(vma, addr, pmd); + oldpmd = pmdp_invalidate(vma, addr, pmd); - entry = pmd_modify(entry, newprot); + entry = pmd_modify(oldpmd, newprot); if (preserve_write) entry = pmd_mk_savedwrite(entry); if (uffd_wp) { @@ -1823,7 +1823,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); - tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); + if (huge_pmd_needs_flush(oldpmd, entry)) + tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); unlock: diff --git a/mm/mprotect.c b/mm/mprotect.c index 420be0201118..20a46f21cca8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -152,7 +152,8 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, ptent = pte_mkwrite(ptent); } ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); - tlb_flush_pte_range(tlb, addr, PAGE_SIZE); + if (pte_needs_flush(oldpte, ptent)) + tlb_flush_pte_range(tlb, addr, PAGE_SIZE); pages++; } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); -- cgit v1.2.3 From 4f83145721f362c2f4d312edc4755269a2069488 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 9 May 2022 18:20:50 -0700 Subject: mm: avoid unnecessary flush on change_huge_pmd() Calls to change_protection_range() on THP can trigger, at least on x86, two TLB flushes for one page: one immediately, when pmdp_invalidate() is called by change_huge_pmd(), and then another one later (that can be batched) when change_protection_range() finishes. The first TLB flush is only necessary to prevent the dirty bit (and with a lesser importance the access bit) from changing while the PTE is modified. However, this is not necessary as the x86 CPUs set the dirty-bit atomically with an additional check that the PTE is (still) present. One caveat is Intel's Knights Landing that has a bug and does not do so. Leverage this behavior to eliminate the unnecessary TLB flush in change_huge_pmd(). Introduce a new arch specific pmdp_invalidate_ad() that only invalidates the access and dirty bit from further changes. Link: https://lkml.kernel.org/r/20220401180821.1986781-4-namit@vmware.com Signed-off-by: Nadav Amit Cc: Andrea Arcangeli Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Dave Hansen Cc: Peter Xu Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: Yu Zhao Cc: Nick Piggin Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 5 +++++ arch/x86/mm/pgtable.c | 10 ++++++++++ include/linux/pgtable.h | 20 ++++++++++++++++++++ mm/huge_memory.c | 4 ++-- mm/pgtable-generic.c | 8 ++++++++ 5 files changed, 45 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1e09519af143..0821f87d495f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1168,6 +1168,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } } #endif + +#define __HAVE_ARCH_PMDP_INVALIDATE_AD +extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3481b35cb4ec..f16059e9a85e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -608,6 +608,16 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, return young; } + +pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + /* + * No flush is necessary. Once an invalid PTE is established, the PTE's + * access and dirty bits cannot be updated. + */ + return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); +} #endif /** diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 53750224e176..530b1817b58c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -570,6 +570,26 @@ extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif +#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD + +/* + * pmdp_invalidate_ad() invalidates the PMD while changing a transparent + * hugepage mapping in the page tables. This function is similar to + * pmdp_invalidate(), but should only be used if the access and dirty bits would + * not be cleared by the software in the new PMD value. The function ensures + * that hardware changes of the access and dirty bits updates would not be lost. + * + * Doing so can allow in certain architectures to avoid a TLB flush in most + * cases. Yet, another TLB flush might be necessary later if the PMD update + * itself requires such flush (e.g., if protection was set to be stricter). Yet, + * even when a TLB flush is needed because of the update, the caller may be able + * to batch these TLB flushing operations, so fewer TLB flush operations are + * needed. + */ +extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2befa9cfb46e..6f37f77eb48c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1801,10 +1801,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * The race makes MADV_DONTNEED miss the huge pmd and don't clear it * which may break userspace. * - * pmdp_invalidate() is required to make sure we don't miss + * pmdp_invalidate_ad() is required to make sure we don't miss * dirty/young flags set by hardware. */ - oldpmd = pmdp_invalidate(vma, addr, pmd); + oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); if (preserve_write) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 6523fda274e5..90ab721a12a8 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -201,6 +201,14 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, } #endif +#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD +pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + return pmdp_invalidate(vma, address, pmdp); +} +#endif + #ifndef pmdp_collapse_flush pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) -- cgit v1.2.3 From 4b25f030ae69ba710eff587cabb4c57cb7e7a8a1 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Mon, 9 May 2022 18:20:50 -0700 Subject: hugetlbfs: fix hugetlbfs_statfs() locking After commit db71ef79b59b ("hugetlb: make free_huge_page irq safe"), the subpool lock should be locked with spin_lock_irq() and all call sites was modified as such, except for the ones in hugetlbfs_statfs(). 
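For illustration, the fixed section (matching the hunk below) takes the subpool lock with interrupts disabled, since free_huge_page() may now take the same lock from IRQ context:

	/* Excerpt from hugetlbfs_statfs(); sbinfo, buf and free_pages come from the surrounding function. */
	spin_lock_irq(&sbinfo->spool->lock);
	buf->f_blocks = sbinfo->spool->max_hpages;
	free_pages = sbinfo->spool->max_hpages - sbinfo->spool->used_hpages;
	buf->f_bavail = buf->f_bfree = free_pages;
	spin_unlock_irq(&sbinfo->spool->lock);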
Link: https://lkml.kernel.org/r/20220429202207.3045-1-almasrymina@google.com Fixes: db71ef79b59b ("hugetlb: make free_huge_page irq safe") Signed-off-by: Mina Almasry Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dd3a088db11d..591599829e2a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1048,12 +1048,12 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) if (sbinfo->spool) { long free_pages; - spin_lock(&sbinfo->spool->lock); + spin_lock_irq(&sbinfo->spool->lock); buf->f_blocks = sbinfo->spool->max_hpages; free_pages = sbinfo->spool->max_hpages - sbinfo->spool->used_hpages; buf->f_bavail = buf->f_bfree = free_pages; - spin_unlock(&sbinfo->spool->lock); + spin_unlock_irq(&sbinfo->spool->lock); buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } -- cgit v1.2.3 From 3c81b3bb0a33e2b555edb8d7eb99a7ae4f17d8bb Mon Sep 17 00:00:00 2001 From: huangshaobo Date: Mon, 9 May 2022 18:20:51 -0700 Subject: kfence: enable check kfence canary on panic via boot param Out-of-bounds accesses that aren't caught by a guard page will result in corruption of canary memory. In pathological cases, where an object has certain alignment requirements, an out-of-bounds access might never be caught by the guard page. Such corruptions, however, are only detected on kfree() normally. If the bug causes the kernel to panic before kfree(), KFENCE has no opportunity to report the issue. Such corruptions may also indicate failing memory or other faults. To provide some more information in such cases, add the option to check canary bytes on panic. This might help narrow the search for the panic cause; but, due to only having the allocation stack trace, such reports are difficult to use to diagnose an issue alone. In most cases, such reports are inactionable, and is therefore an opt-in feature (disabled by default). [akpm@linux-foundation.org: add __read_mostly, per Marco] Link: https://lkml.kernel.org/r/20220425022456.44300-1-huangshaobo6@huawei.com Signed-off-by: huangshaobo Suggested-by: chenzefeng Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Xiaoming Ni Cc: Wangbing Cc: Jubin Zhong Signed-off-by: Andrew Morton --- mm/kfence/core.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 9b2b5f56f4ae..6e69986c3f0d 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include @@ -99,6 +101,10 @@ module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644) static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE); module_param_named(deferrable, kfence_deferrable, bool, 0444); +/* If true, check all canary bytes on panic. */ +static bool kfence_check_on_panic __read_mostly; +module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444); + /* The pool of pages used for guard pages and objects. */ char *__kfence_pool __read_mostly; EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. 
*/ @@ -727,6 +733,31 @@ static int __init kfence_debugfs_init(void) late_initcall(kfence_debugfs_init); +/* === Panic Notifier ====================================================== */ + +static void kfence_check_all_canary(void) +{ + int i; + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + struct kfence_metadata *meta = &kfence_metadata[i]; + + if (meta->state == KFENCE_OBJECT_ALLOCATED) + for_each_canary(meta, check_canary_byte); + } +} + +static int kfence_check_canary_callback(struct notifier_block *nb, + unsigned long reason, void *arg) +{ + kfence_check_all_canary(); + return NOTIFY_OK; +} + +static struct notifier_block kfence_check_canary_notifier = { + .notifier_call = kfence_check_canary_callback, +}; + /* === Allocation Gate Timer ================================================ */ static struct delayed_work kfence_timer; @@ -804,6 +835,9 @@ static void kfence_init_enable(void) else INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate); + if (kfence_check_on_panic) + atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier); + WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); -- cgit v1.2.3 From f38adfef7e6bfe681d6602b6668be31fe1310ed0 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 9 May 2022 18:20:51 -0700 Subject: mm/highmem: VM_BUG_ON() if offset + len > PAGE_SIZE Add VM_BUG_ON() bounds checking to make sure that, if "offset + len > PAGE_SIZE", memset() does not corrupt data in adjacent pages. Mainly to match all the similar functions in highmem.h. Link: https://lkml.kernel.org/r/20220426193020.8710-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Andrew Morton Cc: Ira Weiny Cc: Catalin Marinas Cc: "Matthew Wilcox (Oracle)" Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/highmem.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 39bb9b47fa9c..67720bfd6ade 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -358,6 +358,8 @@ static inline void memcpy_to_page(struct page *page, size_t offset, static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); + + VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); -- cgit v1.2.3 From 152e56178ad72037bc6a2f08d4a608543bc67cdf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:51 -0700 Subject: mm/damon/core: add a function for damon_operations registration checks Patch series "mm/damon: allow users know which monitoring ops are available". DAMON users can configure it for various address spaces including virtual address spaces and the physical address space by setting its monitoring operations set with an appropriate one for their purpose. However, there is no clean and simple way to know exactly which monitoring operations sets are available on the currently running kernel. This patchset adds functions for the purpose on DAMON's kernel API ('damon_is_registered_ops()') and sysfs interface ('avail_operations' file under each context directory). This patch (of 4): To know if a specific 'damon_operations' is registered, users need to check the kernel config or try 'damon_select_ops()' with the ops in question, and then see if it succeeds. In the latter case, the user should also revert the change.
To make the process simple and convenient, this commit adds a function for checking if a specific 'damon_operations' is registered or not. Link: https://lkml.kernel.org/r/20220426203843.45238-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220426203843.45238-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 24 +++++++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index f23cbfa4248d..73ff0e2d2a4d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -509,6 +509,7 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); +bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); diff --git a/mm/damon/core.c b/mm/damon/core.c index 5fe42e47c57b..997cf7b17779 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -30,7 +30,7 @@ static DEFINE_MUTEX(damon_ops_lock); static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; /* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ -static bool damon_registered_ops_id(enum damon_ops_id id) +static bool __damon_is_registered_ops(enum damon_ops_id id) { struct damon_operations empty_ops = {}; @@ -39,6 +39,24 @@ static bool damon_registered_ops_id(enum damon_ops_id id) return true; } +/** + * damon_is_registered_ops() - Check if a given damon_operations is registered. + * @id: Id of the damon_operations to check if registered. + * + * Return: true if the ops is set, false otherwise. + */ +bool damon_is_registered_ops(enum damon_ops_id id) +{ + bool registered; + + if (id >= NR_DAMON_OPS) + return false; + mutex_lock(&damon_ops_lock); + registered = __damon_is_registered_ops(id); + mutex_unlock(&damon_ops_lock); + return registered; +} + /** * damon_register_ops() - Register a monitoring operations set to DAMON. * @ops: monitoring operations set to register. @@ -56,7 +74,7 @@ int damon_register_ops(struct damon_operations *ops) return -EINVAL; mutex_lock(&damon_ops_lock); /* Fail for already registered ops */ - if (damon_registered_ops_id(ops->id)) { + if (__damon_is_registered_ops(ops->id)) { err = -EINVAL; goto out; } @@ -84,7 +102,7 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) return -EINVAL; mutex_lock(&damon_ops_lock); - if (!damon_registered_ops_id(id)) + if (!__damon_is_registered_ops(id)) err = -EINVAL; else ctx->ops = damon_registered_ops[id]; -- cgit v1.2.3 From 0f2cb5885771b31fd1c82a1293787d1e378eaab5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:51 -0700 Subject: mm/damon/sysfs: add a file for listing available monitoring ops DAMON programming interface users can know if specific monitoring ops set is registered or not using 'damon_is_registered_ops()', but there is no such method for the user space. To help the case, this commit adds a new DAMON sysfs file called 'avail_operations' under each context directory for listing available monitoring ops. Reading the file will list each registered monitoring ops on each line. 
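As a rough illustration of the damon_is_registered_ops() kernel API introduced in the previous patch (not part of the patch itself), a DAMON API user can now probe for an operations set before selecting it, instead of trying damon_select_ops() and reverting on failure. The sketch below uses only the two functions whose signatures appear in the hunks above; the helper name is hypothetical and 'ctx' is assumed to be an already allocated DAMON context.

  #include <linux/damon.h>
  #include <linux/errno.h>

  /* Hypothetical helper: use the physical address space operations set
   * only if it was built into the running kernel. */
  static int example_pick_paddr_ops(struct damon_ctx *ctx)
  {
          if (!damon_is_registered_ops(DAMON_OPS_PADDR))
                  return -ENODEV;
          return damon_select_ops(ctx, DAMON_OPS_PADDR);
  }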
Link: https://lkml.kernel.org/r/20220426203843.45238-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 48e434cd43d8..6ad6364780b8 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1810,6 +1810,21 @@ static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context) kobject_put(&context->schemes->kobj); } +static ssize_t avail_operations_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + enum damon_ops_id id; + int len = 0; + + for (id = 0; id < NR_DAMON_OPS; id++) { + if (!damon_is_registered_ops(id)) + continue; + len += sysfs_emit_at(buf, len, "%s\n", + damon_sysfs_ops_strs[id]); + } + return len; +} + static ssize_t operations_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1840,10 +1855,14 @@ static void damon_sysfs_context_release(struct kobject *kobj) kfree(container_of(kobj, struct damon_sysfs_context, kobj)); } +static struct kobj_attribute damon_sysfs_context_avail_operations_attr = + __ATTR_RO_MODE(avail_operations, 0400); + static struct kobj_attribute damon_sysfs_context_operations_attr = __ATTR_RW_MODE(operations, 0600); static struct attribute *damon_sysfs_context_attrs[] = { + &damon_sysfs_context_avail_operations_attr.attr, &damon_sysfs_context_operations_attr.attr, NULL, }; -- cgit v1.2.3 From f893abbd6997f9a95815acfb84aa865f0c996373 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:51 -0700 Subject: selftets/damon/sysfs: test existence and permission of avail_operations This commit adds a selftest test case for ensuring the existence and the permission (read-only) of the 'avail_oprations' DAMON sysfs file. Link: https://lkml.kernel.org/r/20220426203843.45238-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 2e3ae77cb6db..89592c64462f 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -231,6 +231,7 @@ test_context() { context_dir=$1 ensure_dir "$context_dir" "exist" + ensure_file "$context_dir/avail_operations" "exit" 400 ensure_file "$context_dir/operations" "exist" 600 test_monitoring_attrs "$context_dir/monitoring_attrs" test_targets "$context_dir/targets" -- cgit v1.2.3 From 2fe60ec99ba1c2615804ebf52e4827aee5dd6316 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:51 -0700 Subject: Docs/{ABI,admin-guide}/damon: document 'avail_operations' sysfs file This commit updates the DAMON ABI and usage documents for the new sysfs file, 'avail_operations'. 
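As a hypothetical user-space sketch of the file being documented here (not part of the series), a program can simply read a context's 'avail_operations' file and act on the listed keywords. The path follows the ABI entry added below and hard-codes kdamond 0 and context 0, which are assumed to have been created already through the nr_kdamonds and nr_contexts files.

  #include <stdio.h>
  #include <stdlib.h>

  int main(void)
  {
          const char *path =
                  "/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/avail_operations";
          char line[32];
          FILE *f = fopen(path, "r");

          if (!f) {
                  perror("fopen");
                  return EXIT_FAILURE;
          }
          /* Each line is one registered operations set, e.g. "vaddr". */
          while (fgets(line, sizeof(line), f))
                  printf("available: %s", line);
          fclose(f);
          return EXIT_SUCCESS;
  }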
Link: https://lkml.kernel.org/r/20220426203843.45238-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 10 +++++++++- Documentation/admin-guide/mm/damon/usage.rst | 18 ++++++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 9e282065cbcf..d724b8a12228 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -40,6 +40,12 @@ Description: Writing a number 'N' to this file creates the number of directories for controlling each DAMON context named '0' to 'N-1' under the contexts/ directory. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//avail_operations +Date: Apr 2022 +Contact: SeongJae Park +Description: Reading this file returns the available monitoring operations + sets on the currently running kernel. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//operations Date: Mar 2022 Contact: SeongJae Park @@ -47,7 +53,9 @@ Description: Writing a keyword for a monitoring operations set ('vaddr' for virtual address spaces monitoring, and 'paddr' for the physical address space monitoring) to this file makes the context to use the operations set. Reading the file returns the keyword for - the operations set the context is set to use. + the operations set the context is set to use. Note that only + the operations sets that listed in 'avail_operations' file are + valid inputs. What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/sample_us Date: Mar 2022 diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 592ea9a50881..af6ffaea567b 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -68,7 +68,7 @@ comma (","). :: │ kdamonds/nr_kdamonds │ │ 0/state,pid │ │ │ contexts/nr_contexts - │ │ │ │ 0/operations + │ │ │ │ 0/avail_operations,operations │ │ │ │ │ monitoring_attrs/ │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max @@ -143,17 +143,23 @@ be written to the file. contexts// ------------- -In each context directory, one file (``operations``) and three directories -(``monitoring_attrs``, ``targets``, and ``schemes``) exist. +In each context directory, two files (``avail_operations`` and ``operations``) +and three directories (``monitoring_attrs``, ``targets``, and ``schemes``) +exist. DAMON supports multiple types of monitoring operations, including those for -virtual address space and the physical address space. You can set and get what -type of monitoring operations DAMON will use for the context by writing one of -below keywords to, and reading from the file. +virtual address space and the physical address space. You can get the list of +available monitoring operations set on the currently running kernel by reading +``avail_operations`` file. Based on the kernel configuration, the file will +list some or all of below keywords. - vaddr: Monitor virtual address spaces of specific processes - paddr: Monitor the physical address space of the system +You can set and get what type of monitoring operations DAMON will use for the +context by writing one of the keywords listed in ``avail_operations`` file and +reading from the ``operations`` file. 
+ contexts//monitoring_attrs/ ------------------------------ -- cgit v1.2.3 From de6d01542a5c4f6fe6f0c65c14694b760f896acc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:52 -0700 Subject: mm/damon/vaddr: register a damon_operations for fixed virtual address ranges monitoring Patch series "support fixed virtual address ranges monitoring". The monitoring operations set for virtual address spaces automatically updates the monitoring target regions to cover entire mappings of the virtual address spaces as much as possible. Some users could have more information about their programs than kernel and therefore have interest in not entire regions but only specific regions. For such cases, the automatic monitoring target regions updates are only unnecessary overhead or distractions. This patchset adds supports for the use case on DAMON's kernel API (DAMON_OPS_FVADDR) and sysfs interface ('fvaddr' keyword for 'operations' sysfs file). This patch (of 3): The monitoring operations set for virtual address spaces automatically updates the monitoring target regions to cover entire mappings of the virtual address spaces as much as possible. Some users could have more information about their programs than kernel and therefore have interest in not entire regions but only specific regions. For such cases, the automatic monitoring target regions updates are only unnecessary overheads or distractions. For such cases, DAMON's API users can simply set the '->init()' and '->update()' of the DAMON context's '->ops' NULL, and set the target monitoring regions when creating the context. But, that would be a dirty hack. Worse yet, the hack is unavailable for DAMON user space interface users. To support the use case in a clean way that can easily exported to the user space, this commit adds another monitoring operations set called 'fvaddr', which is same to 'vaddr' but does not automatically update the monitoring regions. Instead, it will only respect the virtual address regions which have explicitly passed at the initial context creation. Note that this commit leave sysfs interface not supporting the feature yet. The support will be made in a following commit. 
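A rough sketch of how a kernel API user might adopt the new 'fvaddr' operations set (illustrative only, not part of the patch): select the set, then install the fixed ranges by hand, since ->init() and ->update() are NULL and DAMON will never adjust the regions on its own. The function name is hypothetical, 'ctx' and 't' are assumed to be set up elsewhere, and damon_new_region()/damon_add_region() are assumed to be the existing region helpers declared in damon.h.

  #include <linux/damon.h>
  #include <linux/errno.h>

  static int example_monitor_fixed_range(struct damon_ctx *ctx,
                  struct damon_target *t,
                  unsigned long start, unsigned long end)
  {
          struct damon_region *r;
          int err;

          err = damon_select_ops(ctx, DAMON_OPS_FVADDR);
          if (err)
                  return err;

          /* This region stays exactly as installed for the whole run. */
          r = damon_new_region(start, end);
          if (!r)
                  return -ENOMEM;
          damon_add_region(r, t);
          return 0;
  }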
Link: https://lkml.kernel.org/r/20220426231750.48822-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220426231750.48822-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ mm/damon/sysfs.c | 4 ++++ mm/damon/vaddr.c | 15 +++++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 73ff0e2d2a4d..09a5d0d02c00 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -261,10 +261,13 @@ struct damos { * enum damon_ops_id - Identifier for each monitoring operations implementation * * @DAMON_OPS_VADDR: Monitoring operations for virtual address spaces + * @DAMON_OPS_FVADDR: Monitoring operations for only fixed ranges of virtual + * address spaces * @DAMON_OPS_PADDR: Monitoring operations for the physical address space */ enum damon_ops_id { DAMON_OPS_VADDR, + DAMON_OPS_FVADDR, DAMON_OPS_PADDR, NR_DAMON_OPS, }; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6ad6364780b8..719a286d378f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1694,6 +1694,7 @@ static struct kobj_type damon_sysfs_attrs_ktype = { /* This should match with enum damon_ops_id */ static const char * const damon_sysfs_ops_strs[] = { "vaddr", + "unsupported", /* fvaddr is not supported by sysfs yet */ "paddr", }; @@ -1843,6 +1844,9 @@ static ssize_t operations_store(struct kobject *kobj, for (id = 0; id < NR_DAMON_OPS; id++) { if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { + /* fvaddr is not supported by sysfs yet */ + if (id == DAMON_OPS_FVADDR) + return -EINVAL; context->ops_id = id; return count; } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index b2ec0aa1ff45..5ba82ab4943b 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -753,8 +753,19 @@ static int __init damon_va_initcall(void) .apply_scheme = damon_va_apply_scheme, .get_scheme_score = damon_va_scheme_score, }; - - return damon_register_ops(&ops); + /* ops for fixed virtual address ranges */ + struct damon_operations ops_fvaddr = ops; + int err; + + /* Don't set the monitoring target regions for the entire mapping */ + ops_fvaddr.id = DAMON_OPS_FVADDR; + ops_fvaddr.init = NULL; + ops_fvaddr.update = NULL; + + err = damon_register_ops(&ops); + if (err) + return err; + return damon_register_ops(&ops_fvaddr); }; subsys_initcall(damon_va_initcall); -- cgit v1.2.3 From b82434471cd2164988f8b4436983338ef224431d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:52 -0700 Subject: mm/damon/sysfs: support fixed virtual address ranges monitoring This commit makes DAMON sysfs interface to support the fixed virtual address ranges monitoring. After this commit, writing 'fvaddr' to the 'operations' DAMON sysfs file makes DAMON uses the monitoring operations set for fixed virtual address ranges, so that users can monitor accesses to only interested virtual address ranges. 
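From user space, the same selection becomes a plain write to the context's 'operations' file once this patch is in place. Below is a hypothetical sketch; the path follows the ABI document, kdamond 0 and context 0 are assumed to exist, and the helper name is made up for illustration.

  #include <stdio.h>
  #include <string.h>

  /* Write a keyword such as "fvaddr" and read it back to confirm. */
  static int set_damon_operations(const char *keyword)
  {
          const char *path =
                  "/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/operations";
          char readback[16] = "";
          FILE *f = fopen(path, "w");

          if (!f)
                  return -1;
          fprintf(f, "%s\n", keyword);
          fclose(f);

          f = fopen(path, "r");
          if (!f)
                  return -1;
          if (fgets(readback, sizeof(readback), f))
                  readback[strcspn(readback, "\n")] = '\0';
          fclose(f);
          return strcmp(readback, keyword) ? -1 : 0;
  }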
[sj@kernel.org: fix pid leak under fvaddr ops use case] Link: https://lkml.kernel.org/r/20220503220531.45913-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220426231750.48822-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 719a286d378f..f753bb405101 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1694,7 +1694,7 @@ static struct kobj_type damon_sysfs_attrs_ktype = { /* This should match with enum damon_ops_id */ static const char * const damon_sysfs_ops_strs[] = { "vaddr", - "unsupported", /* fvaddr is not supported by sysfs yet */ + "fvaddr", "paddr", }; @@ -1844,9 +1844,6 @@ static ssize_t operations_store(struct kobject *kobj, for (id = 0; id < NR_DAMON_OPS; id++) { if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { - /* fvaddr is not supported by sysfs yet */ - if (id == DAMON_OPS_FVADDR) - return -EINVAL; context->ops_id = id; return count; } @@ -2089,7 +2086,8 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (ctx->ops.id == DAMON_OPS_VADDR) + if (ctx->ops.id == DAMON_OPS_VADDR || + ctx->ops.id == DAMON_OPS_FVADDR) put_pid(t->pid); damon_destroy_target(t); } @@ -2136,7 +2134,8 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, damon_sysfs_destroy_targets(ctx); return -ENOMEM; } - if (ctx->ops.id == DAMON_OPS_VADDR) { + if (ctx->ops.id == DAMON_OPS_VADDR || + ctx->ops.id == DAMON_OPS_FVADDR) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) { damon_sysfs_destroy_targets(ctx); @@ -2206,7 +2205,7 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (ctx->ops.id != DAMON_OPS_VADDR) + if (ctx->ops.id != DAMON_OPS_VADDR && ctx->ops.id != DAMON_OPS_FVADDR) return; mutex_lock(&ctx->kdamond_lock); -- cgit v1.2.3 From 915418088c977993933efaf7a704c0bcd6c2bf77 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:52 -0700 Subject: Docs/{ABI,admin-guide}/damon: update for fixed virtual address ranges monitoring This commit documents the user space support of the newly added monitoring operations set for fixed virtual address ranges monitoring, namely 'fvaddr', on the ABI and usage documents for DAMON. Link: https://lkml.kernel.org/r/20220426231750.48822-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 14 ++++++++------ Documentation/admin-guide/mm/damon/usage.rst | 14 +++++++++++--- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index d724b8a12228..fab97ea22569 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -50,12 +50,14 @@ What: /sys/kernel/mm/damon/admin/kdamonds//contexts//operations Date: Mar 2022 Contact: SeongJae Park Description: Writing a keyword for a monitoring operations set ('vaddr' for - virtual address spaces monitoring, and 'paddr' for the physical - address space monitoring) to this file makes the context to use - the operations set. Reading the file returns the keyword for - the operations set the context is set to use. Note that only - the operations sets that listed in 'avail_operations' file are - valid inputs. 
+ virtual address spaces monitoring, 'fvaddr' for fixed virtual + address ranges monitoring, and 'paddr' for the physical address + space monitoring) to this file makes the context to use the + operations set. Reading the file returns the keyword for the + operations set the context is set to use. + + Note that only the operations sets that listed in + 'avail_operations' file are valid inputs. What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/sample_us Date: Mar 2022 diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index af6ffaea567b..9c67311a79d8 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -154,8 +154,13 @@ available monitoring operations set on the currently running kernel by reading list some or all of below keywords. - vaddr: Monitor virtual address spaces of specific processes + - fvaddr: Monitor fixed virtual address ranges - paddr: Monitor the physical address space of the system +Please refer to :ref:`regions sysfs directory ` for detailed +differences between the operations sets in terms of the monitoring target +regions. + You can set and get what type of monitoring operations DAMON will use for the context by writing one of the keywords listed in ``avail_operations`` file and reading from the ``operations`` file. @@ -198,6 +203,8 @@ If you wrote ``vaddr`` to the ``contexts//operations``, each target should be a process. You can specify the process to DAMON by writing the pid of the process to the ``pid_target`` file. +.. _sysfs_regions: + targets//regions ------------------- @@ -208,9 +215,10 @@ can be covered. However, users could want to set the initial monitoring region to specific address ranges. In contrast, DAMON do not automatically sets and updates the monitoring target -regions when ``paddr`` monitoring operations set is being used (``paddr`` is -written to the ``contexts//operations``). Therefore, users should set the -monitoring target regions by themselves in the case. +regions when ``fvaddr`` or ``paddr`` monitoring operations sets are being used +(``fvaddr`` or ``paddr`` have written to the ``contexts//operations``). +Therefore, users should set the monitoring target regions by themselves in the +cases. For such cases, users can explicitly set the initial monitoring target regions as they want, by writing proper values to the files under this directory. -- cgit v1.2.3 From 6366238b8dfc383723c211c1ffe8c8d7914107e5 Mon Sep 17 00:00:00 2001 From: liusongtang Date: Mon, 9 May 2022 18:20:52 -0700 Subject: mm/memory_hotplug: use pgprot_val to get value of pgprot pgprot.pgprot is non-portable code. It should be replaced by portable macro pgprot_val. 
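The underlying point is that pgprot_t is an opaque typedef whose representation differs between architectures, so generic code should only go through the pgprot_val()/__pgprot() accessors. A minimal sketch of the portable pattern (illustrative, not taken from the patch; the function name is hypothetical):

  #include <linux/mm.h>

  /* pgprot_val() works everywhere; reading .pgprot directly only works
   * where pgprot_t happens to be a one-word struct wrapper. */
  static inline bool example_pgprot_is_empty(pgprot_t prot)
  {
          return pgprot_val(prot) == 0;
  }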
Link: https://lkml.kernel.org/r/20220426071302.220646-1-liusongtang@huawei.com Signed-off-by: liusongtang Reviewed-by: Anshuman Khandual Cc: Xiaoming Ni Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 19a6beb9d4b6..e99fd60548f5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -303,7 +303,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, int err; struct vmem_altmap *altmap = params->altmap; - if (WARN_ON_ONCE(!params->pgprot.pgprot)) + if (WARN_ON_ONCE(!pgprot_val(params->pgprot))) return -EINVAL; VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false)); -- cgit v1.2.3 From 3d0b95cd87b26b0b10e0cda8ee6105c2194a5800 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 9 May 2022 18:20:52 -0700 Subject: mm: hugetlb: considering PMD sharing when flushing cache/TLBs This patchset fixes some cache flushing issues if PMD sharing is possible for hugetlb pages, which were found by code inspection. Meanwhile Mike found the flush_cache_page() can not cover the whole size of a hugetlb page on some architectures [1], so I added a new patch 3 to fix this issue, since I found only try_to_unmap_one() and try_to_migrate_one() need to fix after some investigation. [1] https://lore.kernel.org/linux-mm/064da3bb-5b4b-7332-a722-c5a541128705@oracle.com/ This patch (of 3): When moving hugetlb page tables, the cache flushing is called in move_page_tables() without considering the shared PMDs, which may be cause cache issues on some architectures. Thus we should move the hugetlb cache flushing into move_hugetlb_page_tables() with considering the shared PMDs ranges, calculated by adjust_range_if_pmd_sharing_possible(). Meanwhile also expanding the TLBs flushing range in case of shared PMDs. Note this is discovered via code inspection, and did not meet a real problem in practice so far. Link: https://lkml.kernel.org/r/cover.1651056365.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/0443c8cf20db554d3ff4b439b30e0ff26c0181dd.1651056365.git.baolin.wang@linux.alibaba.com Fixes: 550a7d60bd5e ("mm, hugepages: add mremap() support for hugepage backed vma") Signed-off-by: Baolin Wang Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Mina Almasry Signed-off-by: Andrew Morton --- mm/hugetlb.c | 17 +++++++++++++++-- mm/mremap.c | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4454c42a82f0..fc31a8f49067 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4922,10 +4922,17 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long old_addr_copy; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; + bool shared_pmd = false; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + /* + * In case of shared PMDs, we should cover the maximum possible + * range. 
+ */ + flush_cache_range(vma, range.start, range.end); + mmu_notifier_invalidate_range_start(&range); /* Prevent race with file truncation */ i_mmap_lock_write(mapping); @@ -4942,8 +4949,10 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, */ old_addr_copy = old_addr; - if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) + if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) { + shared_pmd = true; continue; + } dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); if (!dst_pte) @@ -4951,7 +4960,11 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte); } - flush_tlb_range(vma, old_end - len, old_end); + + if (shared_pmd) + flush_tlb_range(vma, range.start, range.end); + else + flush_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); diff --git a/mm/mremap.c b/mm/mremap.c index 98f50e633009..097002506d2d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -490,12 +490,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma, return 0; old_end = old_addr + len; - flush_cache_range(vma, old_addr, old_end); if (is_vm_hugetlb_page(vma)) return move_hugetlb_page_tables(vma, new_vma, old_addr, new_addr, len); + flush_cache_range(vma, old_addr, old_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, old_addr, old_end); mmu_notifier_invalidate_range_start(&range); -- cgit v1.2.3 From 54205e9c5425049aef1bc7a812f890f00b5f79c7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 9 May 2022 18:20:53 -0700 Subject: mm: rmap: move the cache flushing to the correct place for hugetlb PMD sharing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cache level flush will always be first when changing an existing virtual–>physical mapping to a new value, since this allows us to properly handle systems whose caches are strict and require a virtual–>physical translation to exist for a virtual address. So we should move the cache flushing before huge_pmd_unshare(). As Muchun pointed out[1], now the architectures whose supporting hugetlb PMD sharing have no cache flush issues in practice. But I think we should still follow the cache/TLB flushing rules when changing a valid virtual address mapping in case of potential issues in future. [1] https://lore.kernel.org/all/YmT%2F%2FhuUbFX+KHcy@FVFYT0MHHV2J.usts.net/ Link: https://lkml.kernel.org/r/4f7ae6dfdc838ab71e1655188b657c032ff1f28f.1651056365.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 86ed2865210d..829b2f9486c6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1532,15 +1532,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * do this outside rmap routines. */ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + /* + * huge_pmd_unshare may unmap an entire PMD page. + * There is no way of knowing exactly which PMDs may + * be cached for this mm, so we must flush them all. + * start/end were already adjusted above to cover this + * range. + */ + flush_cache_range(vma, range.start, range.end); + if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { - /* - * huge_pmd_unshare unmapped an entire PMD - * page. 
There is no way of knowing exactly - * which PMDs may be cached for this mm, so - * we must flush them all. start/end were - * already adjusted above to cover this range. - */ - flush_cache_range(vma, range.start, range.end); flush_tlb_range(vma, range.start, range.end); mmu_notifier_invalidate_range(mm, range.start, range.end); @@ -1557,13 +1558,14 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } + } else { + flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); } /* * Nuke the page table entry. When having to clear * PageAnonExclusive(), we always have to flush. */ - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); if (should_defer_flush(mm, flags) && !anon_exclusive) { /* * We clear the PTE but do not flush so potentially @@ -1884,15 +1886,16 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * do this outside rmap routines. */ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + /* + * huge_pmd_unshare may unmap an entire PMD page. + * There is no way of knowing exactly which PMDs may + * be cached for this mm, so we must flush them all. + * start/end were already adjusted above to cover this + * range. + */ + flush_cache_range(vma, range.start, range.end); + if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { - /* - * huge_pmd_unshare unmapped an entire PMD - * page. There is no way of knowing exactly - * which PMDs may be cached for this mm, so - * we must flush them all. start/end were - * already adjusted above to cover this range. - */ - flush_cache_range(vma, range.start, range.end); flush_tlb_range(vma, range.start, range.end); mmu_notifier_invalidate_range(mm, range.start, range.end); @@ -1909,10 +1912,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } + } else { + flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); } /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); pteval = ptep_clear_flush(vma, address, pvmw.pte); /* Set the dirty flag on the folio now the pte is gone. */ -- cgit v1.2.3 From dfc7ab57560da385f705b28e2bf50e3b90444a6b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 9 May 2022 18:20:53 -0700 Subject: mm: rmap: use flush_cache_range() to flush cache for hugetlb pages Now we will use flush_cache_page() to flush cache for anonymous hugetlb pages when unmapping or migrating a hugetlb page mapping, but the flush_cache_page() only handles a PAGE_SIZE range on some architectures (like arm32, arc and so on), which will cause potential cache issues. Thus change to use flush_cache_range() to cover the whole size of a hugetlb page. Link: https://lkml.kernel.org/r/dc903b378d1e2d26bbbe85409ab9d009631f175c.1651056365.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 90 ++++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 829b2f9486c6..f73a62c5b180 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1525,13 +1525,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); - if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { - /* - * To call huge_pmd_unshare, i_mmap_rwsem must be - * held in write mode. 
Caller needs to explicitly - * do this outside rmap routines. - */ - VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (folio_test_hugetlb(folio)) { /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may @@ -1541,22 +1535,31 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ flush_cache_range(vma, range.start, range.end); - if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - + if (!folio_test_anon(folio)) { /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. Caller needs to explicitly + * do this outside rmap routines. */ - page_vma_mapped_walk_done(&pvmw); - break; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + + if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { + flush_tlb_range(vma, range.start, range.end); + mmu_notifier_invalidate_range(mm, range.start, + range.end); + + /* + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } } } else { flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); @@ -1879,13 +1882,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); - if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { - /* - * To call huge_pmd_unshare, i_mmap_rwsem must be - * held in write mode. Caller needs to explicitly - * do this outside rmap routines. - */ - VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (folio_test_hugetlb(folio)) { /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may @@ -1895,22 +1892,31 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ flush_cache_range(vma, range.start, range.end); - if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - + if (!folio_test_anon(folio)) { /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. Caller needs to explicitly + * do this outside rmap routines. */ - page_vma_mapped_walk_done(&pvmw); - break; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + + if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { + flush_tlb_range(vma, range.start, range.end); + mmu_notifier_invalidate_range(mm, range.start, + range.end); + + /* + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. 
When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } } } else { flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); -- cgit v1.2.3 From 534aa1dc975ac883ad89110534585a96630802a0 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 9 May 2022 18:20:53 -0700 Subject: printk: stop including cache.h from printk.h An inclusion of cache.h in printk.h was added in 2014 in commit c28aa1f0a847 ("printk/cache: mark printk_once test variable __read_mostly") in order to bring in the definition of __read_mostly. The usage of __read_mostly was later removed in commit 3ec25826ae33 ("printk: Tie printk_once / printk_deferred_once into .data.once for reset") which made the inclusion of cache.h unnecessary, so remove it. We have a small amount of code that depended on the inclusion of cache.h from printk.h; fix that code to include the appropriate header. This fixes a circular inclusion on arm64 (linux/printk.h -> linux/cache.h -> asm/cache.h -> linux/kasan-enabled.h -> linux/static_key.h -> linux/jump_label.h -> linux/bug.h -> asm/bug.h -> linux/printk.h) that would otherwise be introduced by the next patch. Build tested using {allyesconfig,defconfig} x {arm64,x86_64}. Link: https://linux-review.googlesource.com/id/I8fd51f72c9ef1f2d6afd3b2cbc875aa4792c1fba Link: https://lkml.kernel.org/r/20220427195820.1716975-1-pcc@google.com Signed-off-by: Peter Collingbourne Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric W. Biederman Cc: Herbert Xu Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kees Cook Cc: Pekka Enberg Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/arm64/include/asm/mte-kasan.h | 1 + arch/arm64/include/asm/percpu.h | 1 + arch/csky/include/asm/processor.h | 2 +- drivers/firmware/smccc/kvm_guest.c | 1 + include/linux/printk.h | 1 - kernel/bpf/bpf_lru_list.h | 1 + 6 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index a857bcacf0fe..9f79425fc65a 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -6,6 +6,7 @@ #define __ASM_MTE_KASAN_H #include +#include #include #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 8f1661603b78..b9ba19dbdb69 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -10,6 +10,7 @@ #include #include #include +#include static inline void set_my_cpu_offset(unsigned long off) { diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h index 688c7548b559..9638206bc44f 100644 --- a/arch/csky/include/asm/processor.h +++ b/arch/csky/include/asm/processor.h @@ -4,9 +4,9 @@ #define __ASM_CSKY_PROCESSOR_H #include +#include #include #include -#include #include #include #include diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c index 2d3e866decaa..89a68e7eeaa6 100644 --- a/drivers/firmware/smccc/kvm_guest.c +++ b/drivers/firmware/smccc/kvm_guest.c @@ -4,6 +4,7 @@ #include #include +#include #include #include diff --git a/include/linux/printk.h b/include/linux/printk.h index 1522df223c0f..8e8d74edf121 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include diff --git 
a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 6b12f06ee18c..4ea227c9c1ad 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -4,6 +4,7 @@ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ +#include #include #include -- cgit v1.2.3 From d949a8155d139aa890795b802004a196b7f00598 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 9 May 2022 18:20:53 -0700 Subject: mm: make minimum slab alignment a runtime property When CONFIG_KASAN_HW_TAGS is enabled we currently increase the minimum slab alignment to 16. This happens even if MTE is not supported in hardware or disabled via kasan=off, which creates an unnecessary memory overhead in those cases. Eliminate this overhead by making the minimum slab alignment a runtime property and only aligning to 16 if KASAN is enabled at runtime. On a DragonBoard 845c (non-MTE hardware) with a kernel built with CONFIG_KASAN_HW_TAGS, waiting for quiescence after a full Android boot I see the following Slab measurements in /proc/meminfo (median of 3 reboots): Before: 169020 kB After: 167304 kB [akpm@linux-foundation.org: make slab alignment type `unsigned int' to avoid casting] Link: https://linux-review.googlesource.com/id/I752e725179b43b144153f4b6f584ceb646473ead Link: https://lkml.kernel.org/r/20220427195820.1716975-2-pcc@google.com Signed-off-by: Peter Collingbourne Reviewed-by: Andrey Konovalov Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: David Rientjes Reviewed-by: Catalin Marinas Acked-by: Vlastimil Babka Cc: Pekka Enberg Cc: Roman Gushchin Cc: Joonsoo Kim Cc: Herbert Xu Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Eric W. Biederman Cc: Kees Cook Signed-off-by: Andrew Morton --- arch/arm64/include/asm/cache.h | 17 ++++++++++++----- include/linux/slab.h | 12 ++++++++++++ mm/slab.c | 7 +++---- mm/slab_common.c | 3 +-- mm/slob.c | 16 +++++++++++----- 5 files changed, 39 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index a074459f8f2f..7c2181c72116 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -6,6 +6,7 @@ #define __ASM_CACHE_H #include +#include #define CTR_L1IP_SHIFT 14 #define CTR_L1IP_MASK 3 @@ -49,16 +50,22 @@ */ #define ARCH_DMA_MINALIGN (128) +#ifndef __ASSEMBLY__ + +#include +#include + #ifdef CONFIG_KASAN_SW_TAGS #define ARCH_SLAB_MINALIGN (1ULL << KASAN_SHADOW_SCALE_SHIFT) #elif defined(CONFIG_KASAN_HW_TAGS) -#define ARCH_SLAB_MINALIGN MTE_GRANULE_SIZE +static inline unsigned int arch_slab_minalign(void) +{ + return kasan_hw_tags_enabled() ? MTE_GRANULE_SIZE : + __alignof__(unsigned long long); +} +#define arch_slab_minalign() arch_slab_minalign() #endif -#ifndef __ASSEMBLY__ - -#include - #define ICACHEF_ALIASING 0 #define ICACHEF_VPIPT 1 extern unsigned long __icache_flags; diff --git a/include/linux/slab.h b/include/linux/slab.h index 373b3ef99f4e..3d2f2a3ca17e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -209,6 +209,18 @@ void kmem_dump_obj(void *object); #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +/* + * Arches can define this function if they want to decide the minimum slab + * alignment at runtime. The value returned by the function must be a power + * of two and >= ARCH_SLAB_MINALIGN. 
+ */ +#ifndef arch_slab_minalign +static inline unsigned int arch_slab_minalign(void) +{ + return ARCH_SLAB_MINALIGN; +} +#endif + /* * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN diff --git a/mm/slab.c b/mm/slab.c index 0edb474edef1..8fb259511305 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3009,10 +3009,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); - if (ARCH_SLAB_MINALIGN && - ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n", - objp, (int)ARCH_SLAB_MINALIGN); + if ((unsigned long)objp & (arch_slab_minalign() - 1)) { + pr_err("0x%px: not aligned to arch_slab_minalign()=%u\n", objp, + arch_slab_minalign()); } return objp; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 2b3206a2c3b5..987e6917d850 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -154,8 +154,7 @@ static unsigned int calculate_alignment(slab_flags_t flags, align = max(align, ralign); } - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; + align = max(align, arch_slab_minalign()); return ALIGN(align, sizeof(void *)); } diff --git a/mm/slob.c b/mm/slob.c index 40ea6e2d4ccd..f47811f09aca 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -478,9 +478,11 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) { unsigned int *m; - int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + unsigned int minalign; void *ret; + minalign = max_t(unsigned int, ARCH_KMALLOC_MINALIGN, + arch_slab_minalign()); gfp &= gfp_allowed_mask; might_alloc(gfp); @@ -493,7 +495,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) * kmalloc()'d objects. */ if (is_power_of_2(size)) - align = max(minalign, (int) size); + align = max_t(unsigned int, minalign, size); if (!size) return ZERO_SIZE_PTR; @@ -555,8 +557,11 @@ void kfree(const void *block) sp = virt_to_folio(block); if (folio_test_slab(sp)) { - int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + unsigned int align = max_t(unsigned int, + ARCH_KMALLOC_MINALIGN, + arch_slab_minalign()); unsigned int *m = (unsigned int *)(block - align); + slob_free(m, *m + align); } else { unsigned int order = folio_order(sp); @@ -573,7 +578,7 @@ EXPORT_SYMBOL(kfree); size_t __ksize(const void *block) { struct folio *folio; - int align; + unsigned int align; unsigned int *m; BUG_ON(!block); @@ -584,7 +589,8 @@ size_t __ksize(const void *block) if (unlikely(!folio_test_slab(folio))) return folio_size(folio); - align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + align = max_t(unsigned int, ARCH_KMALLOC_MINALIGN, + arch_slab_minalign()); m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } -- cgit v1.2.3 From 5b4494896cb379b0304ba8320589f2ffd08a7b31 Mon Sep 17 00:00:00 2001 From: Florian Rommel Date: Mon, 9 May 2022 18:20:53 -0700 Subject: mmap locking API: fix missed mmap_sem references in comments Commit c1e8d7c6a7a6 ("mmap locking API: convert mmap_sem comments") missed replacing some references of mmap_sem by mmap_lock due to misspelling (mm_sem instead of mmap_sem). 
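For reference, the lock these corrected comments refer to is taken through the mmap_lock API rather than under the old mmap_sem name. A minimal hypothetical sketch of the read-side pattern the comments describe (the function name is made up and 'mm' is assumed to be a valid, referenced mm_struct):

  #include <linux/mm.h>
  #include <linux/mmap_lock.h>

  static bool example_addr_is_mapped(struct mm_struct *mm, unsigned long addr)
  {
          struct vm_area_struct *vma;
          bool mapped = false;

          mmap_read_lock(mm);
          vma = find_vma(mm, addr);
          if (vma && addr >= vma->vm_start)
                  mapped = true;
          mmap_read_unlock(mm);
          return mapped;
  }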
Link: https://lkml.kernel.org/r/20220503113333.214124-1-mail@florommel.de Signed-off-by: Florian Rommel Signed-off-by: Andrew Morton --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 2 +- drivers/gpu/drm/ttm/ttm_bo_vm.c | 2 +- mm/mmap.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 65552bb7d2f2..e83cb1c09610 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -184,7 +184,7 @@ static int kgd_hqd_load(struct amdgpu_device *adev, void *mqd, /* read_user_ptr may take the mm->mmap_lock. * release srbm_mutex to avoid circular dependency between - * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. + * srbm_mutex->mmap_lock->reservation_ww_class_mutex->srbm_mutex. */ release_queue(adev); valid_wptr = read_user_wptr(mm, wptr, wptr_val); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index 9dc5f2a0cc07..870f352837fc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -208,7 +208,7 @@ static int kgd_hqd_load(struct amdgpu_device *adev, void *mqd, /* read_user_ptr may take the mm->mmap_lock. * release srbm_mutex to avoid circular dependency between - * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. + * srbm_mutex->mmap_lock->reservation_ww_class_mutex->srbm_mutex. */ release_queue(adev); valid_wptr = read_user_wptr(mm, wptr, wptr_val); diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 08ba083a80d2..839205661724 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -110,7 +110,7 @@ static unsigned long ttm_bo_io_mem_pfn(struct ttm_buffer_object *bo, * @bo: The buffer object * @vmf: The fault structure handed to the callback * - * vm callbacks like fault() and *_mkwrite() allow for the mm_sem to be dropped + * vm callbacks like fault() and *_mkwrite() allow for the mmap_lock to be dropped * during long waits, and after the wait the callback will be restarted. This * is to allow other threads using the same virtual memory space concurrent * access to map(), unmap() completely unrelated buffer objects. TTM buffer diff --git a/mm/mmap.c b/mm/mmap.c index d7e120ad5825..7f7d982721b9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1285,7 +1285,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * * the same as 'old', the other will be the new one that is trying * to share the anon_vma. * - * NOTE! This runs with mm_sem held for reading, so it is possible that + * NOTE! This runs with mmap_lock held for reading, so it is possible that * the anon_vma of 'old' is concurrently in the process of being set up * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton @@ -1299,7 +1299,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * * * We also make sure that the two vma's are compatible (adjacent, * and with the same memory policies). That's all stable, even with just - * a read lock on the mm_sem. + * a read lock on the mmap_lock. 
*/ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { -- cgit v1.2.3 From b304c6f0d39d927a87e72a8ac6c89b96ac25f355 Mon Sep 17 00:00:00 2001 From: Hongchen Zhang Date: Mon, 9 May 2022 18:20:53 -0700 Subject: mm/swapops: make is_pmd_migration_entry more strict A pmd migration entry should first be a swap pmd,so use is_swap_pmd(pmd) instead of !pmd_present(pmd). On the other hand, some architecture (MIPS for example) may misjudge a pmd_none entry as a pmd migration entry. Link: https://lkml.kernel.org/r/1651131333-6386-1-git-send-email-zhanghongchen@loongson.cn Signed-off-by: Hongchen Zhang Acked-by: Peter Xu Cc: Alistair Popple Cc: Ralph Campbell Cc: Naoya Horiguchi Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/swapops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index d1b728904d4e..82265b055a0d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -331,7 +331,7 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) static inline int is_pmd_migration_entry(pmd_t pmd) { - return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); + return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } #else static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, -- cgit v1.2.3 From dd0623020e0d068d5eba0c37d5ae1277800b49c4 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Mon, 9 May 2022 18:20:54 -0700 Subject: mm/rmap: Fix typos in comments Fix spelling/grammar mistakes in comments. Link: https://lkml.kernel.org/r/20220428061522.666-1-adrianhuang0701@gmail.com Signed-off-by: Adrian Huang Signed-off-by: Andrew Morton --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index f73a62c5b180..72464378e1a6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -298,7 +298,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) * Reuse existing anon_vma if its degree lower than two, * that means it has no vma and only one anon_vma child. * - * Do not chose parent anon_vma, otherwise first child + * Do not choose parent anon_vma, otherwise first child * will always reuse it. Root anon_vma is never reused: * it has self-parent reference and at least one child. */ @@ -2249,7 +2249,7 @@ static bool folio_make_device_exclusive(struct folio *folio, /** * make_device_exclusive_range() - Mark a range for exclusive use by a device - * @mm: mm_struct of assoicated target process + * @mm: mm_struct of associated target process * @start: start of the region to mark for exclusive device access * @end: end address of region * @pages: returns the pages which were successfully marked for exclusive access -- cgit v1.2.3 From 9994715333515e82865e533250e488496b9742f4 Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Mon, 9 May 2022 18:20:54 -0700 Subject: selftest/vm: test that mremap fails on non-existent vma Add a regression test that validates that mremap fails for vma's that don't exist. 
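The expectation being tested can also be reproduced standalone; the sketch below is a hypothetical user-space demo using an ordinary anonymous mapping instead of hugetlb, which is enough to show that mremap() on an address range whose vma no longer exists must fail (with EFAULT).

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/mman.h>

  int main(void)
  {
          const size_t length = 16 * 4096;
          void *addr = mmap(NULL, length, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          if (addr == MAP_FAILED) {
                  perror("mmap");
                  return EXIT_FAILURE;
          }
          if (munmap(addr, length)) {
                  perror("munmap");
                  return EXIT_FAILURE;
          }
          /* No vma backs 'addr' any more, so this must fail. */
          if (mremap(addr, length, length, 0) != MAP_FAILED) {
                  fprintf(stderr, "mremap unexpectedly succeeded\n");
                  return EXIT_FAILURE;
          }
          perror("mremap failed as expected");
          return EXIT_SUCCESS;
  }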
Link: https://lkml.kernel.org/r/20220427224439.23828-3-dossche.niels@gmail.com Signed-off-by: Niels Dossche Cc: Mina Almasry Cc: Mike Kravetz Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hugepage-mremap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 1d689084a54b..585978f181ed 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -178,6 +178,12 @@ int main(int argc, char *argv[]) munmap(addr, length); + addr = mremap(addr, length, length, 0); + if (addr != MAP_FAILED) { + printf("mremap: Expected failure, but call succeeded\n"); + exit(1); + } + close(fd); unlink(argv[argc-1]); -- cgit v1.2.3 From 6e74d2bf5a265113ca54a8323783d2f3fdde96b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:54 -0700 Subject: mm/damon/core: add a new callback for watermarks checks Patch series "mm/damon: Support online tuning". Effects of DAMON and DAMON-based Operation Schemes highly depends on the configurations. Wrong configurations could even result in unexpected efficiency degradations. For finding a best configuration, repeating incremental configuration changes and results measurements, in other words, online tuning, could be helpful. Nevertheless, DAMON kernel API supports only restrictive online tuning. Worse yet, the sysfs-based DAMON user interface doesn't support online tuning at all. DAMON_RECLAIM also doesn't support online tuning. This patchset makes the DAMON kernel API, DAMON sysfs interface, and DAMON_RECLAIM supports online tuning. Sequence of patches ------------------- First two patches enhance DAMON online tuning for kernel API users. Specifically, patch 1 let kernel API users to be able to do DAMON online tuning without a restriction, and patch 2 makes error handling easier. Following seven patches (patches 3-9) refactor code for better readability and easier reuse of code fragments that will be useful for online tuning support. Patch 10 introduces DAMON callback based user request handling structure for DAMON sysfs interface, and patch 11 enables DAMON online tuning via DAMON sysfs interface. Documentation patch (patch 12) for usage of it follows. Patch 13 enables online tuning of DAMON_RECLAIM and finally patch 14 documents the DAMON_RECLAIM online tuning usage. This patch (of 14): For updating input parameters for running DAMON contexts, DAMON kernel API users can use the contexts' callbacks, as it is the safe place for context internal data accesses. When the context has DAMON-based operation schemes and all schemes are deactivated due to their watermarks, however, DAMON does nothing but only watermarks checks. As a result, no callbacks will be called back, and therefore the kernel API users cannot update the input parameters including monitoring attributes, DAMON-based operation schemes, and watermarks. To let users easily update such DAMON input parameters in such a case, this commit adds a new callback, 'after_wmarks_check()'. It will be called after each watermarks check. Users can do the online input parameters update in the callback even under the schemes deactivated case. 
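A rough sketch of how a kernel API user might hook the new callback (illustrative, not part of the patch; the names are hypothetical, the flag stands in for whatever staging mechanism the user has, and the context setup is assumed to happen elsewhere):

  #include <linux/damon.h>
  #include <linux/types.h>

  static bool example_new_params_pending;

  /* Called after every watermarks check, including while all schemes
   * are deactivated, so it is a safe point to push staged updates of
   * monitoring attributes, schemes or watermarks into 'ctx'. */
  static int example_after_wmarks_check(struct damon_ctx *ctx)
  {
          if (example_new_params_pending) {
                  /* ... apply the staged updates to ctx here ... */
                  example_new_params_pending = false;
          }
          return 0;
  }

  static void example_install_callback(struct damon_ctx *ctx)
  {
          ctx->callback.after_wmarks_check = example_after_wmarks_check;
  }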
Link: https://lkml.kernel.org/r/20220429160606.127307-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 +++++++ mm/damon/core.c | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 09a5d0d02c00..6cb5ab5d8e9d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -343,6 +343,7 @@ struct damon_operations { * struct damon_callback - Monitoring events notification callbacks. * * @before_start: Called before starting the monitoring. + * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. * @before_terminate: Called before terminating the monitoring. @@ -353,6 +354,11 @@ struct damon_operations { * respectively. Therefore, those are good places for installing and cleaning * @private. * + * The monitoring thread calls @after_wmarks_check after each DAMON-based + * operation schemes' watermarks check. If users need to make changes to the + * attributes of the monitoring context while it's deactivated due to the + * watermarks, this is the good place to do. + * * The monitoring thread calls @after_sampling and @after_aggregation for each * of the sampling intervals and aggregation intervals, respectively. * Therefore, users can safely access the monitoring results without additional @@ -365,6 +371,7 @@ struct damon_callback { void *private; int (*before_start)(struct damon_ctx *context); + int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); void (*before_terminate)(struct damon_ctx *context); diff --git a/mm/damon/core.c b/mm/damon/core.c index 997cf7b17779..44fe7e452a1e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1051,6 +1051,10 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) return 0; kdamond_usleep(min_wait_time); + + if (ctx->callback.after_wmarks_check && + ctx->callback.after_wmarks_check(ctx)) + break; } return -EBUSY; } @@ -1077,8 +1081,10 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); while (!kdamond_need_stop(ctx) && !done) { - if (kdamond_wait_activation(ctx)) + if (kdamond_wait_activation(ctx)) { + done = true; continue; + } if (ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); -- cgit v1.2.3 From abacd635fa7b7a39858bb4182eef33ffa628b12c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:54 -0700 Subject: mm/damon/core: finish kdamond as soon as any callback returns an error When 'after_sampling()' or 'after_aggregation()' DAMON callbacks return an error, kdamond continues the remaining loop once. It makes no much sense to run the remaining part while something wrong already happened. The context might be corrupted or having invalid data. This commit therefore makes kdamond skips the remaining works and immediately finish in the cases. 
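With this change a callback's return value doubles as a clean shutdown mechanism. A minimal hypothetical example that stops monitoring from after_aggregation() once some external stop flag is raised (the names are made up; any nonzero return has the same effect):

  #include <linux/damon.h>
  #include <linux/errno.h>
  #include <linux/types.h>

  static bool example_stop_requested;

  /* A nonzero return now makes kdamond skip the rest of the iteration
   * and terminate instead of running one more partial pass. */
  static int example_after_aggregation(struct damon_ctx *ctx)
  {
          return example_stop_requested ? -EBUSY : 0;
  }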
Link: https://lkml.kernel.org/r/20220429160606.127307-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 44fe7e452a1e..b6daaff37bec 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1089,8 +1089,10 @@ static int kdamond_fn(void *data) if (ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); if (ctx->callback.after_sampling && - ctx->callback.after_sampling(ctx)) + ctx->callback.after_sampling(ctx)) { done = true; + continue; + } kdamond_usleep(ctx->sample_interval); @@ -1102,8 +1104,10 @@ static int kdamond_fn(void *data) max_nr_accesses / 10, sz_limit); if (ctx->callback.after_aggregation && - ctx->callback.after_aggregation(ctx)) + ctx->callback.after_aggregation(ctx)) { done = true; + continue; + } kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); -- cgit v1.2.3 From af3f18f6ad3f3c8ae70955cf288f467ea2bc83ed Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:54 -0700 Subject: mm/damon/vaddr: generalize damon_va_apply_three_regions() 'damon_va_apply_three_regions()' is for adjusting address ranges to fit in three discontiguous ranges. The function can be generalized for arbitrary number of discontiguous ranges and reused for future usage, such as arbitrary online regions update. For such future usage, this commit introduces a generalized version of the function called 'damon_set_regions()'. Link: https://lkml.kernel.org/r/20220429160606.127307-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 66 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 5ba82ab4943b..26e9ad80f9ea 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -297,59 +297,77 @@ static bool damon_intersect(struct damon_region *r, } /* - * Update damon regions for the three big regions of the given target + * damon_set_regions() - Set regions of a target for given address ranges. + * @t: the given target. + * @ranges: array of new monitoring target ranges. + * @nr_ranges: length of @ranges. * - * t the given target - * bregions the three big regions of the target + * This function adds new regions to, or modify existing regions of a + * monitoring target to fit in specific ranges. + * + * Return: 0 if success, or negative error code otherwise. 
*/ -static void damon_va_apply_three_regions(struct damon_target *t, - struct damon_addr_range bregions[3]) +static int damon_set_regions(struct damon_target *t, + struct damon_addr_range *ranges, unsigned int nr_ranges) { struct damon_region *r, *next; unsigned int i; - /* Remove regions which are not in the three big regions now */ + /* Remove regions which are not in the new ranges */ damon_for_each_region_safe(r, next, t) { - for (i = 0; i < 3; i++) { - if (damon_intersect(r, &bregions[i])) + for (i = 0; i < nr_ranges; i++) { + if (damon_intersect(r, &ranges[i])) break; } - if (i == 3) + if (i == nr_ranges) damon_destroy_region(r, t); } - /* Adjust intersecting regions to fit with the three big regions */ - for (i = 0; i < 3; i++) { - struct damon_region *first = NULL, *last; - struct damon_region *newr; - struct damon_addr_range *br; + /* Add new regions or resize existing regions to fit in the ranges */ + for (i = 0; i < nr_ranges; i++) { + struct damon_region *first = NULL, *last, *newr; + struct damon_addr_range *range; - br = &bregions[i]; - /* Get the first and last regions which intersects with br */ + range = &ranges[i]; + /* Get the first/last regions intersecting with the range */ damon_for_each_region(r, t) { - if (damon_intersect(r, br)) { + if (damon_intersect(r, range)) { if (!first) first = r; last = r; } - if (r->ar.start >= br->end) + if (r->ar.start >= range->end) break; } if (!first) { - /* no damon_region intersects with this big region */ + /* no region intersects with this range */ newr = damon_new_region( - ALIGN_DOWN(br->start, + ALIGN_DOWN(range->start, DAMON_MIN_REGION), - ALIGN(br->end, DAMON_MIN_REGION)); + ALIGN(range->end, DAMON_MIN_REGION)); if (!newr) - continue; + return -ENOMEM; damon_insert_region(newr, damon_prev_region(r), r, t); } else { - first->ar.start = ALIGN_DOWN(br->start, + /* resize intersecting regions to fit in this range */ + first->ar.start = ALIGN_DOWN(range->start, DAMON_MIN_REGION); - last->ar.end = ALIGN(br->end, DAMON_MIN_REGION); + last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); } } + return 0; +} + +/* + * Update damon regions for the three big regions of the given target + * + * t the given target + * bregions the three big regions of the target + */ +static void damon_va_apply_three_regions(struct damon_target *t, + struct damon_addr_range bregions[3]) +{ + damon_set_regions(t, bregions, 3); } /* -- cgit v1.2.3 From d0723bc04185b15ed96ade0a0bf9277ef00008db Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: mm/damon/vaddr: move 'damon_set_regions()' to core This commit moves 'damon_set_regions()' from vaddr to core, as it is aimed to be used by not only 'vaddr' but also other parts of DAMON. 
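A minimal usage sketch (not part of the patch; the address values and the wrapper function are arbitrary examples) of the now-core helper, using the declaration this patch adds to include/linux/damon.h:

    #include <linux/damon.h>

    static int my_set_two_regions(struct damon_target *t)
    {
    	/* arbitrary example ranges */
    	struct damon_addr_range ranges[] = {
    		{ .start = 0x100000000UL, .end = 0x140000000UL },
    		{ .start = 0x180000000UL, .end = 0x1c0000000UL },
    	};

    	return damon_set_regions(t, ranges, 2);
    }

Existing regions of the target that fall outside the given ranges are destroyed, intersecting ones are resized, and new regions are created where nothing intersects, as described in the function's kernel-doc comment.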
Link: https://lkml.kernel.org/r/20220429160606.127307-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/damon/vaddr.c | 73 --------------------------------------------------- 3 files changed, 75 insertions(+), 73 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 6cb5ab5d8e9d..d1e6ee28a2ff 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -494,6 +494,8 @@ static inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); +int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, + unsigned int nr_ranges); struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, diff --git a/mm/damon/core.c b/mm/damon/core.c index b6daaff37bec..7d25dc582fe3 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -157,6 +157,79 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t) damon_free_region(r); } +/* + * Check whether a region is intersecting an address range + * + * Returns true if it is. + */ +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) +{ + return !(r->ar.end <= re->start || re->end <= r->ar.start); +} + +/* + * damon_set_regions() - Set regions of a target for given address ranges. + * @t: the given target. + * @ranges: array of new monitoring target ranges. + * @nr_ranges: length of @ranges. + * + * This function adds new regions to, or modify existing regions of a + * monitoring target to fit in specific ranges. + * + * Return: 0 if success, or negative error code otherwise. 
+ */ +int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, + unsigned int nr_ranges) +{ + struct damon_region *r, *next; + unsigned int i; + + /* Remove regions which are not in the new ranges */ + damon_for_each_region_safe(r, next, t) { + for (i = 0; i < nr_ranges; i++) { + if (damon_intersect(r, &ranges[i])) + break; + } + if (i == nr_ranges) + damon_destroy_region(r, t); + } + + /* Add new regions or resize existing regions to fit in the ranges */ + for (i = 0; i < nr_ranges; i++) { + struct damon_region *first = NULL, *last, *newr; + struct damon_addr_range *range; + + range = &ranges[i]; + /* Get the first/last regions intersecting with the range */ + damon_for_each_region(r, t) { + if (damon_intersect(r, range)) { + if (!first) + first = r; + last = r; + } + if (r->ar.start >= range->end) + break; + } + if (!first) { + /* no region intersects with this range */ + newr = damon_new_region( + ALIGN_DOWN(range->start, + DAMON_MIN_REGION), + ALIGN(range->end, DAMON_MIN_REGION)); + if (!newr) + return -ENOMEM; + damon_insert_region(newr, damon_prev_region(r), r, t); + } else { + /* resize intersecting regions to fit in this range */ + first->ar.start = ALIGN_DOWN(range->start, + DAMON_MIN_REGION); + last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + } + } + return 0; +} + struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 26e9ad80f9ea..fcb44210204a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -285,79 +285,6 @@ static void damon_va_init(struct damon_ctx *ctx) * Functions for the dynamic monitoring target regions update */ -/* - * Check whether a region is intersecting an address range - * - * Returns true if it is. - */ -static bool damon_intersect(struct damon_region *r, - struct damon_addr_range *re) -{ - return !(r->ar.end <= re->start || re->end <= r->ar.start); -} - -/* - * damon_set_regions() - Set regions of a target for given address ranges. - * @t: the given target. - * @ranges: array of new monitoring target ranges. - * @nr_ranges: length of @ranges. - * - * This function adds new regions to, or modify existing regions of a - * monitoring target to fit in specific ranges. - * - * Return: 0 if success, or negative error code otherwise. 
- */ -static int damon_set_regions(struct damon_target *t, - struct damon_addr_range *ranges, unsigned int nr_ranges) -{ - struct damon_region *r, *next; - unsigned int i; - - /* Remove regions which are not in the new ranges */ - damon_for_each_region_safe(r, next, t) { - for (i = 0; i < nr_ranges; i++) { - if (damon_intersect(r, &ranges[i])) - break; - } - if (i == nr_ranges) - damon_destroy_region(r, t); - } - - /* Add new regions or resize existing regions to fit in the ranges */ - for (i = 0; i < nr_ranges; i++) { - struct damon_region *first = NULL, *last, *newr; - struct damon_addr_range *range; - - range = &ranges[i]; - /* Get the first/last regions intersecting with the range */ - damon_for_each_region(r, t) { - if (damon_intersect(r, range)) { - if (!first) - first = r; - last = r; - } - if (r->ar.start >= range->end) - break; - } - if (!first) { - /* no region intersects with this range */ - newr = damon_new_region( - ALIGN_DOWN(range->start, - DAMON_MIN_REGION), - ALIGN(range->end, DAMON_MIN_REGION)); - if (!newr) - return -ENOMEM; - damon_insert_region(newr, damon_prev_region(r), r, t); - } else { - /* resize intersecting regions to fit in this range */ - first->ar.start = ALIGN_DOWN(range->start, - DAMON_MIN_REGION); - last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); - } - } - return 0; -} - /* * Update damon regions for the three big regions of the given target * -- cgit v1.2.3 From dae0087aeff4ea7b5054d9612bba1d5f7c3046ec Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: mm/damon/vaddr: remove damon_va_apply_three_regions() 'damon_va_apply_three_regions()' is just a wrapper of its general version, 'damon_set_regions()'. This commit replaces the wrapper calls to directly call the general version. Link: https://lkml.kernel.org/r/20220429160606.127307-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr-test.h | 6 +++--- mm/damon/vaddr.c | 18 +----------------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 5431da4fe9d4..d4f55f349100 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -109,7 +109,7 @@ static struct damon_region *__nth_region_of(struct damon_target *t, int idx) } /* - * Test 'damon_va_apply_three_regions()' + * Test 'damon_set_regions()' * * test kunit object * regions an array containing start/end addresses of current @@ -124,7 +124,7 @@ static struct damon_region *__nth_region_of(struct damon_target *t, int idx) * the change, DAMON periodically reads the mappings, simplifies it to the * three regions, and updates the monitoring target regions to fit in the three * regions. The update of current target regions is the role of - * 'damon_va_apply_three_regions()'. + * 'damon_set_regions()'. 
* * This test passes the given target regions and the new three regions that * need to be applied to the function and check whether it updates the regions @@ -145,7 +145,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, damon_add_region(r, t); } - damon_va_apply_three_regions(t, three_regions); + damon_set_regions(t, three_regions, 3); for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index fcb44210204a..77f326323e74 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -281,22 +281,6 @@ static void damon_va_init(struct damon_ctx *ctx) } } -/* - * Functions for the dynamic monitoring target regions update - */ - -/* - * Update damon regions for the three big regions of the given target - * - * t the given target - * bregions the three big regions of the target - */ -static void damon_va_apply_three_regions(struct damon_target *t, - struct damon_addr_range bregions[3]) -{ - damon_set_regions(t, bregions, 3); -} - /* * Update regions for current memory mappings */ @@ -308,7 +292,7 @@ static void damon_va_update(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; - damon_va_apply_three_regions(t, three_regions); + damon_set_regions(t, three_regions, 3); } } -- cgit v1.2.3 From 0a890a9faaad3bfb7fa4087c4d35e2ea3527013b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: mm/damon/sysfs: prohibit multiple physical address space monitoring targets Having multiple targets for physical address space monitoring makes no sense. This commit prohibits such a ridiculous DAMON context setup my making the DAMON context build function to check and return an error for the case. Link: https://lkml.kernel.org/r/20220429160606.127307-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f753bb405101..3d2da791ebd8 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2125,6 +2125,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, { int i, err; + /* Multiple physical address space monitoring targets makes no sense */ + if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1) + return -EINVAL; + for (i = 0; i < sysfs_targets->nr; i++) { struct damon_sysfs_target *sys_target = sysfs_targets->targets_arr[i]; -- cgit v1.2.3 From 74bd8b7d2f8e7014dc34d6c5f51c629afdf822cb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: mm/damon/sysfs: move targets setup code to a separated function This commit separates DAMON sysfs interface's monitoring context targets setup code to a new function for better readability. 
Link: https://lkml.kernel.org/r/20220429160606.127307-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 3d2da791ebd8..a9d4f9389903 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2120,6 +2120,31 @@ static int damon_sysfs_set_regions(struct damon_target *t, return 0; } +static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, + struct damon_ctx *ctx) +{ + struct damon_target *t = damon_new_target(); + int err = -EINVAL; + + if (!t) + return -ENOMEM; + if (ctx->ops.id == DAMON_OPS_VADDR || + ctx->ops.id == DAMON_OPS_FVADDR) { + t->pid = find_get_pid(sys_target->pid); + if (!t->pid) + goto destroy_targets_out; + } + damon_add_target(ctx, t); + err = damon_sysfs_set_regions(t, sys_target->regions); + if (err) + goto destroy_targets_out; + return 0; + +destroy_targets_out: + damon_sysfs_destroy_targets(ctx); + return err; +} + static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { @@ -2130,28 +2155,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return -EINVAL; for (i = 0; i < sysfs_targets->nr; i++) { - struct damon_sysfs_target *sys_target = - sysfs_targets->targets_arr[i]; - struct damon_target *t = damon_new_target(); - - if (!t) { - damon_sysfs_destroy_targets(ctx); - return -ENOMEM; - } - if (ctx->ops.id == DAMON_OPS_VADDR || - ctx->ops.id == DAMON_OPS_FVADDR) { - t->pid = find_get_pid(sys_target->pid); - if (!t->pid) { - damon_sysfs_destroy_targets(ctx); - return -EINVAL; - } - } - damon_add_target(ctx, t); - err = damon_sysfs_set_regions(t, sys_target->regions); - if (err) { - damon_sysfs_destroy_targets(ctx); + err = damon_sysfs_add_target( + sysfs_targets->targets_arr[i], ctx); + if (err) return err; - } } return 0; } -- cgit v1.2.3 From 97d482f4592fde2322c319f07bc54f3a0d37861c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: mm/damon/sysfs: reuse damon_set_regions() for regions setting 'damon_set_regions()' is general enough so that it can also be used for only creating regions. This commit makes DAMON sysfs interface to reuse the function rather keeping two implementations for a same purpose. 
Link: https://lkml.kernel.org/r/20220429160606.127307-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a9d4f9389903..b85efe2bad78 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2096,28 +2096,31 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_regions *sysfs_regions) { - int i; + struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, + sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN); + int i, err = -EINVAL; + if (!ranges) + return -ENOMEM; for (i = 0; i < sysfs_regions->nr; i++) { struct damon_sysfs_region *sys_region = sysfs_regions->regions_arr[i]; - struct damon_region *prev, *r; if (sys_region->start > sys_region->end) - return -EINVAL; - r = damon_new_region(sys_region->start, sys_region->end); - if (!r) - return -ENOMEM; - damon_add_region(r, t); - if (damon_nr_regions(t) > 1) { - prev = damon_prev_region(r); - if (prev->ar.end > r->ar.start) { - damon_destroy_region(r, t); - return -EINVAL; - } - } + goto out; + + ranges[i].start = sys_region->start; + ranges[i].end = sys_region->end; + if (i == 0) + continue; + if (ranges[i - 1].end > ranges[i].start) + goto out; } - return 0; + err = damon_set_regions(t, ranges, sysfs_regions->nr); +out: + kfree(ranges); + return err; + } static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, -- cgit v1.2.3 From 3cbab4ca1ea8752912d8719d72059869b4c18045 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: mm/damon/sysfs: use enum for 'state' input handling DAMON sysfs 'state' file handling code is using string literals in both 'state_show()' and 'state_store()'. This makes the code error prone and inflexible for future extensions. To improve the situation, this commit defines possible input strings and 'enum' for identifying each input keyword only once, and refactors the code to reuse those. Link: https://lkml.kernel.org/r/20220429160606.127307-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index b85efe2bad78..446374fb112b 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2053,6 +2053,32 @@ static bool damon_sysfs_ctx_running(struct damon_ctx *ctx) return running; } +/* + * enum damon_sysfs_cmd - Commands for a specific kdamond. + */ +enum damon_sysfs_cmd { + /* @DAMON_SYSFS_CMD_ON: Turn the kdamond on. */ + DAMON_SYSFS_CMD_ON, + /* @DAMON_SYSFS_CMD_OFF: Turn the kdamond off. */ + DAMON_SYSFS_CMD_OFF, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: Update scheme stats sysfs + * files. + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. + */ + NR_DAMON_SYSFS_CMDS, +}; + +/* Should match with enum damon_sysfs_cmd */ +static const char * const damon_sysfs_cmd_strs[] = { + "on", + "off", + "update_schemes_stats", +}; + static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2066,7 +2092,9 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, else running = damon_sysfs_ctx_running(ctx); - return sysfs_emit(buf, "%s\n", running ? 
"on" : "off"); + return sysfs_emit(buf, "%s\n", running ? + damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : + damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]); } static int damon_sysfs_set_attrs(struct damon_ctx *ctx, @@ -2325,23 +2353,47 @@ static int damon_sysfs_update_schemes_stats(struct damon_sysfs_kdamond *kdamond) return 0; } +/* + * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond. + * @cmd: The command to handle. + * @kdamond: The kobject wrapper for the associated kdamond. + * + * This function handles a DAMON sysfs command for a kdamond. + * + * Return: 0 on success, negative error code otherwise. + */ +static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, + struct damon_sysfs_kdamond *kdamond) +{ + switch (cmd) { + case DAMON_SYSFS_CMD_ON: + return damon_sysfs_turn_damon_on(kdamond); + case DAMON_SYSFS_CMD_OFF: + return damon_sysfs_turn_damon_off(kdamond); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: + return damon_sysfs_update_schemes_stats(kdamond); + default: + break; + } + return -EINVAL; +} + static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); - ssize_t ret; + enum damon_sysfs_cmd cmd; + ssize_t ret = -EINVAL; if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; - if (sysfs_streq(buf, "on")) - ret = damon_sysfs_turn_damon_on(kdamond); - else if (sysfs_streq(buf, "off")) - ret = damon_sysfs_turn_damon_off(kdamond); - else if (sysfs_streq(buf, "update_schemes_stats")) - ret = damon_sysfs_update_schemes_stats(kdamond); - else - ret = -EINVAL; + for (cmd = 0; cmd < NR_DAMON_SYSFS_CMDS; cmd++) { + if (sysfs_streq(buf, damon_sysfs_cmd_strs[cmd])) { + ret = damon_sysfs_handle_cmd(cmd, kdamond); + break; + } + } mutex_unlock(&damon_sysfs_lock); if (!ret) ret = count; -- cgit v1.2.3 From 01538719c098f51ac0742ef610d127ba3e3d1e03 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:56 -0700 Subject: mm/damon/sysfs: update schemes stat in the kdamond context Only '->kdamond' and '->kdamond_stop' are protected by 'kdamond_lock' of 'struct damon_ctx'. All other DAMON context internal data items are recommended to be accessed in DAMON callbacks, or under some additional synchronizations. But, DAMON sysfs is accessing the schemes stat under 'kdamond_lock'. It makes no big issue as the read values are not used anywhere inside kernel, but would better to be fixed. This commit moves the reads to DAMON callback context, as supposed to be used for the purpose. Link: https://lkml.kernel.org/r/20220429160606.127307-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 135 insertions(+), 26 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 446374fb112b..f181f1d2e013 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2079,6 +2079,25 @@ static const char * const damon_sysfs_cmd_strs[] = { "update_schemes_stats", }; +/* + * struct damon_sysfs_cmd_request - A request to the DAMON callback. + * @cmd: The command that needs to be handled by the callback. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This structure represents a sysfs command request that need to access some + * DAMON context-internal data. 
Because DAMON context-internal data can be + * safely accessed from DAMON callbacks without additional synchronization, the + * request will be handled by the DAMON callback. None-``NULL`` @kdamond means + * the request is valid. + */ +struct damon_sysfs_cmd_request { + enum damon_sysfs_cmd cmd; + struct damon_sysfs_kdamond *kdamond; +}; + +/* Current DAMON callback request. Protected by damon_sysfs_lock. */ +static struct damon_sysfs_cmd_request damon_sysfs_cmd_request; + static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2258,6 +2277,70 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) mutex_unlock(&ctx->kdamond_lock); } +/* + * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This function reads the schemes stats of specific kdamond and update the + * related values for sysfs files. This function should be called from DAMON + * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON + * contexts-internal data and DAMON sysfs variables. + */ +static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + struct damon_sysfs_schemes *sysfs_schemes; + struct damos *scheme; + int schemes_idx = 0; + + if (!ctx) + return -EINVAL; + sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_stats *sysfs_stats; + + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } + return 0; +} + +/* + * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. + * @c: The DAMON context of the callback. + * + * This function is periodically called back from the kdamond thread for @c. + * Then, it checks if there is a waiting DAMON sysfs request and handles it. + */ +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) +{ + struct damon_sysfs_kdamond *kdamond; + int err = 0; + + /* avoid deadlock due to concurrent state_store('off') */ + if (!mutex_trylock(&damon_sysfs_lock)) + return 0; + kdamond = damon_sysfs_cmd_request.kdamond; + if (!kdamond || kdamond->damon_ctx != c) + goto out; + switch (damon_sysfs_cmd_request.cmd) { + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: + err = damon_sysfs_upd_schemes_stats(kdamond); + break; + default: + break; + } + /* Mark the request as invalid now. 
*/ + damon_sysfs_cmd_request.kdamond = NULL; +out: + mutex_unlock(&damon_sysfs_lock); + return err; +} + static struct damon_ctx *damon_sysfs_build_ctx( struct damon_sysfs_context *sys_ctx) { @@ -2280,6 +2363,8 @@ static struct damon_ctx *damon_sysfs_build_ctx( if (err) goto out; + ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; + ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; @@ -2296,6 +2381,8 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) if (kdamond->damon_ctx && damon_sysfs_ctx_running(kdamond->damon_ctx)) return -EBUSY; + if (damon_sysfs_cmd_request.kdamond == kdamond) + return -EBUSY; /* TODO: support multiple contexts per kdamond */ if (kdamond->contexts->nr != 1) return -EINVAL; @@ -2328,29 +2415,11 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) */ } -static int damon_sysfs_update_schemes_stats(struct damon_sysfs_kdamond *kdamond) +static inline bool damon_sysfs_kdamond_running( + struct damon_sysfs_kdamond *kdamond) { - struct damon_ctx *ctx = kdamond->damon_ctx; - struct damos *scheme; - int schemes_idx = 0; - - if (!ctx) - return -EINVAL; - mutex_lock(&ctx->kdamond_lock); - damon_for_each_scheme(scheme, ctx) { - struct damon_sysfs_schemes *sysfs_schemes; - struct damon_sysfs_stats *sysfs_stats; - - sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; - sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; - sysfs_stats->nr_tried = scheme->stat.nr_tried; - sysfs_stats->sz_tried = scheme->stat.sz_tried; - sysfs_stats->nr_applied = scheme->stat.nr_applied; - sysfs_stats->sz_applied = scheme->stat.sz_applied; - sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; - } - mutex_unlock(&ctx->kdamond_lock); - return 0; + return kdamond->damon_ctx && + damon_sysfs_ctx_running(kdamond->damon_ctx); } /* @@ -2358,24 +2427,58 @@ static int damon_sysfs_update_schemes_stats(struct damon_sysfs_kdamond *kdamond) * @cmd: The command to handle. * @kdamond: The kobject wrapper for the associated kdamond. * - * This function handles a DAMON sysfs command for a kdamond. + * This function handles a DAMON sysfs command for a kdamond. For commands + * that need to access running DAMON context-internal data, it requests + * handling of the command to the DAMON callback + * (@damon_sysfs_cmd_request_callback()) and wait until it is properly handled, + * or the context is completed. * * Return: 0 on success, negative error code otherwise. 
*/ static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, struct damon_sysfs_kdamond *kdamond) { + bool need_wait = true; + + /* Handle commands that doesn't access DAMON context-internal data */ switch (cmd) { case DAMON_SYSFS_CMD_ON: return damon_sysfs_turn_damon_on(kdamond); case DAMON_SYSFS_CMD_OFF: return damon_sysfs_turn_damon_off(kdamond); - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: - return damon_sysfs_update_schemes_stats(kdamond); default: break; } - return -EINVAL; + + /* Pass the command to DAMON callback for safe DAMON context access */ + if (damon_sysfs_cmd_request.kdamond) + return -EBUSY; + if (!damon_sysfs_kdamond_running(kdamond)) + return -EINVAL; + damon_sysfs_cmd_request.cmd = cmd; + damon_sysfs_cmd_request.kdamond = kdamond; + + /* + * wait until damon_sysfs_cmd_request_callback() handles the request + * from kdamond context + */ + mutex_unlock(&damon_sysfs_lock); + while (need_wait) { + schedule_timeout_idle(msecs_to_jiffies(100)); + if (!mutex_trylock(&damon_sysfs_lock)) + continue; + if (!damon_sysfs_cmd_request.kdamond) { + /* damon_sysfs_cmd_request_callback() handled */ + need_wait = false; + } else if (!damon_sysfs_kdamond_running(kdamond)) { + /* kdamond has already finished */ + need_wait = false; + damon_sysfs_cmd_request.kdamond = NULL; + } + mutex_unlock(&damon_sysfs_lock); + } + mutex_lock(&damon_sysfs_lock); + return 0; } static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -2512,6 +2615,12 @@ static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr)) return -EBUSY; + for (i = 0; i < kdamonds->nr; i++) { + if (damon_sysfs_cmd_request.kdamond == + kdamonds->kdamonds_arr[i]) + return -EBUSY; + } + damon_sysfs_kdamonds_rm_dirs(kdamonds); if (!nr_kdamonds) return 0; -- cgit v1.2.3 From da87878010e59869d4d27b3c01ecc8ec06ff4a20 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:56 -0700 Subject: mm/damon/sysfs: support online inputs update Currently, DAMON sysfs interface doesn't provide a way for adjusting DAMON input parameters while it is turned on. Therefore, users who want to reconfigure DAMON need to stop DAMON and restart. This means all the monitoring results that accumulated so far, which could be useful, should be flushed. This would be inefficient for many cases. For an example, let's suppose a sysadmin was running a DAMON-based Operation Scheme to find memory regions not accessed for more than 5 mins and page out the regions. If it turns out the 5 mins threshold was too long and therefore the sysadmin wants to reduce it to 4 mins, the sysadmin should turn off DAMON, restart it, and wait for at least 4 more minutes so that DAMON can find the cold memory regions, even though DAMON was knowing there are regions that not accessed for 4 mins at the time of shutdown. This commit makes DAMON sysfs interface to support online DAMON input parameters updates by adding a new input keyword for the 'state' DAMON sysfs file, 'commit'. Writing the keyword to the 'state' file while the corresponding kdamond is running makes the kdamond to read the sysfs file values again and update the DAMON context. 
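As a usage illustration (not part of the patch): from userspace, committing new inputs to a running kdamond boils down to rewriting the relevant sysfs files and then writing the new keyword to that kdamond's 'state' file. The sketch below assumes kdamond index 0 and keeps error handling minimal; the 'state' file path follows the DAMON sysfs ABI documented in this series.

    #include <fcntl.h>
    #include <unistd.h>

    static void commit_kdamond0_inputs(void)
    {
    	/* ... after rewriting the other sysfs files under kdamonds/0/ ... */
    	int fd = open("/sys/kernel/mm/damon/admin/kdamonds/0/state", O_WRONLY);

    	if (fd < 0)
    		return;
    	write(fd, "commit", 6);
    	close(fd);
    }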
Link: https://lkml.kernel.org/r/20220429160606.127307-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 9 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f181f1d2e013..09f9e8ca3d1f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2061,6 +2061,8 @@ enum damon_sysfs_cmd { DAMON_SYSFS_CMD_ON, /* @DAMON_SYSFS_CMD_OFF: Turn the kdamond off. */ DAMON_SYSFS_CMD_OFF, + /* @DAMON_SYSFS_CMD_COMMIT: Update kdamond inputs. */ + DAMON_SYSFS_CMD_COMMIT, /* * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: Update scheme stats sysfs * files. @@ -2076,6 +2078,7 @@ enum damon_sysfs_cmd { static const char * const damon_sysfs_cmd_strs[] = { "on", "off", + "commit", "update_schemes_stats", }; @@ -2195,6 +2198,39 @@ destroy_targets_out: return err; } +/* + * Search a target in a context that corresponds to the sysfs target input. + * + * Return: pointer to the target if found, NULL if not found, or negative + * error code if the search failed. + */ +static struct damon_target *damon_sysfs_existing_target( + struct damon_sysfs_target *sys_target, struct damon_ctx *ctx) +{ + struct pid *pid; + struct damon_target *t; + + if (ctx->ops.id == DAMON_OPS_PADDR) { + /* Up to only one target for paddr could exist */ + damon_for_each_target(t, ctx) + return t; + return NULL; + } + + /* ops.id should be DAMON_OPS_VADDR or DAMON_OPS_FVADDR */ + pid = find_get_pid(sys_target->pid); + if (!pid) + return ERR_PTR(-EINVAL); + damon_for_each_target(t, ctx) { + if (t->pid == pid) { + put_pid(pid); + return t; + } + } + put_pid(pid); + return NULL; +} + static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { @@ -2205,8 +2241,15 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return -EINVAL; for (i = 0; i < sysfs_targets->nr; i++) { - err = damon_sysfs_add_target( - sysfs_targets->targets_arr[i], ctx); + struct damon_sysfs_target *st = sysfs_targets->targets_arr[i]; + struct damon_target *t = damon_sysfs_existing_target(st, ctx); + + if (IS_ERR(t)) + return PTR_ERR(t); + if (!t) + err = damon_sysfs_add_target(st, ctx); + else + err = damon_sysfs_set_regions(t, st->regions); if (err) return err; } @@ -2309,6 +2352,48 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) return 0; } +static inline bool damon_sysfs_kdamond_running( + struct damon_sysfs_kdamond *kdamond) +{ + return kdamond->damon_ctx && + damon_sysfs_ctx_running(kdamond->damon_ctx); +} + +/* + * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. + * @kdamond: The kobject wrapper for the associated kdamond. + * + * If the sysfs input is wrong, the kdamond will be terminated. 
+ */ +static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + struct damon_sysfs_context *sys_ctx; + int err = 0; + + if (!damon_sysfs_kdamond_running(kdamond)) + return -EINVAL; + /* TODO: Support multiple contexts per kdamond */ + if (kdamond->contexts->nr != 1) + return -EINVAL; + + sys_ctx = kdamond->contexts->contexts_arr[0]; + + err = damon_select_ops(ctx, sys_ctx->ops_id); + if (err) + return err; + err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); + if (err) + return err; + err = damon_sysfs_set_targets(ctx, sys_ctx->targets); + if (err) + return err; + err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes); + if (err) + return err; + return err; +} + /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. * @c: The DAMON context of the callback. @@ -2331,6 +2416,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: err = damon_sysfs_upd_schemes_stats(kdamond); break; + case DAMON_SYSFS_CMD_COMMIT: + err = damon_sysfs_commit_input(kdamond); + break; default: break; } @@ -2415,13 +2503,6 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) */ } -static inline bool damon_sysfs_kdamond_running( - struct damon_sysfs_kdamond *kdamond) -{ - return kdamond->damon_ctx && - damon_sysfs_ctx_running(kdamond->damon_ctx); -} - /* * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond. * @cmd: The command to handle. -- cgit v1.2.3 From adc286e6bdd39e94f243cb109d73fb422368acd5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:56 -0700 Subject: Docs/{ABI,admin-guide}/damon: Update for 'state' sysfs file input keyword, 'commit' This commit documents the newly added 'state' sysfs file input keyword, 'commit', which allows online tuning of DAMON contexts. Link: https://lkml.kernel.org/r/20220429160606.127307-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 7 ++++--- Documentation/admin-guide/mm/damon/usage.rst | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index fab97ea22569..08b9df323560 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -23,9 +23,10 @@ Date: Mar 2022 Contact: SeongJae Park Description: Writing 'on' or 'off' to this file makes the kdamond starts or stops, respectively. Reading the file returns the keywords - based on the current status. Writing 'update_schemes_stats' to - the file updates contents of schemes stats files of the - kdamond. + based on the current status. Writing 'commit' to this file + makes the kdamond reads the user inputs in the sysfs files + except 'state' again. Writing 'update_schemes_stats' to the + file updates contents of schemes stats files of the kdamond. What: /sys/kernel/mm/damon/admin/kdamonds//pid Date: Mar 2022 diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 9c67311a79d8..1bb7b72414b2 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -121,10 +121,11 @@ In each kdamond directory, two files (``state`` and ``pid``) and one directory Reading ``state`` returns ``on`` if the kdamond is currently running, or ``off`` if it is not running. 
Writing ``on`` or ``off`` makes the kdamond be -in the state. Writing ``update_schemes_stats`` to ``state`` file updates the -contents of stats files for each DAMON-based operation scheme of the kdamond. -For details of the stats, please refer to :ref:`stats section -`. +in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the +user inputs in the sysfs files except ``state`` file again. Writing +``update_schemes_stats`` to ``state`` file updates the contents of stats files +for each DAMON-based operation scheme of the kdamond. For details of the +stats, please refer to :ref:`stats section `. If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread. -- cgit v1.2.3 From e035c280f6dfebf0cf7462a03d83294ca1b8c4e3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:56 -0700 Subject: mm/damon/reclaim: support online inputs update DAMON_RECLAIM reads the user input parameters only when it starts. To allow more efficient online tuning, this commit implements a new input parameter called 'commit_inputs'. Writing true to the parameter makes DAMON_RECLAIM reads the input parameters again. Link: https://lkml.kernel.org/r/20220429160606.127307-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 95 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 75cfd96a6060..f37c5d4b27fa 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -29,6 +29,18 @@ */ static bool enabled __read_mostly; +/* + * Make DAMON_RECLAIM reads the input parameters again, except ``enabled``. + * + * Input parameters that updated while DAMON_RECLAIM is running are not applied + * by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values + * of parametrs except ``enabled`` again. Once the re-reading is done, this + * parameter is set as ``N``. If invalid parameters are found while the + * re-reading, DAMON_RECLAIM will be disabled. + */ +static bool commit_inputs __read_mostly; +module_param(commit_inputs, bool, 0600); + /* * Time threshold for cold memory regions identification in microseconds. 
* @@ -289,57 +301,56 @@ static struct damos *damon_reclaim_new_scheme(void) return scheme; } -static int damon_reclaim_turn(bool on) +static int damon_reclaim_apply_parameters(void) { - struct damon_region *region; struct damos *scheme; - int err; - - if (!on) { - err = damon_stop(&ctx, 1); - if (!err) - kdamond_pid = -1; - return err; - } + struct damon_addr_range addr_range; + int err = 0; err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, min_nr_regions, max_nr_regions); if (err) return err; + /* Will be freed by next 'damon_set_schemes()' below */ + scheme = damon_reclaim_new_scheme(); + if (!scheme) + return -ENOMEM; + err = damon_set_schemes(ctx, &scheme, 1); + if (err) + return err; + if (monitor_region_start > monitor_region_end) return -EINVAL; if (!monitor_region_start && !monitor_region_end && !get_monitoring_region(&monitor_region_start, &monitor_region_end)) return -EINVAL; - /* DAMON will free this on its own when finish monitoring */ - region = damon_new_region(monitor_region_start, monitor_region_end); - if (!region) - return -ENOMEM; - damon_add_region(region, target); + addr_range.start = monitor_region_start; + addr_range.end = monitor_region_end; + return damon_set_regions(target, &addr_range, 1); +} - /* Will be freed by 'damon_set_schemes()' below */ - scheme = damon_reclaim_new_scheme(); - if (!scheme) { - err = -ENOMEM; - goto free_region_out; +static int damon_reclaim_turn(bool on) +{ + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; } - err = damon_set_schemes(ctx, &scheme, 1); + + err = damon_reclaim_apply_parameters(); if (err) - goto free_scheme_out; + return err; err = damon_start(&ctx, 1, true); - if (!err) { - kdamond_pid = ctx->kdamond->pid; - return 0; - } - -free_scheme_out: - damon_destroy_scheme(scheme); -free_region_out: - damon_destroy_region(region, target); - return err; + if (err) + return err; + kdamond_pid = ctx->kdamond->pid; + return 0; } #define ENABLE_CHECK_INTERVAL_MS 1000 @@ -389,6 +400,7 @@ MODULE_PARM_DESC(enabled, static int damon_reclaim_after_aggregation(struct damon_ctx *c) { struct damos *s; + int err = 0; /* update the stats parameter */ damon_for_each_scheme(s, c) { @@ -398,7 +410,23 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c) bytes_reclaimed_regions = s->stat.sz_applied; nr_quota_exceeds = s->stat.qt_exceeds; } - return 0; + + if (commit_inputs) { + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + } + return err; +} + +static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) +{ + int err = 0; + + if (commit_inputs) { + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + } + return err; } static int __init damon_reclaim_init(void) @@ -410,6 +438,7 @@ static int __init damon_reclaim_init(void) if (damon_select_ops(ctx, DAMON_OPS_PADDR)) return -EINVAL; + ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; target = damon_new_target(); -- cgit v1.2.3 From 81a84182c3430c8f5f7ccf9e95a10b99f727f727 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:56 -0700 Subject: Docs/admin-guide/mm/damon/reclaim: document 'commit_inputs' parameter This commit documents the new DAMON_RECLAIM parameter, 'commit_inputs' in its usage document. 
Link: https://lkml.kernel.org/r/20220429160606.127307-15-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/reclaim.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 0af51a9705b1..46306f1f34b1 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -66,6 +66,17 @@ Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could do no real monitoring and reclamation due to the watermarks-based activation condition. Refer to below descriptions for the watermarks parameter for this. +commit_inputs +------------- + +Make DAMON_RECLAIM reads the input parameters again, except ``enabled``. + +Input parameters that updated while DAMON_RECLAIM is running are not applied +by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values +of parametrs except ``enabled`` again. Once the re-reading is done, this +parameter is set as ``N``. If invalid parameters are found while the +re-reading, DAMON_RECLAIM will be disabled. + min_age ------- -- cgit v1.2.3 From 8a87d6959f0d81108d95b0dbd3d7dc2cecea853d Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 12 May 2022 20:22:51 -0700 Subject: mm/page_alloc: cache the result of node_dirty_ok() To spread dirty pages, nodes are checked whether they have reached the dirty limit using the expensive node_dirty_ok(). To reduce the frequency of calling node_dirty_ok(), the last node that hit the dirty limit can be cached. Instead of caching the node, caching both the node and its node_dirty_ok() status can reduce the number of calle to node_dirty_ok(). [akpm@linux-foundation.org: rename last_pgdat_dirty_limit to last_pgdat_dirty_ok] Link: https://lkml.kernel.org/r/20220430011032.64071-1-vvghjk1234@gmail.com Signed-off-by: Wonhyuk Yang Acked-by: Johannes Weiner Acked-by: Mel Gorman Cc: Donghyeok Kim Cc: JaeSang Yoo Cc: Jiyoup Kim Cc: Ohhoon Kwon Signed-off-by: Andrew Morton --- mm/page_alloc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e85a0dc22761..ddb0575c6b4a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4021,7 +4021,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, { struct zoneref *z; struct zone *zone; - struct pglist_data *last_pgdat_dirty_limit = NULL; + struct pglist_data *last_pgdat = NULL; + bool last_pgdat_dirty_ok = false; bool no_fallback; retry: @@ -4060,13 +4061,13 @@ retry: * dirty-throttling and the flusher threads. */ if (ac->spread_dirty_pages) { - if (last_pgdat_dirty_limit == zone->zone_pgdat) - continue; + if (last_pgdat != zone->zone_pgdat) { + last_pgdat = zone->zone_pgdat; + last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat); + } - if (!node_dirty_ok(zone->zone_pgdat)) { - last_pgdat_dirty_limit = zone->zone_pgdat; + if (!last_pgdat_dirty_ok) continue; - } } if (no_fallback && nr_online_nodes > 1 && -- cgit v1.2.3 From 679d103319101800567c8bb7e341b5eee39f6685 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:52 -0700 Subject: mm: introduce PTE_MARKER swap entry Patch series "userfaultfd-wp: Support shmem and hugetlbfs", v8. Overview ======== Userfaultfd-wp anonymous support was merged two years ago. 
There are quite a few applications that have started to leverage this capability, either to take snapshots of user-app memory or to use it for fully user-controlled swapping. This series tries to complete the feature for uffd-wp so as to cover all the RAM-based memory types. So far uffd-wp is the only missing piece compared to the rest of the features (uffd-missing & uffd-minor mode). One major reason to do so is that anonymous pages sometimes do not satisfy the needs of applications, and there are growing users of both shmem and hugetlbfs, either for sharing purposes (e.g., sharing guest mem between a hypervisor process and a device emulation process, or shmem local live migration for upgrades) or for performance on tlb hits. All this means that if a uffd-wp app wants to switch to any of those memory types, it will stop working. I think it's worthwhile to have the kernel cover all these aspects. This series chose to protect pages at the pte level, not the page level. One major reason is safety. I have no idea how we could make it safe if any uffd-privileged app can wr-protect a page that any other application can use. It means such an app could potentially block any process for as long as it wants. The other reason is that it aligns very well not only with the anonymous uffd-wp solution, but also with uffd as a whole. For example, userfaultfd is implemented fundamentally based on VMAs. We set flags on VMAs showing the status of uffd tracking. Another, per-page based protection solution would cross that VMA-based foundation line, and it could simply be too far away already from what's called userfaultfd. PTE markers =========== The patchset is based on the idea called PTE markers. It was discussed in one of the mm alignment sessions, proposed starting from v6, and this is the 2nd version of it using the PTE marker idea. A PTE marker is a new type of swap entry that is only applicable to file-backed memories like shmem and hugetlbfs. It's used to persist some pte-level information even if the original present ptes in the pgtable are zapped. Logically pte markers can store more than uffd-wp information, but so far only one bit is used for the uffd-wp purpose. When the pte marker is installed with the uffd-wp bit set, it means this pte is wr-protected by uffd. It solves the problem that when, e.g., file-backed memory-mapped ptes get zapped for any reason (e.g. thp split, or swapped out), we can still keep the wr-protect information in the ptes. Then when the page fault triggers again, we'll know this pte is wr-protected so we can treat the pte the same as a normal uffd wr-protected pte. The extra information is encoded into the swap entry, or swp_offset to be explicit, with the swp_type being PTE_MARKER. So far uffd-wp only uses one bit out of the swap entry; the rest of the swp_offset bits are still reserved for other purposes. There are two configs to enable/disable PTE markers: CONFIG_PTE_MARKER CONFIG_PTE_MARKER_UFFD_WP We can set !PTE_MARKER to completely disable all the PTE markers, along with uffd-wp support. I made two configs so we can also enable PTE marker but disable uffd-wp file-backed support for other purposes. At the end of the current series, I'll enable CONFIG_PTE_MARKER by default, but that patch is standalone and if anyone worries about having it by default, we can also consider turning it off by dropping that oneliner patch. So far I don't see a huge risk in doing so, so I kept that patch. In most cases, PTE markers should be treated as none ptes.
This is because, unlike most of the other swap entry types, there is no PFN or block offset information encoded into PTE markers, only some extra well-defined bits showing the status of the pte. These bits should only be used as extra data when servicing an upcoming page fault, and then we behave as if it's a none pte. I did spend a lot of time observing all the pte_none() users this time. It is indeed a challenge because there are a lot of them, and I hope I didn't miss a single one of them where we should take care of pte markers. Luckily, I don't think they will need to be considered in many cases, for example: boot code, arch code (especially non-x86), kernel-only page handling (e.g. CPA), or device driver code where we're dealing with pure PFN mappings. I introduced pte_none_mostly() in this series for where we need to handle pte markers the same as none ptes; the "mostly" is another way to write "either a none pte or a pte marker". I didn't replace pte_none() to cover pte markers for the reasons below: - Very few pte_none() callers will handle pte markers. E.g., all the kernel pages do not require knowledge of pte markers. So we don't pollute the major use cases. - Unconditionally changing pte_none() semantics could confuse people, because pte_none() has existed for such a long time. - Unconditionally changing pte_none() semantics could make pte_none() slower even if in many cases pte markers do not exist. - There are cases where we'd like to handle pte markers differently from pte_none(), so a full replacement is also impossible. E.g. khugepaged should still treat pte markers as normal swap ptes rather than none ptes, because pte markers will always need a fault-in to merge the marker with a valid pte. Or the smaps code will need to parse PTE markers, not none ptes. Patch Layout ============ Introducing PTE marker and uffd-wp bit in PTE marker: mm: Introduce PTE_MARKER swap entry mm: Teach core mm about pte markers mm: Check against orig_pte for finish_fault() mm/uffd: PTE_MARKER_UFFD_WP Adding support for shmem uffd-wp: mm/shmem: Take care of UFFDIO_COPY_MODE_WP mm/shmem: Handle uffd-wp special pte in page fault handler mm/shmem: Persist uffd-wp bit across zapping for file-backed mm/shmem: Allow uffd wr-protect none pte for file-backed mem mm/shmem: Allows file-back mem to be uffd wr-protected on thps mm/shmem: Handle uffd-wp during fork() Adding support for hugetlbfs uffd-wp: mm/hugetlb: Introduce huge pte version of uffd-wp helpers mm/hugetlb: Hook page faults for uffd write protection mm/hugetlb: Take care of UFFDIO_COPY_MODE_WP mm/hugetlb: Handle UFFDIO_WRITEPROTECT mm/hugetlb: Handle pte markers in page faults mm/hugetlb: Allow uffd wr-protect none ptes mm/hugetlb: Only drop uffd-wp special pte if required mm/hugetlb: Handle uffd-wp during fork() Misc handling on the rest mm for uffd-wp file-backed: mm/khugepaged: Don't recycle vma pgtable if uffd-wp registered mm/pagemap: Recognize uffd-wp bit for shmem/hugetlbfs Enabling of uffd-wp on file-backed memory: mm/uffd: Enable write protection for shmem & hugetlbfs mm: Enable PTE markers by default selftests/uffd: Enable uffd-wp for shmem/hugetlbfs Tests ===== - Compile test on x86_64 and aarch64 on different configs - Kernel selftests - uffd-test [0] - Umapsort [1,2] test for shmem/hugetlb, with swap on/off [0] https://github.com/xzpeter/clibs/tree/master/uffd-test [1] https://github.com/xzpeter/umap-apps/tree/peter [2] https://github.com/xzpeter/umap/tree/peter-shmem-hugetlbfs This patch (of 23): Introduces a new swap entry type called PTE_MARKER.
It can be installed for any pte that maps a file-backed memory when the pte is temporarily zapped, so as to maintain per-pte information. The information that kept in the pte is called a "marker". Here we define the marker as "unsigned long" just to match pgoff_t, however it will only work if it still fits in swp_offset(), which is e.g. currently 58 bits on x86_64. A new config CONFIG_PTE_MARKER is introduced too; it's by default off. A bunch of helpers are defined altogether to service the rest of the pte marker code. [peterx@redhat.com: fixup] Link: https://lkml.kernel.org/r/Yk2rdB7SXZf+2BDF@xz-m1.local Link: https://lkml.kernel.org/r/20220405014646.13522-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20220405014646.13522-2-peterx@redhat.com Signed-off-by: Peter Xu Cc: Mike Kravetz Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Alistair Popple Cc: Nadav Amit Cc: Axel Rasmussen Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Jerome Glisse Cc: Mike Rapoport Signed-off-by: Andrew Morton --- arch/s390/include/asm/hugetlb.h | 5 +++ include/asm-generic/hugetlb.h | 9 +++++ include/linux/swap.h | 15 +++++++- include/linux/swapops.h | 78 +++++++++++++++++++++++++++++++++++++++++ mm/Kconfig | 6 ++++ 5 files changed, 112 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index bea47e7cc6a0..f0bc4d31e852 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -85,6 +85,11 @@ static inline int huge_pte_none(pte_t pte) return pte_none(pte); } +static inline int huge_pte_none_mostly(pte_t pte) +{ + return huge_pte_none(pte); +} + static inline int huge_pte_write(pte_t pte) { return pte_write(pte); diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 8e1e6244a89d..f39cad20ffc6 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -2,6 +2,9 @@ #ifndef _ASM_GENERIC_HUGETLB_H #define _ASM_GENERIC_HUGETLB_H +#include +#include + static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) { return mk_pte(page, pgprot); @@ -80,6 +83,12 @@ static inline int huge_pte_none(pte_t pte) } #endif +/* Please refer to comments above pte_none_mostly() for the usage */ +static inline int huge_pte_none_mostly(pte_t pte) +{ + return huge_pte_none(pte) || is_pte_marker(pte); +} + #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT static inline pte_t huge_pte_wrprotect(pte_t pte) { diff --git a/include/linux/swap.h b/include/linux/swap.h index 7daae5a4b3e1..5553189d0215 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,6 +55,19 @@ static inline int current_is_kswapd(void) * actions on faults. */ +/* + * PTE markers are used to persist information onto PTEs that are mapped with + * file-backed memories. As its name "PTE" hints, it should only be applied to + * the leaves of pgtables. + */ +#ifdef CONFIG_PTE_MARKER +#define SWP_PTE_MARKER_NUM 1 +#define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ + SWP_MIGRATION_NUM + SWP_DEVICE_NUM) +#else +#define SWP_PTE_MARKER_NUM 0 +#endif + /* * Unaddressable device memory support. See include/linux/hmm.h and * Documentation/vm/hmm.rst. Short description is we need struct pages for @@ -107,7 +120,7 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM) + SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. 
The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 82265b055a0d..bb70da461b7d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -274,6 +274,84 @@ static inline int is_readable_migration_entry(swp_entry_t entry) #endif +typedef unsigned long pte_marker; + +#define PTE_MARKER_MASK (0) + +#ifdef CONFIG_PTE_MARKER + +static inline swp_entry_t make_pte_marker_entry(pte_marker marker) +{ + return swp_entry(SWP_PTE_MARKER, marker); +} + +static inline bool is_pte_marker_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_PTE_MARKER; +} + +static inline pte_marker pte_marker_get(swp_entry_t entry) +{ + return swp_offset(entry) & PTE_MARKER_MASK; +} + +static inline bool is_pte_marker(pte_t pte) +{ + return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); +} + +#else /* CONFIG_PTE_MARKER */ + +static inline swp_entry_t make_pte_marker_entry(pte_marker marker) +{ + /* This should never be called if !CONFIG_PTE_MARKER */ + WARN_ON_ONCE(1); + return swp_entry(0, 0); +} + +static inline bool is_pte_marker_entry(swp_entry_t entry) +{ + return false; +} + +static inline pte_marker pte_marker_get(swp_entry_t entry) +{ + return 0; +} + +static inline bool is_pte_marker(pte_t pte) +{ + return false; +} + +#endif /* CONFIG_PTE_MARKER */ + +static inline pte_t make_pte_marker(pte_marker marker) +{ + return swp_entry_to_pte(make_pte_marker_entry(marker)); +} + +/* + * This is a special version to check pte_none() just to cover the case when + * the pte is a pte marker. It existed because in many cases the pte marker + * should be seen as a none pte; it's just that we have stored some information + * onto the none pte so it becomes not-none any more. + * + * It should be used when the pte is file-backed, ram-based and backing + * userspace pages, like shmem. It is not needed upon pgtables that do not + * support pte markers at all. For example, it's not needed on anonymous + * memory, kernel-only memory (including when the system is during-boot), + * non-ram based generic file-system. It's fine to be used even there, but the + * extra pte marker check will be pure overhead. + * + * For systems configured with !CONFIG_PTE_MARKER this will be automatically + * optimized to pte_none(). + */ +static inline int pte_none_mostly(pte_t pte) +{ + return pte_none(pte) || is_pte_marker(pte); +} + static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset(entry)); diff --git a/mm/Kconfig b/mm/Kconfig index 3f7b6d7b69df..2262f211b53e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -909,6 +909,12 @@ config ANON_VMA_NAME area from being merged with adjacent virtual memory areas due to the difference in their name. +config PTE_MARKER + bool "Marker PTEs support" + + help + Allows to create marker PTEs for file-backed memory. + source "mm/damon/Kconfig" endmenu -- cgit v1.2.3 From 5c041f5d1f23d3a172dd0db3215634c484b4acd6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:52 -0700 Subject: mm: teach core mm about pte markers This patch still does not use pte marker in any way, however it teaches the core mm about the pte marker idea. For example, handle_pte_marker() is introduced that will parse and handle all the pte marker faults. Many of the places are more about commenting it up - so that we know there's the possibility of pte marker showing up, and why we don't need special code for the cases. 
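For orientation before the diff: the dispatch this patch teaches the core mm can be summarized with the small sketch below. It is illustrative only and not part of the patch; every helper it uses is either a long-standing kernel primitive or one of the PTE_MARKER helpers introduced in the previous patch.

	/* Illustrative sketch, not part of the diff below. */
	static const char *classify_pte(pte_t pte)
	{
		swp_entry_t entry;

		if (pte_none(pte))
			return "none";		/* truly empty pte */
		if (pte_present(pte))
			return "present";	/* maps a page as usual */

		entry = pte_to_swp_entry(pte);
		if (is_pte_marker_entry(entry)) {
			/*
			 * "Mostly none": nothing is mapped, but the marker
			 * bits must be honoured at the next fault, which is
			 * why pte_none_mostly() returns true here.
			 */
			return "pte marker";
		}

		/* swap, migration, hwpoison, device private/exclusive, ... */
		return "other swap entry";
	}

In other words, pte_none() keeps meaning "completely empty", pte_none_mostly() additionally covers markers, and paths that must act on the marker bits (such as handle_pte_marker() introduced here) go through is_pte_marker_entry()/pte_marker_get().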
[peterx@redhat.com: userfaultfd.c needs swapops.h] Link: https://lkml.kernel.org/r/YmRlVj3cdizYJsr0@xz-m1.local Link: https://lkml.kernel.org/r/20220405014833.14015-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 11 +++++++---- mm/filemap.c | 5 +++++ mm/hmm.c | 2 +- mm/memcontrol.c | 8 ++++++-- mm/memory.c | 23 +++++++++++++++++++++++ mm/mincore.c | 3 ++- mm/mprotect.c | 3 +++ 7 files changed, 47 insertions(+), 8 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index aa0c47cb0d16..78b68e0f9774 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -29,6 +29,7 @@ #include #include #include +#include int sysctl_unprivileged_userfaultfd __read_mostly; @@ -249,9 +250,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. + * changes under us. PTE markers should be handled the same as none + * ptes here. */ - if (huge_pte_none(pte)) + if (huge_pte_none_mostly(pte)) ret = true; if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) ret = true; @@ -330,9 +332,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pte = pte_offset_map(pmd, address); /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. + * changes under us. PTE markers should be handled the same as none + * ptes here. */ - if (pte_none(*pte)) + if (pte_none_mostly(*pte)) ret = true; if (!pte_write(*pte) && (reason & VM_UFFD_WP)) ret = true; diff --git a/mm/filemap.c b/mm/filemap.c index 9a1eef6c5d35..de34c6c10384 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3376,6 +3376,11 @@ again: vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; + /* + * NOTE: If there're PTE markers, we'll leave them to be + * handled in the specific fault path, and it'll prohibit the + * fault-around logic. + */ if (!pte_none(*vmf->pte)) goto unlock; diff --git a/mm/hmm.c b/mm/hmm.c index af71aac3140e..3fd3242c5e50 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -239,7 +239,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, pte_t pte = *ptep; uint64_t pfn_req_flags = *hmm_pfn; - if (pte_none(pte)) { + if (pte_none_mostly(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 235c85e17c2d..6e74ca98c862 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5640,10 +5640,14 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (pte_present(ptent)) page = mc_handle_present_pte(vma, addr, ptent); + else if (pte_none_mostly(ptent)) + /* + * PTE markers should be treated as a none pte here, separated + * from other swap handling below. 
+ */ + page = mc_handle_file_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); - else if (pte_none(ptent)) - page = mc_handle_file_pte(vma, addr, ptent); if (!page && !ent.val) return ret; diff --git a/mm/memory.c b/mm/memory.c index 90212057a546..9743c8b74bf2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -100,6 +100,8 @@ struct page *mem_map; EXPORT_SYMBOL(mem_map); #endif +static vm_fault_t do_fault(struct vm_fault *vmf); + /* * A number of key systems in x86 including ioremap() rely on the assumption * that high_memory defines the upper bound on direct map memory, then end @@ -1415,6 +1417,8 @@ again: if (!should_zap_page(details, page)) continue; rss[mm_counter(page)]--; + } else if (is_pte_marker_entry(entry)) { + /* By default, simply drop all pte markers when zap */ } else if (is_hwpoison_entry(entry)) { if (!should_zap_cows(details)) continue; @@ -3555,6 +3559,23 @@ static inline bool should_try_to_free_swap(struct page *page, page_count(page) == 2; } +static vm_fault_t handle_pte_marker(struct vm_fault *vmf) +{ + swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); + unsigned long marker = pte_marker_get(entry); + + /* + * PTE markers should always be with file-backed memories, and the + * marker should never be empty. If anything weird happened, the best + * thing to do is to kill the process along with its mm. + */ + if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker)) + return VM_FAULT_SIGBUS; + + /* TODO: handle pte markers */ + return 0; +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -3592,6 +3613,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; + } else if (is_pte_marker_entry(entry)) { + ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; diff --git a/mm/mincore.c b/mm/mincore.c index f4f627325e12..fa200c14185f 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -122,7 +122,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (; addr != end; ptep++, addr += PAGE_SIZE) { pte_t pte = *ptep; - if (pte_none(pte)) + /* We need to do cache lookup too for pte markers */ + if (pte_none_mostly(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); else if (pte_present(pte)) diff --git a/mm/mprotect.c b/mm/mprotect.c index 20a46f21cca8..e84694267b0f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -193,6 +193,9 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, newpte = pte_swp_mksoft_dirty(newpte); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_pte_marker_entry(entry)) { + /* Skip it, the same as none pte */ + continue; } else { newpte = oldpte; } -- cgit v1.2.3 From f46f2adecdcc1ba0799383e67fe98f65f41fea5c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:52 -0700 Subject: mm: check against orig_pte for finish_fault() This patch allows do_fault() to trigger on !pte_none() cases too. This prepares for the pte markers to be handled by do_fault() just like none pte. To achieve this, instead of unconditionally check against pte_none() in finish_fault(), we may hit the case that the orig_pte was some pte marker so what we want to do is to replace the pte marker with some valid pte entry. 
Then if orig_pte was set we'd want to check the current *pte (under pgtable lock) against orig_pte rather than none pte. Right now there's no solid way to safely reference orig_pte because when pmd is not allocated handle_pte_fault() will not initialize orig_pte, so it's not safe to reference it. There's another solution proposed before this patch to do pte_clear() for vmf->orig_pte for pmd==NULL case, however it turns out it'll break arm32 because arm32 could have assumption that pte_t* pointer will always reside on a real ram32 pgtable, not any kernel stack variable. To solve this, we add a new flag FAULT_FLAG_ORIG_PTE_VALID, and it'll be set along with orig_pte when there is valid orig_pte, or it'll be cleared when orig_pte was not initialized. It'll be updated every time we call handle_pte_fault(), so e.g. if a page fault retry happened it'll be properly updated along with orig_pte. [1] https://lore.kernel.org/lkml/710c48c9-406d-e4c5-a394-10501b951316@samsung.com/ [akpm@linux-foundation.org: coding-style cleanups] [peterx@redhat.com: fix crash reported by Marek] Link: https://lkml.kernel.org/r/Ylb9rXJyPm8/ao8f@xz-m1.local Link: https://lkml.kernel.org/r/20220405014836.14077-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Alistair Popple Tested-by: Marek Szyprowski Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 3 +++ mm/memory.c | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7216d77a5884..dd382270ae40 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -822,6 +822,8 @@ typedef struct { * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to unshare (and mark * exclusive) a possibly shared anonymous page that is * mapped R/O. + * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. + * We should only access orig_pte if this flag set. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two @@ -858,6 +860,7 @@ enum fault_flag { FAULT_FLAG_INSTRUCTION = 1 << 8, FAULT_FLAG_INTERRUPTIBLE = 1 << 9, FAULT_FLAG_UNSHARE = 1 << 10, + FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, }; #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/memory.c b/mm/memory.c index 9743c8b74bf2..878da420d97b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4183,6 +4183,14 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } +static bool vmf_pte_changed(struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) + return !pte_same(*vmf->pte, vmf->orig_pte); + + return !pte_none(*vmf->pte); +} + /** * finish_fault - finish page fault once we have prepared the page to fault * @@ -4241,7 +4249,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) vmf->address, &vmf->ptl); ret = 0; /* Re-check under ptl */ - if (likely(pte_none(*vmf->pte))) + if (likely(!vmf_pte_changed(vmf))) do_set_pte(vmf, page, vmf->address); else ret = VM_FAULT_NOPAGE; @@ -4709,6 +4717,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) * concurrent faults and from rmap lookups. */ vmf->pte = NULL; + vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; } else { /* * If a huge pmd materialized under us just retry later. 
Use @@ -4732,6 +4741,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) */ vmf->pte = pte_offset_map(vmf->pmd, vmf->address); vmf->orig_pte = *vmf->pte; + vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; /* * some architectures can have larger ptes than wordsize, -- cgit v1.2.3 From 1db9dbc2ef05205bf1022f9b14aa29b1dd8efd7e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:52 -0700 Subject: mm/uffd: PTE_MARKER_UFFD_WP This patch introduces the 1st user of pte marker: the uffd-wp marker. When the pte marker is installed with the uffd-wp bit set, it means this pte was wr-protected by uffd. We will use this special pte to arm the ptes that got either unmapped or swapped out for a file-backed region that was previously wr-protected. This special pte could trigger a page fault just like swap entries. This idea is greatly inspired by Hugh and Andrea in the discussion, which is referenced in the links below. Some helpers are introduced to detect whether a swap pte is uffd wr-protected. After the pte marker introduced, one swap pte can be wr-protected in two forms: either it is a normal swap pte and it has _PAGE_SWP_UFFD_WP set, or it's a pte marker that has PTE_MARKER_UFFD_WP set. [peterx@redhat.com: fixup] Link: https://lkml.kernel.org/r/YkzKiM8tI4+qOfXF@xz-m1.local Link: https://lore.kernel.org/lkml/20201126222359.8120-1-peterx@redhat.com/ Link: https://lore.kernel.org/lkml/20201130230603.46187-1-peterx@redhat.com/ Link: https://lkml.kernel.org/r/20220405014838.14131-1-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Andrea Arcangeli Suggested-by: Hugh Dickins Cc: Alistair Popple Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/swapops.h | 3 ++- include/linux/userfaultfd_k.h | 47 +++++++++++++++++++++++++++++++++++++++++++ mm/Kconfig | 9 +++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bb70da461b7d..09945342bf76 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -276,7 +276,8 @@ static inline int is_readable_migration_entry(swp_entry_t entry) typedef unsigned long pte_marker; -#define PTE_MARKER_MASK (0) +#define PTE_MARKER_UFFD_WP BIT(0) +#define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP) #ifdef CONFIG_PTE_MARKER diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 33cea484d1ad..3354d9ddc35f 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -15,6 +15,8 @@ #include #include +#include +#include #include /* The set of all possible UFFD-related VM flags. */ @@ -236,4 +238,49 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, #endif /* CONFIG_USERFAULTFD */ +static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + return is_pte_marker_entry(entry) && + (pte_marker_get(entry) & PTE_MARKER_UFFD_WP); +#else + return false; +#endif +} + +static inline bool pte_marker_uffd_wp(pte_t pte) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + swp_entry_t entry; + + if (!is_swap_pte(pte)) + return false; + + entry = pte_to_swp_entry(pte); + + return pte_marker_entry_uffd_wp(entry); +#else + return false; +#endif +} + +/* + * Returns true if this is a swap pte and was uffd-wp wr-protected in either + * forms (pte marker or a normal swap pte), false otherwise. 
+ */ +static inline bool pte_swp_uffd_wp_any(pte_t pte) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + if (!is_swap_pte(pte)) + return false; + + if (pte_swp_uffd_wp(pte)) + return true; + + if (pte_marker_uffd_wp(pte)) + return true; +#endif + return false; +} + #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 2262f211b53e..71c653260a44 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -915,6 +915,15 @@ config PTE_MARKER help Allows to create marker PTEs for file-backed memory. +config PTE_MARKER_UFFD_WP + bool "Marker PTEs support for userfaultfd write protection" + depends on PTE_MARKER && HAVE_ARCH_USERFAULTFD_WP + + help + Allows to create marker PTEs for userfaultfd write protection + purposes. It is required to enable userfaultfd write protection on + file-backed memory types like shmem and hugetlbfs. + source "mm/damon/Kconfig" endmenu -- cgit v1.2.3 From 8ee79edff6d3b43b2d0c1e41f92b32e128988b22 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:52 -0700 Subject: mm/shmem: take care of UFFDIO_COPY_MODE_WP Pass wp_copy into shmem_mfill_atomic_pte() through the stack, then apply the UFFD_WP bit properly when the UFFDIO_COPY on shmem is with UFFDIO_COPY_MODE_WP. wp_copy lands mfill_atomic_install_pte() finally. Note: we must do pte_wrprotect() if !writable in mfill_atomic_install_pte(), as mk_pte() could return a writable pte (e.g., when VM_SHARED on a shmem file). Link: https://lkml.kernel.org/r/20220405014841.14185-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 4 ++-- mm/shmem.c | 4 ++-- mm/userfaultfd.c | 23 ++++++++++++++++++----- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 3e915cc550bc..a68f982f22d1 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -145,11 +145,11 @@ extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - bool zeropage, + bool zeropage, bool wp_copy, struct page **pagep); #else /* !CONFIG_SHMEM */ #define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ - src_addr, zeropage, pagep) ({ BUG(); 0; }) + src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ diff --git a/mm/shmem.c b/mm/shmem.c index 913c0b039333..4a4a2e628289 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2319,7 +2319,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - bool zeropage, + bool zeropage, bool wp_copy, struct page **pagep) { struct inode *inode = file_inode(dst_vma->vm_file); @@ -2392,7 +2392,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, goto out_release; ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - page, true, false); + page, true, wp_copy); if (ret) goto out_delete_from_cache; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9cdaed939203..2b6ec3352147 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -78,10 +78,19 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, * Always mark a PTE as write-protected when needed, regardless of * VM_WRITE, which the user might change. 
*/ - if (wp_copy) + if (wp_copy) { _dst_pte = pte_mkuffd_wp(_dst_pte); - else if (writable) + writable = false; + } + + if (writable) _dst_pte = pte_mkwrite(_dst_pte); + else + /* + * We need this to make sure write bit removed; as mk_pte() + * could return a pte with write bit set. + */ + _dst_pte = pte_wrprotect(_dst_pte); dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); @@ -96,7 +105,12 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, } ret = -EEXIST; - if (!pte_none(*dst_pte)) + /* + * We allow to overwrite a pte marker: consider when both MISSING|WP + * registered, we firstly wr-protect a none pte which has no page cache + * page backing it, then access the page. + */ + if (!pte_none_mostly(*dst_pte)) goto out_unlock; if (page_in_cache) { @@ -480,11 +494,10 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); } else { - VM_WARN_ON_ONCE(wp_copy); err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, mode != MCOPY_ATOMIC_NORMAL, - page); + wp_copy, page); } return err; -- cgit v1.2.3 From 9c28a205c06123b9f0a0c4d819ece9f5f552d004 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:53 -0700 Subject: mm/shmem: handle uffd-wp special pte in page fault handler File-backed memories are prone to unmap/swap so the ptes are always unstable, because they can be easily faulted back later using the page cache. This could lead to uffd-wp getting lost when unmapping or swapping out such memory. One example is shmem. PTE markers are needed to store those information. This patch prepares it by handling uffd-wp pte markers first it is applied elsewhere, so that the page fault handler can recognize uffd-wp pte markers. The handling of uffd-wp pte markers is similar to missing fault, it's just that we'll handle this "missing fault" when we see the pte markers, meanwhile we need to make sure the marker information is kept during processing the fault. This is a slow path of uffd-wp handling, because zapping of wr-protected shmem ptes should be rare. So far it should only trigger in two conditions: (1) When trying to punch holes in shmem_fallocate(), there is an optimization to zap the pgtables before evicting the page. (2) When swapping out shmem pages. Because of this, the page fault handling is simplifed too by not sending the wr-protect message in the 1st page fault, instead the page will be installed read-only, so the uffd-wp message will be generated in the next fault, which will trigger the do_wp_page() path of general uffd-wp handling. Disable fault-around for all uffd-wp registered ranges for extra safety just like uffd-minor fault, and clean the code up. Link: https://lkml.kernel.org/r/20220405014844.14239-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . 
Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 17 +++++++++++ mm/memory.c | 67 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 75 insertions(+), 9 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3354d9ddc35f..e7afcdfd4b46 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -96,6 +96,18 @@ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } +/* + * Don't do fault around for either WP or MINOR registered uffd range. For + * MINOR registered range, fault around will be a total disaster and ptes can + * be installed without notifications; for WP it should mostly be fine as long + * as the fault around checks for pte_none() before the installation, however + * to be super safe we just forbid it. + */ +static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); +} + static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; @@ -236,6 +248,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } +static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) diff --git a/mm/memory.c b/mm/memory.c index 878da420d97b..c16f873373a2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3559,6 +3559,39 @@ static inline bool should_try_to_free_swap(struct page *page, page_count(page) == 2; } +static vm_fault_t pte_marker_clear(struct vm_fault *vmf) +{ + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + /* + * Be careful so that we will only recover a special uffd-wp pte into a + * none pte. Otherwise it means the pte could have changed, so retry. + */ + if (is_pte_marker(*vmf->pte)) + pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; +} + +/* + * This is actually a page-missing access, but with uffd-wp special pte + * installed. It means this pte was wr-protected before being unmapped. + */ +static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) +{ + /* + * Just in case there're leftover special ptes even after the region + * got unregistered - we can simply clear them. We can also do that + * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp + * ranges, but it should be more efficient to be done lazily here. 
+ */ + if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma))) + return pte_marker_clear(vmf); + + /* do_fault() can handle pte markers too like none pte */ + return do_fault(vmf); +} + static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); @@ -3572,8 +3605,11 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker)) return VM_FAULT_SIGBUS; - /* TODO: handle pte markers */ - return 0; + if (pte_marker_entry_uffd_wp(entry)) + return pte_marker_handle_uffd_wp(vmf); + + /* This is an unknown pte marker */ + return VM_FAULT_SIGBUS; } /* @@ -4157,6 +4193,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; + bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); bool write = vmf->flags & FAULT_FLAG_WRITE; bool prefault = vmf->address != addr; pte_t entry; @@ -4171,6 +4208,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (unlikely(uffd_wp)) + entry = pte_mkuffd_wp(pte_wrprotect(entry)); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); @@ -4352,9 +4391,21 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); } +/* Return true if we should do read fault-around, false otherwise */ +static inline bool should_fault_around(struct vm_fault *vmf) +{ + /* No ->map_pages? No way to fault around... */ + if (!vmf->vma->vm_ops->map_pages) + return false; + + if (uffd_disable_fault_around(vmf->vma)) + return false; + + return fault_around_bytes >> PAGE_SHIFT > 1; +} + static vm_fault_t do_read_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; /* @@ -4362,12 +4413,10 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - if (likely(!userfaultfd_minor(vmf->vma))) { - ret = do_fault_around(vmf); - if (ret) - return ret; - } + if (should_fault_around(vmf)) { + ret = do_fault_around(vmf); + if (ret) + return ret; } ret = __do_fault(vmf); -- cgit v1.2.3 From 999dad824c39ed14dee7c4412aae531ba9e74a90 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:53 -0700 Subject: mm/shmem: persist uffd-wp bit across zapping for file-backed File-backed memory is prone to being unmapped at any time. It means all information in the pte will be dropped, including the uffd-wp flag. To persist the uffd-wp flag, we'll use the pte markers. This patch teaches the zap code to understand uffd-wp and know when to keep or drop the uffd-wp bit. Add a new flag ZAP_FLAG_DROP_MARKER and set it in zap_details when we don't want to persist such an information, for example, when destroying the whole vma, or punching a hole in a shmem file. For the rest cases we should never drop the uffd-wp bit, or the wr-protect information will get lost. The new ZAP_FLAG_DROP_MARKER needs to be put into mm.h rather than memory.c because it'll be further referenced in hugetlb files later. 
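Condensed, for readability of the diff below: the rule being added boils down to the following sketch. The function name is invented for illustration; the real logic lives in zap_install_uffd_wp_if_needed() and pte_install_uffd_wp_if_needed() in the patch.

	/* Illustrative sketch only; see the real helpers in the diff. */
	static void keep_uffd_wp_on_zap(struct vm_area_struct *vma,
					unsigned long addr, pte_t *pte,
					pte_t old_pte, zap_flags_t zap_flags)
	{
		/* Whole-range truncate/unmap: drop the marker as well. */
		if (zap_flags & ZAP_FLAG_DROP_MARKER)
			return;

		/* Only uffd-wp registered, file-backed vmas need a marker. */
		if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
			return;

		/* Was the old pte wr-protected in any form? */
		if ((pte_present(old_pte) && pte_uffd_wp(old_pte)) ||
		    pte_swp_uffd_wp_any(old_pte))
			set_pte_at(vma->vm_mm, addr, pte,
				   make_pte_marker(PTE_MARKER_UFFD_WP));
	}

The caller is expected to have already cleared the pte under the pgtable lock, so installing the marker never overwrites anything valuable and needs no extra TLB flush.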
Link: https://lkml.kernel.org/r/20220405014847.14295-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ++++++++++ include/linux/mm_inline.h | 43 +++++++++++++++++++++++++++++++++++++++++ mm/memory.c | 49 ++++++++++++++++++++++++++++++++++++++++++++--- mm/rmap.c | 8 ++++++++ 4 files changed, 107 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d63ba0e0e068..61786259e52a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3428,4 +3428,14 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } #endif +typedef unsigned int __bitwise zap_flags_t; + +/* + * Whether to drop the pte markers, for example, the uffd-wp information for + * file-backed memory. This should only be specified when we will completely + * drop the page in the mm, either by truncation or unmapping of the vma. By + * default, the flag is not set. + */ +#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) + #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ac32125745ab..7b25b53c474a 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -6,6 +6,8 @@ #include #include #include +#include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -316,5 +318,46 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending) > 1; } +/* + * If this pte is wr-protected by uffd-wp in any form, arm the special pte to + * replace a none pte. NOTE! This should only be called when *pte is already + * cleared so we will never accidentally replace something valuable. Meanwhile + * none pte also means we are not demoting the pte so tlb flushed is not needed. + * E.g., when pte cleared the caller should have taken care of the tlb flush. + * + * Must be called with pgtable lock held so that no thread will see the none + * pte, and if they see it, they'll fault and serialize at the pgtable lock. + * + * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled. + */ +static inline void +pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, + pte_t *pte, pte_t pteval) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + bool arm_uffd_pte = false; + + /* The current status of the pte should be "cleared" before calling */ + WARN_ON_ONCE(!pte_none(*pte)); + + if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) + return; + + /* A uffd-wp wr-protected normal pte */ + if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) + arm_uffd_pte = true; + + /* + * A uffd-wp wr-protected swap pte. Note: this should even cover an + * existing pte marker with uffd-wp bit set. + */ + if (unlikely(pte_swp_uffd_wp_any(pteval))) + arm_uffd_pte = true; + + if (unlikely(arm_uffd_pte)) + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); +#endif +} #endif diff --git a/mm/memory.c b/mm/memory.c index c16f873373a2..ecb1d58dcaef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -74,6 +74,7 @@ #include #include #include +#include #include @@ -1306,6 +1307,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? 
*/ + zap_flags_t zap_flags; /* Extra flags for zapping */ }; /* Whether we should zap all COWed (private) pages too */ @@ -1334,6 +1336,29 @@ static inline bool should_zap_page(struct zap_details *details, struct page *pag return !PageAnon(page); } +static inline bool zap_drop_file_uffd_wp(struct zap_details *details) +{ + if (!details) + return false; + + return details->zap_flags & ZAP_FLAG_DROP_MARKER; +} + +/* + * This function makes sure that we'll replace the none pte with an uffd-wp + * swap special pte marker when necessary. Must be with the pgtable lock held. + */ +static inline void +zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte, + struct zap_details *details, pte_t pteval) +{ + if (zap_drop_file_uffd_wp(details)) + return; + + pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1371,6 +1396,8 @@ again: ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); + zap_install_uffd_wp_if_needed(vma, addr, pte, details, + ptent); if (unlikely(!page)) continue; @@ -1401,6 +1428,13 @@ again: page = pfn_swap_entry_to_page(entry); if (unlikely(!should_zap_page(details, page))) continue; + /* + * Both device private/exclusive mappings should only + * work with anonymous page so far, so we don't need to + * consider uffd-wp bit when zap. For more information, + * see zap_install_uffd_wp_if_needed(). + */ + WARN_ON_ONCE(!vma_is_anonymous(vma)); rss[mm_counter(page)]--; if (is_device_private_entry(entry)) page_remove_rmap(page, vma, false); @@ -1417,8 +1451,10 @@ again: if (!should_zap_page(details, page)) continue; rss[mm_counter(page)]--; - } else if (is_pte_marker_entry(entry)) { - /* By default, simply drop all pte markers when zap */ + } else if (pte_marker_entry_uffd_wp(entry)) { + /* Only drop the uffd-wp marker if explicitly requested */ + if (!zap_drop_file_uffd_wp(details)) + continue; } else if (is_hwpoison_entry(entry)) { if (!should_zap_cows(details)) continue; @@ -1427,6 +1463,7 @@ again: WARN_ON_ONCE(1); } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); } while (pte++, addr += PAGE_SIZE, addr != end); add_mm_rss_vec(mm, rss); @@ -1637,12 +1674,17 @@ void unmap_vmas(struct mmu_gather *tlb, unsigned long end_addr) { struct mmu_notifier_range range; + struct zap_details details = { + .zap_flags = ZAP_FLAG_DROP_MARKER, + /* Careful - we need to zap private pages too! 
*/ + .even_cows = true, + }; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) - unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); + unmap_single_vma(tlb, vma, start_addr, end_addr, &details); mmu_notifier_invalidate_range_end(&range); } @@ -3438,6 +3480,7 @@ void unmap_mapping_folio(struct folio *folio) details.even_cows = false; details.single_folio = folio; + details.zap_flags = ZAP_FLAG_DROP_MARKER; i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) diff --git a/mm/rmap.c b/mm/rmap.c index 72464378e1a6..94d6b24a1ac2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -73,6 +73,7 @@ #include #include #include +#include #include @@ -1585,6 +1586,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, pteval = ptep_clear_flush(vma, address, pvmw.pte); } + /* + * Now the pte is cleared. If this pte was uffd-wp armed, + * we may want to replace a none pte with a marker pte if + * it's file-backed, so we don't lose the tracking info. + */ + pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); + /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) folio_mark_dirty(folio); -- cgit v1.2.3 From fe2567eb552194e5b7a3c61bed5574658f859a11 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:53 -0700 Subject: mm/shmem: allow uffd wr-protect none pte for file-backed mem File-backed memory differs from anonymous memory in that even if the pte is missing, the data could still resides either in the file or in page/swap cache. So when wr-protect a pte, we need to consider none ptes too. We do that by installing the uffd-wp pte markers when necessary. So when there's a future write to the pte, the fault handler will go the special path to first fault-in the page as read-only, then report to userfaultfd server with the wr-protect message. On the other hand, when unprotecting a page, it's also possible that the pte got unmapped but replaced by the special uffd-wp marker. Then we'll need to be able to recover from a uffd-wp pte marker into a none pte, so that the next access to the page will fault in correctly as usual when accessed the next time. Special care needs to be taken throughout the change_protection_range() process. Since now we allow user to wr-protect a none pte, we need to be able to pre-populate the page table entries if we see (!anonymous && MM_CP_UFFD_WP) requests, otherwise change_protection_range() will always skip when the pgtable entry does not exist. For example, the pgtable can be missing for a whole chunk of 2M pmd, but the page cache can exist for the 2M range. When we want to wr-protect one 4K page within the 2M pmd range, we need to pre-populate the pgtable and install the pte marker showing that we want to get a message and block the thread when the page cache of that 4K page is written. Without pre-populating the pmd, change_protection() will simply skip that whole pmd. Note that this patch only covers the small pages (pte level) but not covering any of the transparent huge pages yet. That will be done later, and this patch will be a preparation for it too. Link: https://lkml.kernel.org/r/20220405014850.14352-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . 
Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/mprotect.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index e84694267b0f..763409e62ef6 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -193,8 +194,16 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, newpte = pte_swp_mksoft_dirty(newpte); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_pte_marker_entry(entry)) { - /* Skip it, the same as none pte */ + } else if (pte_marker_entry_uffd_wp(entry)) { + /* + * If this is uffd-wp pte marker and we'd like + * to unprotect it, drop it; the next page + * fault will trigger without uffd trapping. + */ + if (uffd_wp_resolve) { + pte_clear(vma->vm_mm, addr, pte); + pages++; + } continue; } else { newpte = oldpte; @@ -209,6 +218,20 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } + } else { + /* It must be an none page, or what else?.. */ + WARN_ON_ONCE(!pte_none(oldpte)); + if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { + /* + * For file-backed mem, we need to be able to + * wr-protect a none pte, because even if the + * pte is none, the page/swap cache could + * exist. Doing that by install a marker. + */ + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + pages++; + } } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -242,6 +265,39 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) return 0; } +/* Return true if we're uffd wr-protecting file-backed memory, or false */ +static inline bool +uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) +{ + return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma); +} + +/* + * If wr-protecting the range for file-backed, populate pgtable for the case + * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc() + * failed it means no memory, we don't have a better option but stop. + */ +#define change_pmd_prepare(vma, pmd, cp_flags) \ + do { \ + if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ + if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd))) \ + break; \ + } \ + } while (0) +/* + * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to + * have separate change_pmd_prepare() because pte_alloc() returns 0 on success, + * while {pmd|pud|p4d}_alloc() returns the valid pointer on success. + */ +#define change_prepare(vma, high, low, addr, cp_flags) \ + do { \ + if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ + low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ + if (WARN_ON_ONCE(p == NULL)) \ + break; \ + } \ + } while (0) + static inline unsigned long change_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -260,6 +316,7 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); + change_pmd_prepare(vma, pmd, cp_flags); /* * Automatic NUMA balancing walks the tables with mmap_lock * held for read. 
It's possible a parallel update to occur @@ -329,6 +386,7 @@ static inline unsigned long change_pud_range(struct mmu_gather *tlb, pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); + change_prepare(vma, pud, pmd, addr, cp_flags); if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, @@ -349,6 +407,7 @@ static inline unsigned long change_p4d_range(struct mmu_gather *tlb, p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); + change_prepare(vma, p4d, pud, addr, cp_flags); if (p4d_none_or_clear_bad(p4d)) continue; pages += change_pud_range(tlb, vma, p4d, addr, next, newprot, @@ -372,6 +431,7 @@ static unsigned long change_protection_range(struct mmu_gather *tlb, tlb_start_vma(tlb, vma); do { next = pgd_addr_end(addr, end); + change_prepare(vma, pgd, p4d, addr, cp_flags); if (pgd_none_or_clear_bad(pgd)) continue; pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot, -- cgit v1.2.3 From 019c2d8b959c9c18b04572f9c26c46be9f5df093 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:53 -0700 Subject: mm/shmem: allows file-back mem to be uffd wr-protected on thps We don't have "huge" version of pte markers, instead when necessary we split the thp. However split the thp is not enough, because file-backed thp is handled totally differently comparing to anonymous thps: rather than doing a real split, the thp pmd will simply got cleared in __split_huge_pmd_locked(). That is not enough if e.g. when there is a thp covers range [0, 2M) but we want to wr-protect small page resides in [4K, 8K) range, because after __split_huge_pmd() returns, there will be a none pmd, and change_pmd_range() will just skip it right after the split. Here we leverage the previously introduced change_pmd_prepare() macro so that we'll populate the pmd with a pgtable page after the pmd split (in which process the pmd will be cleared for cases like shmem). Then change_pte_range() will do all the rest for us by installing the uffd-wp pte marker at any none pte that we'd like to wr-protect. Link: https://lkml.kernel.org/r/20220405014852.14413-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/mprotect.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 763409e62ef6..cccad7974b3b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -338,8 +338,15 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb, } if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { - if (next - addr != HPAGE_PMD_SIZE) { + if ((next - addr != HPAGE_PMD_SIZE) || + uffd_wp_protect_file(vma, cp_flags)) { __split_huge_pmd(vma, pmd, addr, false, NULL); + /* + * For file-backed, the pmd could have been + * cleared; make sure pmd populated if + * necessary, then fall-through to pte level. + */ + change_pmd_prepare(vma, pmd, cp_flags); } else { /* * change_huge_pmd() does not defer TLB flushes, -- cgit v1.2.3 From c56d1b62cce83695823c13e52f73e92eb568c0c1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:53 -0700 Subject: mm/shmem: handle uffd-wp during fork() Normally we skip copy page when fork() for VM_SHARED shmem, but we can't skip it anymore if uffd-wp is enabled on dst vma. 
This should only happen when the src uffd has UFFD_FEATURE_EVENT_FORK enabled on uffd-wp shmem vma, so that VM_UFFD_WP will be propagated onto dst vma too, then we should copy the pgtables with uffd-wp bit and pte markers, because these information will be lost otherwise. Since the condition checks will become even more complicated for deciding "whether a vma needs to copy the pgtable during fork()", introduce a helper vma_needs_copy() for it, so everything will be clearer. Link: https://lkml.kernel.org/r/20220405014855.14468-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/memory.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ecb1d58dcaef..8827157cf392 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -867,6 +867,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (try_restore_exclusive_pte(src_pte, src_vma, addr)) return -EBUSY; return -ENOENT; + } else if (is_pte_marker_entry(entry)) { + /* + * We're copying the pgtable should only because dst_vma has + * uffd-wp enabled, do sanity check. + */ + WARN_ON_ONCE(!userfaultfd_wp(dst_vma)); + set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; } if (!userfaultfd_wp(dst_vma)) pte = pte_swp_clear_uffd_wp(pte); @@ -1221,6 +1229,38 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, return 0; } +/* + * Return true if the vma needs to copy the pgtable during this fork(). Return + * false when we can speed up fork() by allowing lazy page faults later until + * when the child accesses the memory range. + */ +bool +vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) +{ + /* + * Always copy pgtables when dst_vma has uffd-wp enabled even if it's + * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable + * contains uffd-wp protection information, that's something we can't + * retrieve from page cache, and skip copying will lose those info. + */ + if (userfaultfd_wp(dst_vma)) + return true; + + if (src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) + return true; + + if (src_vma->anon_vma) + return true; + + /* + * Don't copy ptes where a page fault will fill them correctly. Fork + * becomes much lighter when there are big shared or private readonly + * mappings. The tradeoff is that copy_page_range is more efficient + * than faulting. + */ + return false; +} + int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { @@ -1234,14 +1274,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) bool is_cow; int ret; - /* - * Don't copy ptes where a page fault will fill them correctly. - * Fork becomes much lighter when there are big shared or private - * readonly mappings. The tradeoff is that copy_page_range is more - * efficient than faulting. 
- */ - if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && - !src_vma->anon_vma) + if (!vma_needs_copy(dst_vma, src_vma)) return 0; if (is_vm_hugetlb_page(src_vma)) -- cgit v1.2.3 From 229f3fa778c50edf12b34bd91b2025645a11bb94 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:54 -0700 Subject: mm/hugetlb: introduce huge pte version of uffd-wp helpers They will be used in the follow up patches to either check/set/clear uffd-wp bit of a huge pte. So far it reuses all the small pte helpers. Archs can overwrite these versions when necessary (with __HAVE_ARCH_HUGE_PTE_UFFD_WP* macros) in the future. Link: https://lkml.kernel.org/r/20220405014858.14531-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- arch/s390/include/asm/hugetlb.h | 15 +++++++++++++++ include/asm-generic/hugetlb.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index f0bc4d31e852..32c3fd6e878e 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -120,6 +120,21 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +static inline pte_t huge_pte_mkuffd_wp(pte_t pte) +{ + return pte; +} + +static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) +{ + return pte; +} + +static inline int huge_pte_uffd_wp(pte_t pte) +{ + return 0; +} + static inline bool gigantic_page_runtime_supported(void) { return true; diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index f39cad20ffc6..896f341f614d 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -35,6 +35,21 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +static inline pte_t huge_pte_mkuffd_wp(pte_t pte) +{ + return pte_mkuffd_wp(pte); +} + +static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) +{ + return pte_clear_uffd_wp(pte); +} + +static inline int huge_pte_uffd_wp(pte_t pte) +{ + return pte_uffd_wp(pte); +} + #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) -- cgit v1.2.3 From 166f3ecc0daf0c164bd7e2f780dbcd1e213ac95f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:54 -0700 Subject: mm/hugetlb: hook page faults for uffd write protection Hook up hugetlbfs_fault() with the capability to handle userfaultfd-wp faults. We do this slightly earlier than hugetlb_cow() so that we can avoid taking some extra locks that we definitely don't need. Link: https://lkml.kernel.org/r/20220405014901.14590-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . 
Shutemov" Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/hugetlb.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fc31a8f49067..0a2b3b8e765b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5737,6 +5737,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) goto out_ptl; + /* Handle userfault-wp first, before trying to lock more pages */ + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && + (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = address, + .flags = flags, + }; + + spin_unlock(ptl); + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); + return handle_userfault(&vmf, VM_UFFD_WP); + } + /* * hugetlb_wp() requires page locks of pte_page(entry) and * pagecache_page, so here we need take the former one -- cgit v1.2.3 From 6041c69179034278ac6d57f90a55b09e588f4b90 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:54 -0700 Subject: mm/hugetlb: take care of UFFDIO_COPY_MODE_WP Pass the wp_copy variable into hugetlb_mcopy_atomic_pte() thoughout the stack. Apply the UFFD_WP bit if UFFDIO_COPY_MODE_WP is with UFFDIO_COPY. Hugetlb pages are only managed by hugetlbfs, so we're safe even without setting dirty bit in the huge pte if the page is installed as read-only. However we'd better still keep the dirty bit set for a read-only UFFDIO_COPY pte (when UFFDIO_COPY_MODE_WP bit is set), not only to match what we do with shmem, but also because the page does contain dirty data that the kernel just copied from the userspace. Link: https://lkml.kernel.org/r/20220405014904.14643-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . 
Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++-- mm/hugetlb.c | 29 +++++++++++++++++++++++------ mm/userfaultfd.c | 14 +++++++++----- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ac2ece9e9c79..159b253e523d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -160,7 +160,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, unsigned long dst_addr, unsigned long src_addr, enum mcopy_atomic_mode mode, - struct page **pagep); + struct page **pagep, + bool wp_copy); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, @@ -356,7 +357,8 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, enum mcopy_atomic_mode mode, - struct page **pagep) + struct page **pagep, + bool wp_copy) { BUG(); return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0a2b3b8e765b..2550b434cabd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5821,7 +5821,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, enum mcopy_atomic_mode mode, - struct page **pagep) + struct page **pagep, + bool wp_copy) { bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct hstate *h = hstate_vma(dst_vma); @@ -5951,7 +5952,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_unlock; ret = -EEXIST; - if (!huge_pte_none(huge_ptep_get(dst_pte))) + /* + * We allow to overwrite a pte marker: consider when both MISSING|WP + * registered, we firstly wr-protect a none pte which has no page cache + * page backing it, then access the page. + */ + if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; if (vm_shared) { @@ -5961,17 +5967,28 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); } - /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */ - if (is_continue && !vm_shared) + /* + * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY + * with wp flag set, don't set pte write bit. + */ + if (wp_copy || (is_continue && !vm_shared)) writable = 0; else writable = dst_vma->vm_flags & VM_WRITE; _dst_pte = make_huge_pte(dst_vma, page, writable); - if (writable) - _dst_pte = huge_pte_mkdirty(_dst_pte); + /* + * Always mark UFFDIO_COPY page dirty; note that this may not be + * extremely important for hugetlbfs for now since swapping is not + * supported, but we should still be clear in that this page cannot be + * thrown away at will, even if write bit not set. 
+ */ + _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); + if (wp_copy) + _dst_pte = huge_pte_mkuffd_wp(_dst_pte); + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 2b6ec3352147..be2a61f4b55e 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -305,7 +305,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - enum mcopy_atomic_mode mode) + enum mcopy_atomic_mode mode, + bool wp_copy) { int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; @@ -393,7 +394,7 @@ retry: } if (mode != MCOPY_ATOMIC_CONTINUE && - !huge_pte_none(huge_ptep_get(dst_pte))) { + !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { err = -EEXIST; mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); @@ -401,7 +402,8 @@ retry: } err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, - dst_addr, src_addr, mode, &page); + dst_addr, src_addr, mode, &page, + wp_copy); mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); @@ -456,7 +458,8 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - enum mcopy_atomic_mode mode); + enum mcopy_atomic_mode mode, + bool wp_copy); #endif /* CONFIG_HUGETLB_PAGE */ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, @@ -576,7 +579,8 @@ retry: */ if (is_vm_hugetlb_page(dst_vma)) return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, - src_start, len, mcopy_mode); + src_start, len, mcopy_mode, + wp_copy); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; -- cgit v1.2.3 From 5a90d5a103c2badfcf12d48e2fec350969e3f486 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:54 -0700 Subject: mm/hugetlb: handle UFFDIO_WRITEPROTECT This starts from passing cp_flags into hugetlb_change_protection() so hugetlb will be able to handle MM_CP_UFFD_WP[_RESOLVE] requests. huge_pte_clear_uffd_wp() is introduced to handle the case where the UFFDIO_WRITEPROTECT is requested upon migrating huge page entries. Link: https://lkml.kernel.org/r/20220405014906.14708-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . 
Shutemov" Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++-- mm/hugetlb.c | 13 ++++++++++++- mm/mprotect.c | 3 ++- mm/userfaultfd.c | 8 ++++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 159b253e523d..f1143f1fb444 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -211,7 +211,8 @@ struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, - unsigned long address, unsigned long end, pgprot_t newprot); + unsigned long address, unsigned long end, pgprot_t newprot, + unsigned long cp_flags); bool is_hugetlb_entry_migration(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); @@ -397,7 +398,8 @@ static inline void move_hugetlb_state(struct page *oldpage, static inline unsigned long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, - unsigned long end, pgprot_t newprot) + unsigned long end, pgprot_t newprot, + unsigned long cp_flags) { return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2550b434cabd..4b8f413e011d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6233,7 +6233,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, } unsigned long hugetlb_change_protection(struct vm_area_struct *vma, - unsigned long address, unsigned long end, pgprot_t newprot) + unsigned long address, unsigned long end, + pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long start = address; @@ -6243,6 +6244,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long pages = 0; bool shared_pmd = false; struct mmu_notifier_range range; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; /* * In the case of shared PMDs, the area to flush could be beyond @@ -6289,6 +6292,10 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, entry = make_readable_migration_entry( swp_offset(entry)); newpte = swp_entry_to_pte(entry); + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); set_huge_swap_pte_at(mm, address, ptep, newpte, huge_page_size(h)); pages++; @@ -6303,6 +6310,10 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, old_pte = huge_ptep_modify_prot_start(vma, address, ptep); pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); + if (uffd_wp) + pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte)); + else if (uffd_wp_resolve) + pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; } diff --git a/mm/mprotect.c b/mm/mprotect.c index cccad7974b3b..ba5592655ee3 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -460,7 +460,8 @@ unsigned long change_protection(struct mmu_gather *tlb, BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); if (is_vm_hugetlb_page(vma)) - pages = hugetlb_change_protection(vma, start, end, newprot); + pages = hugetlb_change_protection(vma, start, end, newprot, + cp_flags); else pages = change_protection_range(tlb, vma, start, end, newprot, cp_flags); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index be2a61f4b55e..01edc18902c5 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -705,6 +705,7 @@ int 
mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, atomic_t *mmap_changing) { struct vm_area_struct *dst_vma; + unsigned long page_mask; struct mmu_gather tlb; pgprot_t newprot; int err; @@ -742,6 +743,13 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, if (!vma_is_anonymous(dst_vma)) goto out_unlock; + if (is_vm_hugetlb_page(dst_vma)) { + err = -EINVAL; + page_mask = vma_kernel_pagesize(dst_vma) - 1; + if ((start & page_mask) || (len & page_mask)) + goto out_unlock; + } + if (enable_wp) newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); else -- cgit v1.2.3 From c64e912c865a1a0df1c312bca946985eb095afa5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:54 -0700 Subject: mm/hugetlb: handle pte markers in page faults Allow hugetlb code to handle pte markers just like none ptes. It's mostly there, we just need to make sure we don't assume hugetlb_no_page() only handles none pte, so when detecting pte change we should use pte_same() rather than pte_none(). We need to pass in the old_pte to do the comparison. Check the original pte to see whether it's a pte marker, if it is, we should recover uffd-wp bit on the new pte to be installed, so that the next write will be trapped by uffd. Link: https://lkml.kernel.org/r/20220405014909.14761-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/hugetlb.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4b8f413e011d..25f4ac5ca6fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5438,7 +5438,8 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, unsigned int flags) + unsigned long address, pte_t *ptep, + pte_t old_pte, unsigned int flags) { struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; @@ -5565,7 +5566,8 @@ retry: ptl = huge_pte_lock(h, mm, ptep); ret = 0; - if (!huge_pte_none(huge_ptep_get(ptep))) + /* If pte changed from under us, retry */ + if (!pte_same(huge_ptep_get(ptep), old_pte)) goto backout; if (anon_rmap) { @@ -5575,6 +5577,12 @@ retry: page_dup_file_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); + /* + * If this pte was previously wr-protected, keep it wr-protected even + * if populated. 
+ */ + if (unlikely(pte_marker_uffd_wp(old_pte))) + new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte)); set_huge_pte_at(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); @@ -5692,8 +5700,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); - if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); + /* PTE markers should be handled the same way as none pte */ + if (huge_pte_none_mostly(entry)) { + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, + entry, flags); goto out_mutex; } -- cgit v1.2.3 From 60dfaad65aa97fb6755b9798a6b3c9e79bcd5930 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:55 -0700 Subject: mm/hugetlb: allow uffd wr-protect none ptes Teach hugetlbfs code to wr-protect none ptes just in case the page cache existed for that pte. Meanwhile we also need to be able to recognize a uffd-wp marker pte and remove it for uffd_wp_resolve. Since at it, introduce a variable "psize" to replace all references to the huge page size fetcher. Link: https://lkml.kernel.org/r/20220405014912.14815-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/hugetlb.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 25f4ac5ca6fe..ec9774ed84c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6251,7 +6251,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); - unsigned long pages = 0; + unsigned long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; @@ -6271,13 +6271,19 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); i_mmap_lock_write(vma->vm_file->f_mapping); - for (; address < end; address += huge_page_size(h)) { + for (; address < end; address += psize) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address, huge_page_size(h)); + ptep = huge_pte_offset(mm, address, psize); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, &address, ptep)) { + /* + * When uffd-wp is enabled on the vma, unshare + * shouldn't happen at all. Warn about it if it + * happened due to some reason. + */ + WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); shared_pmd = true; @@ -6307,12 +6313,20 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, else if (uffd_wp_resolve) newpte = pte_swp_clear_uffd_wp(newpte); set_huge_swap_pte_at(mm, address, ptep, - newpte, huge_page_size(h)); + newpte, psize); pages++; } spin_unlock(ptl); continue; } + if (unlikely(pte_marker_uffd_wp(pte))) { + /* + * This is changing a non-present pte into a none pte, + * no need for huge_ptep_modify_prot_start/commit(). 
+ */ + if (uffd_wp_resolve) + huge_pte_clear(mm, address, ptep, psize); + } if (!huge_pte_none(pte)) { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); @@ -6326,6 +6340,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; + } else { + /* None pte */ + if (unlikely(uffd_wp)) + /* Safe to modify directly (none->non-present). */ + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP)); } spin_unlock(ptl); } -- cgit v1.2.3 From 05e90bd05eea33fc77d6b11e121e2da01fee379f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:55 -0700 Subject: mm/hugetlb: only drop uffd-wp special pte if required As with shmem uffd-wp special ptes, only drop the uffd-wp special swap pte if unmapping an entire vma or synchronized such that faults can not race with the unmap operation. This requires passing zap_flags all the way to the lowest level hugetlb unmap routine: __unmap_hugepage_range. In general, unmap calls originated in hugetlbfs code will pass the ZAP_FLAG_DROP_MARKER flag as synchronization is in place to prevent faults. The exception is hole punch which will first unmap without any synchronization. Later when hole punch actually removes the page from the file, it will check to see if there was a subsequent fault and if so take the hugetlb fault mutex while unmapping again. This second unmap will pass in ZAP_FLAG_DROP_MARKER. The justification of "whether to apply ZAP_FLAG_DROP_MARKER flag when unmap a hugetlb range" is (IMHO): we should never reach a state when a page fault could errornously fault in a page-cache page that was wr-protected to be writable, even in an extremely short period. That could happen if e.g. we pass ZAP_FLAG_DROP_MARKER when hugetlbfs_punch_hole() calls hugetlb_vmdelete_list(), because if a page faults after that call and before remove_inode_hugepages() is executed, the page cache can be mapped writable again in the small racy window, that can cause unexpected data overwritten. [peterx@redhat.com: fix sparse warning] Link: https://lkml.kernel.org/r/Ylcdw8I1L5iAoWhb@xz-m1.local [akpm@linux-foundation.org: move zap_flags_t from mm.h to mm_types.h to fix build issues] Link: https://lkml.kernel.org/r/20220405014915.14873-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . 
Shutemov" Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 15 +++++++++------ include/linux/hugetlb.h | 8 +++++--- include/linux/mm.h | 2 -- include/linux/mm_types.h | 2 ++ mm/hugetlb.c | 33 +++++++++++++++++++++++++-------- mm/memory.c | 5 ++++- 6 files changed, 45 insertions(+), 20 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 591599829e2a..5945caccf003 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -405,7 +405,8 @@ static void remove_huge_page(struct page *page) } static void -hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) +hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, + zap_flags_t zap_flags) { struct vm_area_struct *vma; @@ -439,7 +440,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) } unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, - NULL); + NULL, zap_flags); } } @@ -517,7 +518,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vmdelete_list(&mapping->i_mmap, index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h)); + (index + 1) * pages_per_huge_page(h), + ZAP_FLAG_DROP_MARKER); i_mmap_unlock_write(mapping); } @@ -583,7 +585,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_mmap_lock_write(mapping); i_size_write(inode, offset); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) - hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); + hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, + ZAP_FLAG_DROP_MARKER); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); } @@ -616,8 +619,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, - hole_start >> PAGE_SHIFT, - hole_end >> PAGE_SHIFT); + hole_start >> PAGE_SHIFT, + hole_end >> PAGE_SHIFT, 0); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); inode_unlock(inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f1143f1fb444..19cec415f546 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -143,11 +143,12 @@ long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, unsigned long *, unsigned long *, long, unsigned int, int *); void unmap_hugepage_range(struct vm_area_struct *, - unsigned long, unsigned long, struct page *); + unsigned long, unsigned long, struct page *, + zap_flags_t); void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct page *ref_page); + struct page *ref_page, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo(void); @@ -406,7 +407,8 @@ static inline unsigned long hugetlb_change_protection( static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) + unsigned long end, struct page *ref_page, + zap_flags_t zap_flags) { BUG(); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 61786259e52a..de32c0383387 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3428,8 +3428,6 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } 
#endif -typedef unsigned int __bitwise zap_flags_t; - /* * Whether to drop the pte markers, for example, the uffd-wp information for * file-backed memory. This should only be specified when we will completely diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index dd382270ae40..b34ff2cdbc4f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -863,4 +863,6 @@ enum fault_flag { FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, }; +typedef unsigned int __bitwise zap_flags_t; + #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ec9774ed84c0..99281aecbd28 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4973,7 +4973,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct page *ref_page) + struct page *ref_page, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -5029,7 +5029,18 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { - huge_pte_clear(mm, address, ptep, sz); + /* + * If the pte was wr-protected by uffd-wp in any of the + * swap forms, meanwhile the caller does not want to + * drop the uffd-wp bit in this zap, then replace the + * pte with a marker. + */ + if (pte_swp_uffd_wp_any(pte) && + !(zap_flags & ZAP_FLAG_DROP_MARKER)) + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP)); + else + huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } @@ -5057,7 +5068,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); - + /* Leave a uffd-wp pte marker if needed */ + if (huge_pte_uffd_wp(pte) && + !(zap_flags & ZAP_FLAG_DROP_MARKER)) + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP)); hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, vma, true); @@ -5091,9 +5106,10 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) + unsigned long end, struct page *ref_page, + zap_flags_t zap_flags) { - __unmap_hugepage_range(tlb, vma, start, end, ref_page); + __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); /* * Clear this flag so that x86's huge_pmd_share page_table_shareable @@ -5109,12 +5125,13 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) + unsigned long end, struct page *ref_page, + zap_flags_t zap_flags) { struct mmu_gather tlb; tlb_gather_mmu(&tlb, vma->vm_mm); - __unmap_hugepage_range(&tlb, vma, start, end, ref_page); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); tlb_finish_mmu(&tlb); } @@ -5169,7 +5186,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, - address + huge_page_size(h), page); + address + huge_page_size(h), page, 0); } i_mmap_unlock_write(mapping); } diff --git a/mm/memory.c b/mm/memory.c index 8827157cf392..82adda885605 100644 --- a/mm/memory.c 
+++ b/mm/memory.c @@ -1675,8 +1675,11 @@ static void unmap_single_vma(struct mmu_gather *tlb, * safe to do nothing in this case. */ if (vma->vm_file) { + zap_flags_t zap_flags = details ? + details->zap_flags : 0; i_mmap_lock_write(vma->vm_file->f_mapping); - __unmap_hugepage_range_final(tlb, vma, start, end, NULL); + __unmap_hugepage_range_final(tlb, vma, start, end, + NULL, zap_flags); i_mmap_unlock_write(vma->vm_file->f_mapping); } } else -- cgit v1.2.3 From bc70fbf269fdff410b0b6d75c3770b9f59117b90 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:55 -0700 Subject: mm/hugetlb: handle uffd-wp during fork() Firstly, we'll need to pass in dst_vma into copy_hugetlb_page_range() because for uffd-wp it's the dst vma that matters on deciding how we should treat uffd-wp protected ptes. We should recognize pte markers during fork and do the pte copy if needed. [lkp@intel.com: vma_needs_copy can be static] Link: https://lkml.kernel.org/r/Ylb0CGeFJlc4EzLk@7ec4ff11d4ae Link: https://lkml.kernel.org/r/20220405014918.14932-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 +++++-- mm/hugetlb.c | 42 ++++++++++++++++++++++++++++-------------- mm/memory.c | 4 ++-- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 19cec415f546..04f0186b089b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -137,7 +137,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len); -int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); +int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, + struct vm_area_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, unsigned long *, long, unsigned int, @@ -269,7 +270,9 @@ static inline struct page *follow_huge_addr(struct mm_struct *mm, } static inline int copy_hugetlb_page_range(struct mm_struct *dst, - struct mm_struct *src, struct vm_area_struct *vma) + struct mm_struct *src, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { BUG(); return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 99281aecbd28..01f0e2e5ab48 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4719,23 +4719,24 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { pte_t *src_pte, *dst_pte, entry, dst_entry; struct page *ptepage; unsigned long addr; - bool cow = is_cow_mapping(vma->vm_flags); - struct hstate *h = hstate_vma(vma); + bool cow = is_cow_mapping(src_vma->vm_flags); + struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = src_vma->vm_file->f_mapping; struct mmu_notifier_range range; int ret = 0; if (cow) { - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, - vma->vm_start, - 
vma->vm_end); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src, + src_vma->vm_start, + src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); mmap_assert_write_locked(src); raw_write_seqcount_begin(&src->write_protect_seq); @@ -4749,12 +4750,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, i_mmap_lock_read(mapping); } - for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { + for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, vma, addr, sz); + dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; @@ -4789,6 +4790,7 @@ again: } else if (unlikely(is_hugetlb_entry_migration(entry) || is_hugetlb_entry_hwpoisoned(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); + bool uffd_wp = huge_pte_uffd_wp(entry); if (!is_readable_migration_entry(swp_entry) && cow) { /* @@ -4798,10 +4800,21 @@ again: swp_entry = make_readable_migration_entry( swp_offset(swp_entry)); entry = swp_entry_to_pte(swp_entry); + if (userfaultfd_wp(src_vma) && uffd_wp) + entry = huge_pte_mkuffd_wp(entry); set_huge_swap_pte_at(src, addr, src_pte, entry, sz); } + if (!userfaultfd_wp(dst_vma) && uffd_wp) + entry = huge_pte_clear_uffd_wp(entry); set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); + } else if (unlikely(is_pte_marker(entry))) { + /* + * We copy the pte marker only if the dst vma has + * uffd-wp enabled. + */ + if (userfaultfd_wp(dst_vma)) + set_huge_pte_at(dst, addr, dst_pte, entry); } else { entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); @@ -4819,20 +4832,21 @@ again: */ if (!PageAnon(ptepage)) { page_dup_file_rmap(ptepage, true); - } else if (page_try_dup_anon_rmap(ptepage, true, vma)) { + } else if (page_try_dup_anon_rmap(ptepage, true, + src_vma)) { pte_t src_pte_old = entry; struct page *new; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new = alloc_huge_page(vma, addr, 1); + new = alloc_huge_page(dst_vma, addr, 1); if (IS_ERR(new)) { put_page(ptepage); ret = PTR_ERR(new); break; } - copy_user_huge_page(new, ptepage, addr, vma, + copy_user_huge_page(new, ptepage, addr, dst_vma, npages); put_page(ptepage); @@ -4842,13 +4856,13 @@ again: spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { - restore_reserve_on_error(h, vma, addr, + restore_reserve_on_error(h, dst_vma, addr, new); put_page(new); /* dst_entry won't change as in child */ goto again; } - hugetlb_install_page(vma, dst_pte, addr, new); + hugetlb_install_page(dst_vma, dst_pte, addr, new); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; diff --git a/mm/memory.c b/mm/memory.c index 82adda885605..f4161fb07ffa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1234,7 +1234,7 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, * false when we can speed up fork() by allowing lazy page faults later until * when the child accesses the memory range. 
*/ -bool +static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { /* @@ -1278,7 +1278,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return 0; if (is_vm_hugetlb_page(src_vma)) - return copy_hugetlb_page_range(dst_mm, src_mm, src_vma); + return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { /* -- cgit v1.2.3 From deb4c93a9871082a7df6a99ca8fa48024665a71a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:55 -0700 Subject: mm/khugepaged: don't recycle vma pgtable if uffd-wp registered When we're trying to collapse a 2M huge shmem page, don't retract pgtable pmd page if it's registered with uffd-wp, because that pgtable could have pte markers installed. Recycling of that pgtable means we'll lose the pte markers. That could cause data loss for an uffd-wp enabled application on shmem. Instead of disabling khugepaged on these files, simply skip retracting these special VMAs, then the page cache can still be merged into a huge thp, and other mm/vma can still map the range of file with a huge thp when proper. Note that checking VM_UFFD_WP needs to be done with mmap_sem held for write, that avoids race like: khugepaged user thread ========== =========== check VM_UFFD_WP, not set UFFDIO_REGISTER with uffd-wp on shmem wr-protect some pages (install markers) take mmap_sem write lock erase pmd and free pmd page --> pte markers are dropped unnoticed! Link: https://lkml.kernel.org/r/20220405014921.14994-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/khugepaged.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a2560f970881..2243ed095f02 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1456,6 +1456,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) return; + /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ + if (userfaultfd_wp(vma)) + return; + hpage = find_lock_page(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (!hpage) @@ -1591,7 +1595,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * reverse order. Trylock is a way to avoid deadlock. */ if (mmap_write_trylock(mm)) { - if (!khugepaged_test_exit(mm)) + /* + * When a vma is registered with uffd-wp, we can't + * recycle the pmd pgtable because there can be pte + * markers installed. Skip it only, so the rest mm/vma + * can still have the same file mapped hugely, however + * it'll always mapped in small page size for uffd-wp + * registered ranges. + */ + if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma)) collapse_and_free_pmd(mm, vma, addr, pmd); mmap_write_unlock(mm); } else { -- cgit v1.2.3 From 8e165e733bfa06edbcdbe491ef13b2bf1a3fa883 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:55 -0700 Subject: mm/pagemap: recognize uffd-wp bit for shmem/hugetlbfs This requires the pagemap code to be able to recognize the newly introduced swap special pte for uffd-wp, meanwhile the general case for hugetlb that we recently start to support. It should make pagemap uffd-wp support complete. 
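For completeness, the new bit can be observed from userspace with a plain pread() of /proc/self/pagemap. The sketch below is illustrative only (it is not part of this series); it assumes PM_UFFD_WP is exported as bit 57 of a pagemap entry, as described in Documentation/admin-guide/mm/pagemap.rst, and the vaddr_is_uffd_wp() helper name is made up for the example. A page that was write-protected with UFFDIO_WRITEPROTECT should report 1 here:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define PM_UFFD_WP_BIT 57  /* assumed bit layout, see pagemap.rst */

    static int vaddr_is_uffd_wp(void *addr)
    {
        uint64_t entry;
        long psize = sysconf(_SC_PAGESIZE);
        off_t off = (uintptr_t)addr / psize * sizeof(entry);
        int fd = open("/proc/self/pagemap", O_RDONLY);

        if (fd < 0)
            return -1;
        /* one 64-bit entry per virtual page, indexed by vaddr / pagesize */
        if (pread(fd, &entry, sizeof(entry), off) != (ssize_t)sizeof(entry)) {
            close(fd);
            return -1;
        }
        close(fd);
        return !!(entry & (1ULL << PM_UFFD_WP_BIT));
    }

    int main(void)
    {
        static char buf[64] = "x";  /* any mapped address will do */

        printf("uffd-wp: %d\n", vaddr_is_uffd_wp(buf));
        return 0;
    }
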
Link: https://lkml.kernel.org/r/20220405014923.15047-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a843d13e2e1a..2d04e3470d4c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1421,6 +1421,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); + if (pte_marker_entry_uffd_wp(entry)) + flags |= PM_UFFD_WP; } if (page && !PageAnon(page)) @@ -1556,10 +1558,15 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, if (page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; + if (huge_pte_uffd_wp(pte)) + flags |= PM_UFFD_WP; + flags |= PM_PRESENT; if (pm->show_pfn) frame = pte_pfn(pte) + ((addr & ~hmask) >> PAGE_SHIFT); + } else if (pte_swp_uffd_wp_any(pte)) { + flags |= PM_UFFD_WP; } for (; addr != end; addr += PAGE_SIZE) { -- cgit v1.2.3 From b1f9e876862d8f7176299ec4fb2108bc1045cbc8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:56 -0700 Subject: mm/uffd: enable write protection for shmem & hugetlbfs We've had all the necessary changes ready for both shmem and hugetlbfs. Turn on all the shmem/hugetlbfs switches for userfaultfd-wp. We can expand UFFD_API_RANGE_IOCTLS_BASIC with _UFFDIO_WRITEPROTECT too because all existing types now support write protection mode. Since vma_can_userfault() will be used elsewhere, move into userfaultfd_k.h. Link: https://lkml.kernel.org/r/20220405014926.15101-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . 
Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 21 +++------------------ include/linux/userfaultfd_k.h | 20 ++++++++++++++++++++ include/uapi/linux/userfaultfd.h | 10 ++++++++-- mm/userfaultfd.c | 9 +++------ 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 78b68e0f9774..e943370107d0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1258,24 +1258,6 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } -static inline bool vma_can_userfault(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - /* FIXME: add WP support to hugetlbfs and shmem */ - if (vm_flags & VM_UFFD_WP) { - if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) - return false; - } - - if (vm_flags & VM_UFFD_MINOR) { - if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) - return false; - } - - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); -} - static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -1956,6 +1938,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #endif #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; +#endif +#ifndef CONFIG_PTE_MARKER_UFFD_WP + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e7afcdfd4b46..732b522bacb7 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -18,6 +18,7 @@ #include #include #include +#include /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) @@ -140,6 +141,25 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } +static inline bool vma_can_userfault(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + if (vm_flags & VM_UFFD_MINOR) + return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); + +#ifndef CONFIG_PTE_MARKER_UFFD_WP + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then shmem & hugetlbfs are not supported but only + * anonymous. 
+ */ + if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + return false; +#endif + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); +} + extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index ef739054cb1c..7d32b1e797fb 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -33,7 +33,8 @@ UFFD_FEATURE_THREAD_ID | \ UFFD_FEATURE_MINOR_HUGETLBFS | \ UFFD_FEATURE_MINOR_SHMEM | \ - UFFD_FEATURE_EXACT_ADDRESS) + UFFD_FEATURE_EXACT_ADDRESS | \ + UFFD_FEATURE_WP_HUGETLBFS_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -47,7 +48,8 @@ #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ - (__u64)1 << _UFFDIO_CONTINUE) + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT) /* * Valid ioctl command number range with this API is from 0x00 to @@ -194,6 +196,9 @@ struct uffdio_api { * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page * faults would be provided and the offset within the page would not be * masked. + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -207,6 +212,7 @@ struct uffdio_api { #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) #define UFFD_FEATURE_MINOR_SHMEM (1<<10) #define UFFD_FEATURE_EXACT_ADDRESS (1<<11) +#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) __u64 features; __u64 ioctls; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 01edc18902c5..4f4892a5f767 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -732,15 +732,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, err = -ENOENT; dst_vma = find_dst_vma(dst_mm, start, len); - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. - */ - if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + + if (!dst_vma) goto out_unlock; if (!userfaultfd_wp(dst_vma)) goto out_unlock; - if (!vma_is_anonymous(dst_vma)) + if (!vma_can_userfault(dst_vma, dst_vma->vm_flags)) goto out_unlock; if (is_vm_hugetlb_page(dst_vma)) { -- cgit v1.2.3 From 81e0f15f2ef6dad7ccb9c03d8e61ef7ded836b38 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:56 -0700 Subject: mm: enable PTE markers by default Enable PTE markers by default. On x86_64 it means it'll auto-enable PTE_MARKER_UFFD_WP as well. [peterx@redhat.com: hide PTE_MARKER option] Link: https://lkml.kernel.org/r/20220419202531.27415-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20220405014929.15158-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/Kconfig | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 71c653260a44..fd6c9fe4b6ed 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -910,14 +910,16 @@ config ANON_VMA_NAME difference in their name. config PTE_MARKER - bool "Marker PTEs support" + bool help Allows to create marker PTEs for file-backed memory. 
config PTE_MARKER_UFFD_WP - bool "Marker PTEs support for userfaultfd write protection" - depends on PTE_MARKER && HAVE_ARCH_USERFAULTFD_WP + bool "Userfaultfd write protection support for shmem/hugetlbfs" + default y + depends on HAVE_ARCH_USERFAULTFD_WP + select PTE_MARKER help Allows to create marker PTEs for userfaultfd write protection -- cgit v1.2.3 From c0eeeb02d9df878c71a457008900b650d94bd0d9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:56 -0700 Subject: selftests/uffd: enable uffd-wp for shmem/hugetlbfs After we added support for shmem and hugetlbfs, we can turn uffd-wp test on always now. Link: https://lkml.kernel.org/r/20220405014932.15212-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/userfaultfd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 92a4516f8f0d..bbc4a6d8cf7b 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -82,7 +82,7 @@ static int test_type; static volatile bool test_uffdio_copy_eexist = true; static volatile bool test_uffdio_zeropage_eexist = true; /* Whether to test uffd write-protection */ -static bool test_uffdio_wp = false; +static bool test_uffdio_wp = true; /* Whether to test uffd minor faults */ static bool test_uffdio_minor = false; @@ -1594,8 +1594,6 @@ static void set_test_type(const char *type) if (!strcmp(type, "anon")) { test_type = TEST_ANON; uffd_test_ops = &anon_uffd_test_ops; - /* Only enable write-protect test for anonymous test */ - test_uffdio_wp = true; } else if (!strcmp(type, "hugetlb")) { test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops; -- cgit v1.2.3 From 1bf0831383c6b372ff870d061ee62156635035c2 Mon Sep 17 00:00:00 2001 From: Guo Zhengkui Date: Thu, 12 May 2022 20:22:56 -0700 Subject: userfaultfd/selftests: use swap() instead of open coding it Address the following coccicheck warning: tools/testing/selftests/vm/userfaultfd.c:1536:21-22: WARNING opportunity for swap(). tools/testing/selftests/vm/userfaultfd.c:1540:33-34: WARNING opportunity for swap(). by using swap() for the swapping of variable values and drop `tmp_area` that is not needed any more. `swap()` macro in userfaultfd.c is introduced in commit 681696862bc18 ("selftests: vm: remove dependecy from internal kernel macros") It has been tested with gcc (Debian 8.3.0-6) 8.3.0. 
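For illustration, the open-coded three-assignment pattern and its macro-based replacement look roughly like the self-contained sketch below. The selftest carries its own swap() definition; the one shown here is only an approximation of it, and the area_src/area_dst names mirror the variables used in the bounce loop:

    #include <stdio.h>

    /* illustrative definition; typeof() is a GNU C extension */
    #define swap(a, b) \
        do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

    int main(void)
    {
        const char *area_src = "src", *area_dst = "dst";

        /* replaces the explicit tmp_area three-assignment dance */
        swap(area_src, area_dst);
        printf("%s %s\n", area_src, area_dst);  /* prints "dst src" */
        return 0;
    }
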
Link: https://lkml.kernel.org/r/20220407123141.4998-1-guozhengkui@vivo.com Signed-off-by: Guo Zhengkui Reviewed-by: Muchun Song Reviewed-by: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/userfaultfd.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index bbc4a6d8cf7b..0bdfc1955229 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -1422,7 +1422,6 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) static int userfaultfd_stress(void) { void *area; - char *tmp_area; unsigned long nr; struct uffdio_register uffdio_register; struct uffd_stats uffd_stats[nr_cpus]; @@ -1533,13 +1532,9 @@ static int userfaultfd_stress(void) count_verify[nr], nr); /* prepare next bounce */ - tmp_area = area_src; - area_src = area_dst; - area_dst = tmp_area; + swap(area_src, area_dst); - tmp_area = area_src_alias; - area_src_alias = area_dst_alias; - area_dst_alias = tmp_area; + swap(area_src_alias, area_dst_alias); uffd_stats_report(uffd_stats, nr_cpus); } -- cgit v1.2.3 From 430529b5c631749329c61113703c7a9b6cf72973 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:56 -0700 Subject: mm/uffd: move USERFAULTFD configs into mm/ We used to have USERFAULTFD configs stored in init/. It makes sense as a start because that's the default place for storing syscall related configs. However userfaultfd evolved a bit in the past few years and some more config options were added. They're no longer related to syscalls and start to be not suitable to be kept in the init/ directory anymore, because they're pure mm concepts. But it's not ideal either to keep the userfaultfd configs separate from each other. Hence this patch moves the userfaultfd configs under init/ to be under mm/ so that we'll start to group all userfaultfd configs together. We do have quite a few examples of syscall related configs that are not put under init/Kconfig: FTRACE_SYSCALLS, SWAP, FILE_LOCKING, MEMFD_CREATE.. They all reside in the dir where they're more suitable for the concept. So it seems there's no restriction to keep the role of having syscall related CONFIG_* under init/ only. Link: https://lkml.kernel.org/r/20220420144823.35277-1-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Reviewed-by: Axel Rasmussen Signed-off-by: Andrew Morton --- init/Kconfig | 17 ----------------- mm/Kconfig | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index ddcbefe535e9..b062cd745de6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1667,16 +1667,6 @@ config ADVISE_SYSCALLS applications use these syscalls, you can disable this option to save space. -config HAVE_ARCH_USERFAULTFD_WP - bool - help - Arch has userfaultfd write protection support - -config HAVE_ARCH_USERFAULTFD_MINOR - bool - help - Arch has userfaultfd minor fault support - config MEMBARRIER bool "Enable membarrier() system call" if EXPERT default y @@ -1741,13 +1731,6 @@ config KALLSYMS_BASE_RELATIVE # syscall, maps, verifier -config USERFAULTFD - bool "Enable userfaultfd() system call" - depends on MMU - help - Enable the userfaultfd() system call that allows to intercept and - handle page faults in userland. 
- config ARCH_HAS_MEMBARRIER_CALLBACKS bool diff --git a/mm/Kconfig b/mm/Kconfig index fd6c9fe4b6ed..c2141dd639e3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -909,6 +909,23 @@ config ANON_VMA_NAME area from being merged with adjacent virtual memory areas due to the difference in their name. +config USERFAULTFD + bool "Enable userfaultfd() system call" + depends on MMU + help + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + +config HAVE_ARCH_USERFAULTFD_WP + bool + help + Arch has userfaultfd write protection support + +config HAVE_ARCH_USERFAULTFD_MINOR + bool + help + Arch has userfaultfd minor fault support + config PTE_MARKER bool -- cgit v1.2.3 From f0cdaa5687d3178f3759033a7ff8411720b61647 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 12 May 2022 20:22:56 -0700 Subject: cgroups: refactor children cgroups in memcg tests Patch series "Fix bugs in memcontroller cgroup tests", v2. tools/testing/selftests/cgroup/test_memcontrol.c contains a set of testcases which validate expected behavior of the cgroup memory controller. Roman Gushchin recently sent out a patchset that fixed a few issues in the test. This patchset continues that effort by fixing a few more issues that were causing non-deterministic failures in the suite. With this patchset, I'm unable to reproduce any more errors after running the tests in a continuous loop for many iterations. Before, I was able to reproduce at least one of the errors fixed in this patchset with just one or two runs. This patch (of 5): In test_memcg_min() and test_memcg_low(), there is an array of four sibling cgroups. All but one of these sibling groups does a 50MB allocation, and the group that does no allocation is the third of four in the array. This is not a problem per se, but makes it a bit tricky to do some assertions in test_memcg_low(), as we want to make assertions on the siblings based on whether or not they performed allocations. Having a static index before which all groups have performed an allocation makes this cleaner. This patch therefore reorders the sibling groups so that the group that performs no allocations is the last in the array. A follow-on patch will leverage this to fix a bug in the test that incorrectly asserts that a sibling group that had performed an allocation, but only had protection from its parent, will not observe any memory.events.low events during reclaim. 
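The ~33M and ~17M figures in the test's comments (quoted in the hunks below) come from proportional sharing of the parent's 50M protection when the children's combined claims, min(memory.min, usage) summed over C, D, E and F, add up to 75M and thus exceed it. The stand-alone sketch below is only a back-of-the-envelope model of that over-commit scaling, not the kernel's effective_protection() logic, and ignores recursive protection and reclaim dynamics:

    #include <stdio.h>

    int main(void)
    {
        /* A/B memory.min = 50M; children C, D, E, F, values in MB */
        double parent = 50;
        double min[]   = { 75, 25, 0, 500 };
        double usage[] = { 50, 50, 50, 0 };
        double claim[4], total = 0;
        int i;

        for (i = 0; i < 4; i++) {
            /* each child can only claim protection it actually uses */
            claim[i] = min[i] < usage[i] ? min[i] : usage[i];
            total += claim[i];
        }
        for (i = 0; i < 4; i++) {
            /* scale down proportionally when claims exceed the budget */
            double eff = total > parent ? claim[i] * parent / total
                                        : claim[i];
            printf("child %d: ~%.0fM protected\n", i, eff);
        }
        return 0;
    }

With the non-allocating child moved to the last slot, the children that expect a non-zero share are exactly the ones below a fixed index, which is what the reordering buys.
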
Link: https://lkml.kernel.org/r/20220423155619.3669555-1-void@manifault.com Link: https://lkml.kernel.org/r/20220423155619.3669555-2-void@manifault.com Signed-off-by: David Vernet Acked-by: Roman Gushchin Cc: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 28 ++++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index ee7defb17280..d240a391f99e 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -248,8 +248,8 @@ static int cg_test_proc_killed(const char *cgroup) * A/B memory.min = 50M, memory.current = 50M * A/B/C memory.min = 75M, memory.current = 50M * A/B/D memory.min = 25M, memory.current = 50M - * A/B/E memory.min = 500M, memory.current = 0 - * A/B/F memory.min = 0, memory.current = 50M + * A/B/E memory.min = 0, memory.current = 50M + * A/B/F memory.min = 500M, memory.current = 0 * * Usages are pagecache, but the test keeps a running * process in every leaf cgroup. @@ -259,7 +259,7 @@ static int cg_test_proc_killed(const char *cgroup) * A/B memory.current ~= 50M * A/B/C memory.current ~= 33M * A/B/D memory.current ~= 17M - * A/B/E memory.current ~= 0 + * A/B/F memory.current ~= 0 * * After that it tries to allocate more than there is * unprotected memory in A available, and checks @@ -325,7 +325,7 @@ static int test_memcg_min(const char *root) if (cg_create(children[i])) goto cleanup; - if (i == 2) + if (i > 2) continue; cg_run_nowait(children[i], alloc_pagecache_50M_noexit, @@ -340,9 +340,9 @@ static int test_memcg_min(const char *root) goto cleanup; if (cg_write(children[1], "memory.min", "25M")) goto cleanup; - if (cg_write(children[2], "memory.min", "500M")) + if (cg_write(children[2], "memory.min", "0")) goto cleanup; - if (cg_write(children[3], "memory.min", "0")) + if (cg_write(children[3], "memory.min", "500M")) goto cleanup; attempts = 0; @@ -368,7 +368,7 @@ static int test_memcg_min(const char *root) if (!values_close(c[1], MB(17), 10)) goto cleanup; - if (!values_close(c[2], 0, 1)) + if (c[3] != 0) goto cleanup; if (!cg_run(parent[2], alloc_anon, (void *)MB(170))) @@ -405,8 +405,8 @@ cleanup: * A/B memory.low = 50M, memory.current = 50M * A/B/C memory.low = 75M, memory.current = 50M * A/B/D memory.low = 25M, memory.current = 50M - * A/B/E memory.low = 500M, memory.current = 0 - * A/B/F memory.low = 0, memory.current = 50M + * A/B/E memory.low = 0, memory.current = 50M + * A/B/F memory.low = 500M, memory.current = 0 * * Usages are pagecache. 
* Then it creates A/G an creates a significant @@ -416,7 +416,7 @@ cleanup: * A/B memory.current ~= 50M * A/B/ memory.current ~= 33M * A/B/D memory.current ~= 17M - * A/B/E memory.current ~= 0 + * A/B/F memory.current ~= 0 * * After that it tries to allocate more than there is * unprotected memory in A available, @@ -480,7 +480,7 @@ static int test_memcg_low(const char *root) if (cg_create(children[i])) goto cleanup; - if (i == 2) + if (i > 2) continue; if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd)) @@ -495,9 +495,9 @@ static int test_memcg_low(const char *root) goto cleanup; if (cg_write(children[1], "memory.low", "25M")) goto cleanup; - if (cg_write(children[2], "memory.low", "500M")) + if (cg_write(children[2], "memory.low", "0")) goto cleanup; - if (cg_write(children[3], "memory.low", "0")) + if (cg_write(children[3], "memory.low", "500M")) goto cleanup; if (cg_run(parent[2], alloc_anon, (void *)MB(148))) @@ -515,7 +515,7 @@ static int test_memcg_low(const char *root) if (!values_close(c[1], MB(17), 10)) goto cleanup; - if (!values_close(c[2], 0, 1)) + if (c[3] != 0) goto cleanup; if (cg_run(parent[2], alloc_anon, (void *)MB(166))) { -- cgit v1.2.3 From cdc69458a5f3d4cf31372efd45fe92cec6b167e4 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 12 May 2022 20:22:57 -0700 Subject: cgroup: account for memory_recursiveprot in test_memcg_low() The test_memcg_low() testcase in test_memcontrol.c verifies the expected behavior of groups using the memory.low knob. Part of the testcase verifies that a group with memory.low that experiences reclaim due to memory pressure elsewhere in the system, observes memory.events.low events as a result of that reclaim. In commit 8a931f801340 ("mm: memcontrol: recursive memory.low protection"), the memory controller was updated to propagate memory.low and memory.min protection from a parent group to its children via a configurable memory_recursiveprot mount option. This unfortunately broke the memcg tests, which asserts that a sibling that experienced reclaim but had a memory.low value of 0, would not observe any memory.low events. This patch updates test_memcg_low() to account for the new behavior introduced by memory_recursiveprot. So as to make the test resilient to multiple configurations, the patch also adds a new proc_mount_contains() helper that checks for a string in /proc/mounts, and is used to toggle behavior based on whether the default memory_recursiveprot was present. 
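Whether a machine exhibits the recursive behavior depends purely on how the unified hierarchy was mounted, since memory_recursiveprot is a cgroup2 mount option that some init systems enable by default. As context only, a minimal sketch of mounting cgroup2 with that option, using a hypothetical mount point, could look like:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
        /* /mnt/cgroup2 is a hypothetical, pre-created mount point */
        if (mount("none", "/mnt/cgroup2", "cgroup2", 0,
                  "memory_recursiveprot"))
            perror("mount cgroup2");
        return 0;
    }

The helper added below simply looks for that option string in /proc/mounts to decide which set of assertions applies.
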
Link: https://lkml.kernel.org/r/20220423155619.3669555-3-void@manifault.com Signed-off-by: David Vernet Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Tejun Heo Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/cgroup_util.c | 12 ++++++++++++ tools/testing/selftests/cgroup/cgroup_util.h | 1 + tools/testing/selftests/cgroup/test_memcontrol.c | 16 +++++++++++++--- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index e6f3679cdcc0..b4d7027a44c3 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -528,6 +528,18 @@ int set_oom_adj_score(int pid, int score) return 0; } +int proc_mount_contains(const char *option) +{ + char buf[4 * PAGE_SIZE]; + ssize_t read; + + read = read_text("/proc/mounts", buf, sizeof(buf)); + if (read < 0) + return read; + + return strstr(buf, option) != NULL; +} + ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) { char path[PATH_MAX]; diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 628738532ac9..756f76052b44 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -48,6 +48,7 @@ extern int is_swap_enabled(void); extern int set_oom_adj_score(int pid, int score); extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); +int proc_mount_contains(const char *option); extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); extern pid_t clone_into_cgroup(int cgroup_fd); diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index d240a391f99e..4da138d05acb 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -21,6 +21,8 @@ #include "../kselftest.h" #include "cgroup_util.h" +static bool has_recursiveprot; + /* * This test creates two nested cgroups with and without enabling * the memory controller. @@ -525,15 +527,18 @@ static int test_memcg_low(const char *root) } for (i = 0; i < ARRAY_SIZE(children); i++) { + int no_low_events_index = has_recursiveprot ? 
2 : 1; + oom = cg_read_key_long(children[i], "memory.events", "oom "); low = cg_read_key_long(children[i], "memory.events", "low "); if (oom) goto cleanup; - if (i < 2 && low <= 0) + if (i <= no_low_events_index && low <= 0) goto cleanup; - if (i >= 2 && low) + if (i > no_low_events_index && low) goto cleanup; + } ret = KSFT_PASS; @@ -1382,7 +1387,7 @@ struct memcg_test { int main(int argc, char **argv) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i, proc_status, ret = EXIT_SUCCESS; if (cg_find_unified_root(root, sizeof(root))) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -1398,6 +1403,11 @@ int main(int argc, char **argv) if (cg_write(root, "cgroup.subtree_control", "+memory")) ksft_exit_skip("Failed to set memory controller\n"); + proc_status = proc_mount_contains("memory_recursiveprot"); + if (proc_status < 0) + ksft_exit_skip("Failed to query cgroup mount option\n"); + has_recursiveprot = proc_status; + for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { case KSFT_PASS: -- cgit v1.2.3 From 72b1e03aa7255094d15752952a7e56c5f39b6e37 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 12 May 2022 20:22:57 -0700 Subject: cgroup: account for memory_localevents in test_memcg_oom_group_leaf_events() The test_memcg_oom_group_leaf_events() testcase in the cgroup memcg tests validates that processes in a group that perform allocations exceeding memory.oom.group are killed. It also validates that the memory.events.oom_kill events are properly propagated in this case. Commit 06e11c907ea4 ("kselftests: memcg: update the oom group leaf events test") fixed test_memcg_oom_group_leaf_events() to account for the fact that the memory.events.oom_kill events in a child cgroup is propagated up to its parent. This behavior can actually be configured by the memory_localevents mount option, so this patch updates the testcase to properly account for the possible presence of this mount option. Link: https://lkml.kernel.org/r/20220423155619.3669555-4-void@manifault.com Signed-off-by: David Vernet Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Tejun Heo Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 4da138d05acb..31d5c3f9b2b2 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -21,6 +21,7 @@ #include "../kselftest.h" #include "cgroup_util.h" +static bool has_localevents; static bool has_recursiveprot; /* @@ -1200,6 +1201,7 @@ static int test_memcg_oom_group_leaf_events(const char *root) { int ret = KSFT_FAIL; char *parent, *child; + long parent_oom_events; parent = cg_name(root, "memcg_test_0"); child = cg_name(root, "memcg_test_0/memcg_test_1"); @@ -1345,14 +1347,20 @@ static int test_memcg_oom_group_score_events(const char *root) if (!cg_run(memcg, alloc_anon, (void *)MB(100))) goto cleanup; - if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3) - goto cleanup; + parent_oom_events = cg_read_key_long( + parent, "memory.events", "oom_kill "); + /* + * If memory_localevents is not enabled (the default), the parent should + * count OOM events in its children groups. Otherwise, it should not + * have observed any events. 
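The propagation difference is easy to observe by reading the oom_kill counter out of the parent's memory.events file. The sketch below is a stand-alone approximation of what the selftest's cg_read_key_long() helper already provides; events_key() and its interface are made up for this example:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Pass the key with its trailing space, e.g. "oom_kill ", so that
     * a shorter key such as "oom " does not also match this line. */
    static long events_key(const char *cgroup, const char *key)
    {
        char path[512], line[256];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "%s/memory.events", cgroup);
        f = fopen(path, "r");
        if (!f)
            return -1;
        while (fgets(line, sizeof(line), f)) {
            if (!strncmp(line, key, strlen(key))) {
                val = atol(line + strlen(key));
                break;
            }
        }
        fclose(f);
        return val;
    }

    int main(int argc, char **argv)
    {
        if (argc == 3)
            printf("%ld\n", events_key(argv[1], argv[2]));
        return 0;
    }

Without memory_localevents, a child's OOM kills should also show up in the parent's counter; with the option set, the parent's counter should stay at zero when only a child was killed.
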
+ */ + if ((has_localevents && parent_oom_events == 0) || + parent_oom_events > 0) + ret = KSFT_PASS; if (kill(safe_pid, SIGKILL)) goto cleanup; - ret = KSFT_PASS; - cleanup: if (memcg) cg_destroy(memcg); @@ -1361,7 +1369,6 @@ cleanup: return ret; } - #define T(x) { x, #x } struct memcg_test { int (*fn)(const char *root); @@ -1408,6 +1415,11 @@ int main(int argc, char **argv) ksft_exit_skip("Failed to query cgroup mount option\n"); has_recursiveprot = proc_status; + proc_status = proc_mount_contains("memory_localevents"); + if (proc_status < 0) + ksft_exit_skip("Failed to query cgroup mount option\n"); + has_localevents = proc_status; + for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { case KSFT_PASS: -- cgit v1.2.3 From 830316807e0275146cbd5d2ae66fd338d0dfd09e Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 12 May 2022 20:22:57 -0700 Subject: cgroup: remove racy check in test_memcg_sock() test_memcg_sock() in the cgroup memcg tests verifies expected memory accounting for sockets. The test forks a process which functions as a TCP server, and sends large buffers back and forth between itself (as the TCP client) and the forked TCP server. While doing so, it verifies that memory.current and memory.stat.sock look correct. There is currently a check in tcp_client() which asserts memory.current >= memory.stat.sock. This check is racy, as between memory.current and memory.stat.sock being queried, a packet could come in which causes mem_cgroup_charge_skmem() to be invoked. This could cause memory.stat.sock to exceed memory.current. Reversing the order of querying doesn't address the problem either, as memory may be reclaimed between the two calls. This patch therefore removes that assertion altogether and instead relies on the values_close() check that follows to validate the expected accounting. Link: https://lkml.kernel.org/r/20220423155619.3669555-5-void@manifault.com Signed-off-by: David Vernet Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Tejun Heo Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 31d5c3f9b2b2..6d3aace6be53 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -1102,9 +1102,6 @@ static int tcp_client(const char *cgroup, unsigned short port) if (current < 0 || sock < 0) goto close_sk; - if (current < sock) - goto close_sk; - if (values_close(current, sock, 10)) { ret = KSFT_PASS; break; -- cgit v1.2.3 From c1a31a2f7a9c08665f95a16c40b3551af43cb95c Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 12 May 2022 20:22:57 -0700 Subject: cgroup: fix racy check in alloc_pagecache_max_30M() helper function alloc_pagecache_max_30M() in the cgroup memcg tests performs a 50MB pagecache allocation, which it expects to be capped at 30MB due to the calling process having a memory.high setting of 30MB. After the allocation, the function contains a check that verifies that MB(29) < memory.current <= MB(30). This check can actually fail non-deterministically. The testcases that use this function are test_memcg_high() and test_memcg_max(), which set memory.high and memory.max to 30MB respectively for the cgroup under test.
The allocation can slightly exceed this number in both cases, and for memory.max, the process performing the allocation will not have the OOM killer invoked as it's performing a pagecache allocation. This patchset therefore updates the above check to instead use the values_close() helper function. Link: https://lkml.kernel.org/r/20220423155619.3669555-6-void@manifault.com Signed-off-by: David Vernet Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Tejun Heo Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 6d3aace6be53..6ab94317c87b 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -568,9 +568,14 @@ static int alloc_pagecache_max_30M(const char *cgroup, void *arg) { size_t size = MB(50); int ret = -1; - long current; + long current, high, max; int fd; + high = cg_read_long(cgroup, "memory.high"); + max = cg_read_long(cgroup, "memory.max"); + if (high != MB(30) && max != MB(30)) + goto cleanup; + fd = get_temp_fd(); if (fd < 0) return -1; @@ -579,7 +584,7 @@ static int alloc_pagecache_max_30M(const char *cgroup, void *arg) goto cleanup; current = cg_read_long(cgroup, "memory.current"); - if (current <= MB(29) || current > MB(30)) + if (!values_close(current, MB(30), 5)) goto cleanup; ret = 0; -- cgit v1.2.3 From b48d8a8e5ce53e3114a1ffe96563e3555b51d40b Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 12 May 2022 20:22:57 -0700 Subject: mm: page_isolation: move has_unmovable_pages() to mm/page_isolation.c Patch series "Use pageblock_order for cma and alloc_contig_range alignment", v11. This patchset tries to remove the MAX_ORDER-1 alignment requirement for CMA and alloc_contig_range(). It prepares for my upcoming changes to make MAX_ORDER adjustable at boot time[1]. The MAX_ORDER - 1 alignment requirement comes from the fact that alloc_contig_range() isolates pageblocks to remove free memory from the buddy allocator, but isolating only a subset of pageblocks within a page spanning across multiple pageblocks causes free page accounting issues. An isolated page might not be put into the right free list, since the code assumes the migratetype of the first pageblock as the whole free page migratetype. This is based on the discussion at [2]. To remove the requirement, this patchset: 1. isolates pages at pageblock granularity instead of max(MAX_ORDER_NR_PAGES, pageblock_nr_pages); 2. splits free pages across the specified range or migrates in-use pages across the specified range then splits the freed page to avoid free page accounting issues (it happens when multiple pageblocks within a single page have different migratetypes); 3. only checks unmovable pages within the range instead of MAX_ORDER - 1 aligned range during isolation to avoid alloc_contig_range() failure when pageblocks within a MAX_ORDER - 1 aligned range are allocated separately. 4. returns pages not in the range as it did before. One optimization might come later: 1. make MIGRATE_ISOLATE a separate bit to be able to restore the original migratetypes when isolation fails in the middle of the range. [1] https://lore.kernel.org/linux-mm/20210805190253.2795604-1-zi.yan@sent.com/ [2] https://lore.kernel.org/linux-mm/d19fb078-cb9b-f60f-e310-fdeea1b947d2@redhat.com/ This patch (of 6): has_unmovable_pages() is only used in mm/page_isolation.c.
Move it from mm/page_alloc.c and make it static. Link: https://lkml.kernel.org/r/20220425143118.2850746-2-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Oscar Salvador Reviewed-by: Mike Rapoport Acked-by: David Hildenbrand Cc: Vlastimil Babka Cc: Mel Gorman Cc: Eric Ren Cc: Christophe Leroy Cc: Minchan Kim Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 2 - mm/page_alloc.c | 119 ----------------------------------------- mm/page_isolation.c | 119 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 121 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 572458016331..e14eddf6741a 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,8 +33,6 @@ static inline bool is_migrate_isolate(int migratetype) #define MEMORY_OFFLINE 0x1 #define REPORT_FAILURE 0x2 -struct page *has_unmovable_pages(struct zone *zone, struct page *page, - int migratetype, int flags); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ddb0575c6b4a..0756f046b644 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8918,125 +8918,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/* - * This function checks whether pageblock includes unmovable pages or not. - * - * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable - * check without lock_page also may miss some movable non-lru pages at - * race condition. So you can't expect this function should be exact. - * - * Returns a page without holding a reference. If the caller wants to - * dereference that page (e.g., dumping), it has to make sure that it - * cannot get removed (e.g., via memory unplug) concurrently. - * - */ -struct page *has_unmovable_pages(struct zone *zone, struct page *page, - int migratetype, int flags) -{ - unsigned long iter = 0; - unsigned long pfn = page_to_pfn(page); - unsigned long offset = pfn % pageblock_nr_pages; - - if (is_migrate_cma_page(page)) { - /* - * CMA allocations (alloc_contig_range) really need to mark - * isolate CMA pageblocks even when they are not movable in fact - * so consider them movable here. - */ - if (is_migrate_cma(migratetype)) - return NULL; - - return page; - } - - for (; iter < pageblock_nr_pages - offset; iter++) { - page = pfn_to_page(pfn + iter); - - /* - * Both, bootmem allocations and memory holes are marked - * PG_reserved and are unmovable. We can even have unmovable - * allocations inside ZONE_MOVABLE, for example when - * specifying "movablecore". - */ - if (PageReserved(page)) - return page; - - /* - * If the zone is movable and we have ruled out all reserved - * pages then it should be reasonably safe to assume the rest - * is movable. - */ - if (zone_idx(zone) == ZONE_MOVABLE) - continue; - - /* - * Hugepages are not in LRU lists, but they're movable. - * THPs are on the LRU, but need to be counted as #small pages. - * We need not scan over tail pages because we don't - * handle each tail page individually in migration. 
- */ - if (PageHuge(page) || PageTransCompound(page)) { - struct page *head = compound_head(page); - unsigned int skip_pages; - - if (PageHuge(page)) { - if (!hugepage_migration_supported(page_hstate(head))) - return page; - } else if (!PageLRU(head) && !__PageMovable(head)) { - return page; - } - - skip_pages = compound_nr(head) - (page - head); - iter += skip_pages - 1; - continue; - } - - /* - * We can't use page_count without pin a page - * because another CPU can free compound page. - * This check already skips compound tails of THP - * because their page->_refcount is zero at all time. - */ - if (!page_ref_count(page)) { - if (PageBuddy(page)) - iter += (1 << buddy_order(page)) - 1; - continue; - } - - /* - * The HWPoisoned page may be not in buddy system, and - * page_count() is not 0. - */ - if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) - continue; - - /* - * We treat all PageOffline() pages as movable when offlining - * to give drivers a chance to decrement their reference count - * in MEM_GOING_OFFLINE in order to indicate that these pages - * can be offlined as there are no direct references anymore. - * For actually unmovable PageOffline() where the driver does - * not support this, we will fail later when trying to actually - * move these pages that still have a reference count > 0. - * (false negatives in this function only) - */ - if ((flags & MEMORY_OFFLINE) && PageOffline(page)) - continue; - - if (__PageMovable(page) || PageLRU(page)) - continue; - - /* - * If there are RECLAIMABLE pages, we need to check - * it. But now, memory offline itself doesn't call - * shrink_node_slabs() and it still to be fixed. - */ - return page; - } - return NULL; -} - #ifdef CONFIG_CONTIG_ALLOC static unsigned long pfn_max_align_down(unsigned long pfn) { diff --git a/mm/page_isolation.c b/mm/page_isolation.c index ff0ea6308299..df49f86a6ed1 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -15,6 +15,125 @@ #define CREATE_TRACE_POINTS #include +/* + * This function checks whether pageblock includes unmovable pages or not. + * + * PageLRU check without isolation or lru_lock could race so that + * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable + * check without lock_page also may miss some movable non-lru pages at + * race condition. So you can't expect this function should be exact. + * + * Returns a page without holding a reference. If the caller wants to + * dereference that page (e.g., dumping), it has to make sure that it + * cannot get removed (e.g., via memory unplug) concurrently. + * + */ +static struct page *has_unmovable_pages(struct zone *zone, struct page *page, + int migratetype, int flags) +{ + unsigned long iter = 0; + unsigned long pfn = page_to_pfn(page); + unsigned long offset = pfn % pageblock_nr_pages; + + if (is_migrate_cma_page(page)) { + /* + * CMA allocations (alloc_contig_range) really need to mark + * isolate CMA pageblocks even when they are not movable in fact + * so consider them movable here. + */ + if (is_migrate_cma(migratetype)) + return NULL; + + return page; + } + + for (; iter < pageblock_nr_pages - offset; iter++) { + page = pfn_to_page(pfn + iter); + + /* + * Both, bootmem allocations and memory holes are marked + * PG_reserved and are unmovable. We can even have unmovable + * allocations inside ZONE_MOVABLE, for example when + * specifying "movablecore". 
+ */ + if (PageReserved(page)) + return page; + + /* + * If the zone is movable and we have ruled out all reserved + * pages then it should be reasonably safe to assume the rest + * is movable. + */ + if (zone_idx(zone) == ZONE_MOVABLE) + continue; + + /* + * Hugepages are not in LRU lists, but they're movable. + * THPs are on the LRU, but need to be counted as #small pages. + * We need not scan over tail pages because we don't + * handle each tail page individually in migration. + */ + if (PageHuge(page) || PageTransCompound(page)) { + struct page *head = compound_head(page); + unsigned int skip_pages; + + if (PageHuge(page)) { + if (!hugepage_migration_supported(page_hstate(head))) + return page; + } else if (!PageLRU(head) && !__PageMovable(head)) { + return page; + } + + skip_pages = compound_nr(head) - (page - head); + iter += skip_pages - 1; + continue; + } + + /* + * We can't use page_count without pin a page + * because another CPU can free compound page. + * This check already skips compound tails of THP + * because their page->_refcount is zero at all time. + */ + if (!page_ref_count(page)) { + if (PageBuddy(page)) + iter += (1 << buddy_order(page)) - 1; + continue; + } + + /* + * The HWPoisoned page may be not in buddy system, and + * page_count() is not 0. + */ + if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) + continue; + + /* + * We treat all PageOffline() pages as movable when offlining + * to give drivers a chance to decrement their reference count + * in MEM_GOING_OFFLINE in order to indicate that these pages + * can be offlined as there are no direct references anymore. + * For actually unmovable PageOffline() where the driver does + * not support this, we will fail later when trying to actually + * move these pages that still have a reference count > 0. + * (false negatives in this function only) + */ + if ((flags & MEMORY_OFFLINE) && PageOffline(page)) + continue; + + if (__PageMovable(page) || PageLRU(page)) + continue; + + /* + * If there are RECLAIMABLE pages, we need to check + * it. But now, memory offline itself doesn't call + * shrink_node_slabs() and it still to be fixed. + */ + return page; + } + return NULL; +} + static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags) { struct zone *zone = page_zone(page); -- cgit v1.2.3 From 844fbae63e468e32e3a3970868602a4bb658aff9 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 12 May 2022 20:22:58 -0700 Subject: mm: page_isolation: check specified range for unmovable pages Enable set_migratetype_isolate() to check specified range for unmovable pages during isolation to prepare arbitrary range page isolation. The functionality will take effect in upcoming commits by adjusting the callers of start_isolate_page_range(), which uses set_migratetype_isolate(). For example, alloc_contig_range(), which calls start_isolate_page_range(), accepts unaligned ranges, but because page isolation is currently done at MAX_ORDER_NR_PAEGS granularity, pages that are out of the specified range but withint MAX_ORDER_NR_PAEGS alignment might be attempted for isolation and the failure of isolating these unrelated pages fails the whole operation undesirably. 
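As a rough sketch of the idea (illustrative only; clamp_to_pageblock() is a hypothetical helper name, not part of this patch), the scan window is simply the intersection of the requested range and the pageblock being isolated:

	/*
	 * Hypothetical helper for illustration: restrict the unmovable-page
	 * scan to the part of the pageblock that overlaps [start_pfn, end_pfn).
	 */
	static void clamp_to_pageblock(unsigned long pb_pfn,
				       unsigned long start_pfn, unsigned long end_pfn,
				       unsigned long *check_start, unsigned long *check_end)
	{
		unsigned long pb_start = ALIGN_DOWN(pb_pfn, pageblock_nr_pages);
		unsigned long pb_end = pb_start + pageblock_nr_pages;

		*check_start = max(pb_start, start_pfn);
		*check_end = min(pb_end, end_pfn);
	}

The hunk below does the same computation inline with max(), min() and ALIGN().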
Link: https://lkml.kernel.org/r/20220425143118.2850746-3-zi.yan@sent.com Signed-off-by: Zi Yan Cc: Christophe Leroy Cc: David Hildenbrand Cc: Eric Ren Cc: kernel test robot Cc: Mel Gorman Cc: Mike Rapoport Cc: Minchan Kim Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_isolation.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index df49f86a6ed1..c2f7a8bb634d 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -16,7 +16,9 @@ #include /* - * This function checks whether pageblock includes unmovable pages or not. + * This function checks whether the range [start_pfn, end_pfn) includes + * unmovable pages or not. The range must fall into a single pageblock and + * consequently belong to a single zone. * * PageLRU check without isolation or lru_lock could race so that * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable @@ -28,12 +30,15 @@ * cannot get removed (e.g., via memory unplug) concurrently. * */ -static struct page *has_unmovable_pages(struct zone *zone, struct page *page, - int migratetype, int flags) +static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn, + int migratetype, int flags) { - unsigned long iter = 0; - unsigned long pfn = page_to_pfn(page); - unsigned long offset = pfn % pageblock_nr_pages; + struct page *page = pfn_to_page(start_pfn); + struct zone *zone = page_zone(page); + unsigned long pfn; + + VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) != + ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages)); if (is_migrate_cma_page(page)) { /* @@ -47,8 +52,8 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page, return page; } - for (; iter < pageblock_nr_pages - offset; iter++) { - page = pfn_to_page(pfn + iter); + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + page = pfn_to_page(pfn); /* * Both, bootmem allocations and memory holes are marked @@ -85,7 +90,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page, } skip_pages = compound_nr(head) - (page - head); - iter += skip_pages - 1; + pfn += skip_pages - 1; continue; } @@ -97,7 +102,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page, */ if (!page_ref_count(page)) { if (PageBuddy(page)) - iter += (1 << buddy_order(page)) - 1; + pfn += (1 << buddy_order(page)) - 1; continue; } @@ -134,11 +139,18 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page, return NULL; } -static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags) +/* + * This function set pageblock migratetype to isolate if no unmovable page is + * present in [start_pfn, end_pfn). The pageblock must intersect with + * [start_pfn, end_pfn). + */ +static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags, + unsigned long start_pfn, unsigned long end_pfn) { struct zone *zone = page_zone(page); struct page *unmovable; unsigned long flags; + unsigned long check_unmovable_start, check_unmovable_end; spin_lock_irqsave(&zone->lock, flags); @@ -155,8 +167,16 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ /* * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. + * + * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock + * to avoid redundant checks. 
*/ - unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags); + check_unmovable_start = max(page_to_pfn(page), start_pfn); + check_unmovable_end = min(ALIGN(page_to_pfn(page) + 1, pageblock_nr_pages), + end_pfn); + + unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, + migratetype, isol_flags); if (!unmovable) { unsigned long nr_pages; int mt = get_pageblock_migratetype(page); @@ -313,7 +333,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && set_migratetype_isolate(page, migratetype, flags)) { + if (page && set_migratetype_isolate(page, migratetype, flags, + start_pfn, end_pfn)) { undo_isolate_page_range(start_pfn, pfn, migratetype); return -EBUSY; } -- cgit v1.2.3 From b2c9e2fbba32539626522b6aed30d1dde7b7e971 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 12 May 2022 20:22:58 -0700 Subject: mm: make alloc_contig_range work at pageblock granularity alloc_contig_range() worked at MAX_ORDER_NR_PAGES granularity to avoid merging pageblocks with different migratetypes. It might unnecessarily convert extra pageblocks at the beginning and at the end of the range. Change alloc_contig_range() to work at pageblock granularity. Special handling is needed for free pages and in-use pages across the boundaries of the range specified by alloc_contig_range(), because these partially isolated pages cause free page accounting issues. The free pages will be split and freed into separate migratetype lists; the in-use pages will be migrated, then the freed pages will be handled in the aforementioned way. [ziy@nvidia.com: fix deadlock/crash] Link: https://lkml.kernel.org/r/23A7297E-6C84-4138-A9FE-3598234004E6@nvidia.com Link: https://lkml.kernel.org/r/20220425143118.2850746-4-zi.yan@sent.com Signed-off-by: Zi Yan Reported-by: kernel test robot Cc: Christophe Leroy Cc: David Hildenbrand Cc: Eric Ren Cc: Mel Gorman Cc: Mike Rapoport Cc: Minchan Kim Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 4 +- mm/internal.h | 6 ++ mm/memory_hotplug.c | 3 +- mm/page_alloc.c | 54 +++++++++--- mm/page_isolation.c | 193 +++++++++++++++++++++++++++++++++++++++-- 5 files changed, 242 insertions(+), 18 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index e14eddf6741a..5456b7be38ae 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -42,7 +42,7 @@ int move_freepages_block(struct zone *zone, struct page *page, */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype, int flags); + int migratetype, int flags, gfp_t gfp_flags); /* * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. @@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, */ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype); + int migratetype); /* * Test all pages in [start_pfn, end_pfn) are isolated or not.
diff --git a/mm/internal.h b/mm/internal.h index ddd09245a6db..a770029beb08 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -359,6 +359,9 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); +void split_free_page(struct page *free_page, + int order, unsigned long split_pfn_offset); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -422,6 +425,9 @@ isolate_freepages_range(struct compact_control *cc, int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); + +int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end); #endif int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool only_stealable, bool *can_steal); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e99fd60548f5..945191708ef6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1837,7 +1837,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - MEMORY_OFFLINE | REPORT_FAILURE); + MEMORY_OFFLINE | REPORT_FAILURE, + GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL); if (ret) { reason = "failure to isolate range"; goto failed_removal_pcplists_disabled; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0756f046b644..0c7252ed14a0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1094,6 +1094,43 @@ done_merging: page_reporting_notify_free(order); } +/** + * split_free_page() -- split a free page at split_pfn_offset + * @free_page: the original free page + * @order: the order of the page + * @split_pfn_offset: split offset within the page + * + * It is used when the free page crosses two pageblocks with different migratetypes + * at split_pfn_offset within the page. The split free page will be put into + * separate migratetype lists afterwards. Otherwise, the function achieves + * nothing. + */ +void split_free_page(struct page *free_page, + int order, unsigned long split_pfn_offset) +{ + struct zone *zone = page_zone(free_page); + unsigned long free_page_pfn = page_to_pfn(free_page); + unsigned long pfn; + unsigned long flags; + int free_page_order; + + spin_lock_irqsave(&zone->lock, flags); + del_page_from_free_list(free_page, zone, order); + for (pfn = free_page_pfn; + pfn < free_page_pfn + (1UL << order);) { + int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); + + free_page_order = ffs(split_pfn_offset) - 1; + __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, + mt, FPI_NONE); + pfn += 1UL << free_page_order; + split_pfn_offset -= (1UL << free_page_order); + /* we have done the first part, now switch to second part */ + if (split_pfn_offset == 0) + split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); + } + spin_unlock_irqrestore(&zone->lock, flags); +} /* * A bad page could be due to a number of fields. Instead of multiple branches, * try and check multiple fields with one check. The caller must do a detailed @@ -8951,7 +8988,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list) #endif /* [start, end) must belong to a single zone. */ -static int __alloc_contig_migrate_range(struct compact_control *cc, +int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. 
*/ @@ -9034,7 +9071,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; - unsigned int order; + int order; int ret = 0; struct compact_control cc = { @@ -9053,14 +9090,11 @@ int alloc_contig_range(unsigned long start, unsigned long end, * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may * have different sizes, and due to the way page allocator - * work, we align the range to biggest of the two pages so - * that page allocator won't try to merge buddies from - * different pageblocks and change MIGRATE_ISOLATE to some - * other migration type. + * work, start_isolate_page_range() has special handlings for this. * * Once the pageblocks are marked as MIGRATE_ISOLATE, we * migrate the pages from an unaligned range (ie. pages that - * we are interested in). This will put all the pages in + * we are interested in). This will put all the pages in * range back to page allocator as MIGRATE_ISOLATE. * * When this is done, we take the pages in range from page @@ -9074,9 +9108,9 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ ret = start_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype, 0); + pfn_max_align_up(end), migratetype, 0, gfp_mask); if (ret) - return ret; + goto done; drain_all_pages(cc.zone); @@ -9096,7 +9130,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = 0; /* - * Pages from [start, end) are within a MAX_ORDER_NR_PAGES + * Pages from [start, end) are within a pageblock_nr_pages * aligned blocks that are marked as MIGRATE_ISOLATE. What's * more, all pages in [start, end) are free in page allocator. * What we are going to do is to allocate all pages from diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c2f7a8bb634d..8a0f16d2e4c3 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -203,7 +203,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ return -EBUSY; } -static void unset_migratetype_isolate(struct page *page, unsigned migratetype) +static void unset_migratetype_isolate(struct page *page, int migratetype) { struct zone *zone; unsigned long flags, nr_pages; @@ -279,6 +279,166 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) return NULL; } +/** + * isolate_single_pageblock() -- tries to isolate a pageblock that might be + * within a free or in-use page. + * @boundary_pfn: pageblock-aligned pfn that a page might cross + * @gfp_flags: GFP flags used for migrating pages + * @isolate_before: isolate the pageblock before the boundary_pfn + * + * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one + * pageblock. When not all pageblocks within a page are isolated at the same + * time, free page accounting can go wrong. For example, in the case of + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks. + * [ MAX_ORDER-1 ] + * [ pageblock0 | pageblock1 ] + * When either pageblock is isolated, if it is a free page, the page is not + * split into separate migratetype lists, which is supposed to; if it is an + * in-use page and freed later, __free_one_page() does not split the free page + * either. The function handles this by splitting the free page or migrating + * the in-use page then splitting the free page. 
+ */ +static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, + bool isolate_before) +{ + unsigned char saved_mt; + unsigned long start_pfn; + unsigned long isolate_pageblock; + unsigned long pfn; + struct zone *zone; + + VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages)); + + if (isolate_before) + isolate_pageblock = boundary_pfn - pageblock_nr_pages; + else + isolate_pageblock = boundary_pfn; + + /* + * scan at the beginning of MAX_ORDER_NR_PAGES aligned range to avoid + * only isolating a subset of pageblocks from a bigger than pageblock + * free or in-use page. Also make sure all to-be-isolated pageblocks + * are within the same zone. + */ + zone = page_zone(pfn_to_page(isolate_pageblock)); + start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES), + zone->zone_start_pfn); + + saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); + set_pageblock_migratetype(pfn_to_page(isolate_pageblock), MIGRATE_ISOLATE); + + /* + * Bail out early when the to-be-isolated pageblock does not form + * a free or in-use page across boundary_pfn: + * + * 1. isolate before boundary_pfn: the page after is not online + * 2. isolate after boundary_pfn: the page before is not online + * + * This also ensures correctness. Without it, when isolate after + * boundary_pfn and [start_pfn, boundary_pfn) are not online, + * __first_valid_page() will return unexpected NULL in the for loop + * below. + */ + if (isolate_before) { + if (!pfn_to_online_page(boundary_pfn)) + return 0; + } else { + if (!pfn_to_online_page(boundary_pfn - 1)) + return 0; + } + + for (pfn = start_pfn; pfn < boundary_pfn;) { + struct page *page = __first_valid_page(pfn, boundary_pfn - pfn); + + VM_BUG_ON(!page); + pfn = page_to_pfn(page); + /* + * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any + * free pages in [start_pfn, boundary_pfn), its head page will + * always be in the range. + */ + if (PageBuddy(page)) { + int order = buddy_order(page); + + if (pfn + (1UL << order) > boundary_pfn) + split_free_page(page, order, boundary_pfn - pfn); + pfn += (1UL << order); + continue; + } + /* + * migrate compound pages then let the free page handling code + * above do the rest. If migration is not possible, just fail. + */ + if (PageCompound(page)) { + unsigned long nr_pages = compound_nr(page); + struct page *head = compound_head(page); + unsigned long head_pfn = page_to_pfn(head); + + if (head_pfn + nr_pages < boundary_pfn) { + pfn = head_pfn + nr_pages; + continue; + } +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + /* + * hugetlb, lru compound (THP), and movable compound pages + * can be migrated. Otherwise, fail the isolation. + */ + if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) { + int order; + unsigned long outer_pfn; + int ret; + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(head_pfn)), + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, + .no_set_skip_hint = true, + .gfp_mask = gfp_flags, + .alloc_contig = true, + }; + INIT_LIST_HEAD(&cc.migratepages); + + ret = __alloc_contig_migrate_range(&cc, head_pfn, + head_pfn + nr_pages); + + if (ret) + goto failed; + /* + * reset pfn to the head of the free page, so + * that the free page handling code above can split + * the free page to the right migratetype list. + * + * head_pfn is not used here as a hugetlb page order + * can be bigger than MAX_ORDER-1, but after it is + * freed, the free page order is not. 
Use pfn within + * the range to find the head of the free page. + */ + order = 0; + outer_pfn = pfn; + while (!PageBuddy(pfn_to_page(outer_pfn))) { + if (++order >= MAX_ORDER) { + outer_pfn = pfn; + break; + } + outer_pfn &= ~0UL << order; + } + pfn = outer_pfn; + continue; + } else +#endif + goto failed; + } + + pfn++; + } + return 0; +failed: + /* restore the original migratetype */ + set_pageblock_migratetype(pfn_to_page(isolate_pageblock), saved_mt); + return -EBUSY; +} + /** * start_isolate_page_range() - make page-allocation-type of range of pages to * be MIGRATE_ISOLATE. @@ -293,6 +453,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * and PageOffline() pages. * REPORT_FAILURE - report details about the failure to * isolate the range + * @gfp_flags: GFP flags used for migrating pages that sit across the + * range boundaries. * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the @@ -301,6 +463,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * pages in the range finally, the caller have to free all pages in the range. * test_page_isolated() can be used for test it. * + * The function first tries to isolate the pageblocks at the beginning and end + * of the range, since there might be pages across the range boundaries. + * Afterwards, it isolates the rest of the range. + * * There is no high level synchronization mechanism that prevents two threads * from trying to isolate overlapping ranges. If this happens, one thread * will notice pageblocks in the overlapping range already set to isolate. @@ -321,21 +487,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * Return: 0 on success and -EBUSY if any part of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype, int flags) + int migratetype, int flags, gfp_t gfp_flags) { unsigned long pfn; struct page *page; + int ret; BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); - for (pfn = start_pfn; - pfn < end_pfn; + /* isolate [start_pfn, start_pfn + pageblock_nr_pages) pageblock */ + ret = isolate_single_pageblock(start_pfn, gfp_flags, false); + if (ret) + return ret; + + /* isolate [end_pfn - pageblock_nr_pages, end_pfn) pageblock */ + ret = isolate_single_pageblock(end_pfn, gfp_flags, true); + if (ret) { + unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype); + return ret; + } + + /* skip isolated pageblocks at the beginning and end */ + for (pfn = start_pfn + pageblock_nr_pages; + pfn < end_pfn - pageblock_nr_pages; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); if (page && set_migratetype_isolate(page, migratetype, flags, start_pfn, end_pfn)) { undo_isolate_page_range(start_pfn, pfn, migratetype); + unset_migratetype_isolate( + pfn_to_page(end_pfn - pageblock_nr_pages), + migratetype); return -EBUSY; } } @@ -346,7 +529,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Make isolated pages available again. */ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype) + int migratetype) { unsigned long pfn; struct page *page; -- cgit v1.2.3 From 6e263fff1de48fcd97b680b54cd8d1695fc3c776 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 12 May 2022 20:22:58 -0700 Subject: mm: page_isolation: enable arbitrary range page isolation. 
Now start_isolate_page_range() is ready to handle arbitrary range isolation, so move the alignment check/adjustment into the function body. Do the same for its counterpart undo_isolate_page_range(). alloc_contig_range(), its caller, can pass an arbitrary range instead of a MAX_ORDER_NR_PAGES aligned one. Link: https://lkml.kernel.org/r/20220425143118.2850746-5-zi.yan@sent.com Signed-off-by: Zi Yan Cc: Christophe Leroy Cc: David Hildenbrand Cc: Eric Ren Cc: kernel test robot Cc: Mel Gorman Cc: Mike Rapoport Cc: Minchan Kim Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 16 ++-------------- mm/page_isolation.c | 33 ++++++++++++++++----------------- 2 files changed, 18 insertions(+), 31 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0c7252ed14a0..3d7de8a96143 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8956,16 +8956,6 @@ void *__init alloc_large_system_hash(const char *tablename, } #ifdef CONFIG_CONTIG_ALLOC -static unsigned long pfn_max_align_down(unsigned long pfn) -{ - return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES); -} - -static unsigned long pfn_max_align_up(unsigned long pfn) -{ - return ALIGN(pfn, MAX_ORDER_NR_PAGES); -} - #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* Usage: See admin-guide/dynamic-debug-howto.rst */ @@ -9107,8 +9097,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, * put back to page allocator so that buddy can use them. */ - ret = start_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype, 0, gfp_mask); + ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask); if (ret) goto done; @@ -9189,8 +9178,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, free_contig_range(end, outer_end - end); done: - undo_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype); + undo_isolate_page_range(start, end, migratetype); return ret; } EXPORT_SYMBOL(alloc_contig_range); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 8a0f16d2e4c3..b3f074d1682e 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -444,7 +444,6 @@ failed: * be MIGRATE_ISOLATE. * @start_pfn: The lower PFN of the range to be isolated. * @end_pfn: The upper PFN of the range to be isolated. - * start_pfn/end_pfn must be aligned to pageblock_order. * @migratetype: Migrate type to set in error recovery. 
* @flags: The following flags are allowed (they can be combined in * a bit mask) @@ -491,33 +490,33 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, { unsigned long pfn; struct page *page; + /* isolation is done at page block granularity */ + unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); + unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); int ret; - BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); - BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); - - /* isolate [start_pfn, start_pfn + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(start_pfn, gfp_flags, false); + /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ + ret = isolate_single_pageblock(isolate_start, gfp_flags, false); if (ret) return ret; - /* isolate [end_pfn - pageblock_nr_pages, end_pfn) pageblock */ - ret = isolate_single_pageblock(end_pfn, gfp_flags, true); + /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ + ret = isolate_single_pageblock(isolate_end, gfp_flags, true); if (ret) { - unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype); + unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); return ret; } /* skip isolated pageblocks at the beginning and end */ - for (pfn = start_pfn + pageblock_nr_pages; - pfn < end_pfn - pageblock_nr_pages; + for (pfn = isolate_start + pageblock_nr_pages; + pfn < isolate_end - pageblock_nr_pages; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); if (page && set_migratetype_isolate(page, migratetype, flags, start_pfn, end_pfn)) { - undo_isolate_page_range(start_pfn, pfn, migratetype); + undo_isolate_page_range(isolate_start, pfn, migratetype); unset_migratetype_isolate( - pfn_to_page(end_pfn - pageblock_nr_pages), + pfn_to_page(isolate_end - pageblock_nr_pages), migratetype); return -EBUSY; } @@ -533,12 +532,12 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, { unsigned long pfn; struct page *page; + unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); + unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); - BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); - BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); - for (pfn = start_pfn; - pfn < end_pfn; + for (pfn = isolate_start; + pfn < isolate_end; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); if (!page || !is_migrate_isolate_page(page)) -- cgit v1.2.3 From 11ac3e87ce09c27f4587a8c4fe0829d814021a82 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 12 May 2022 20:22:58 -0700 Subject: mm: cma: use pageblock_order as the single alignment Now alloc_contig_range() works at pageblock granularity. Change CMA allocation, which uses alloc_contig_range(), to use pageblock_nr_pages alignment. 
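As a sketch of what the relaxed requirement means in practice (illustrative values, not part of the patch): with 4KB pages and pageblock_order = 9, the minimum CMA granularity drops from MAX_ORDER_NR_PAGES to a single 2MB pageblock.

	/* Sketch: the new minimum granularity for a CMA reservation */
	unsigned long min_align_pages = pageblock_nr_pages;		/* CMA_MIN_ALIGNMENT_PAGES */
	phys_addr_t min_align_bytes = PAGE_SIZE * min_align_pages;	/* CMA_MIN_ALIGNMENT_BYTES */

	/* e.g. round a requested reservation size up to that granularity */
	size = ALIGN(size, min_align_bytes);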
Link: https://lkml.kernel.org/r/20220425143118.2850746-6-zi.yan@sent.com Signed-off-by: Zi Yan Cc: Christophe Leroy Cc: David Hildenbrand Cc: Eric Ren Cc: kernel test robot Cc: Mel Gorman Cc: Mike Rapoport Cc: Minchan Kim Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/cma.h | 4 ++-- include/linux/mmzone.h | 5 +---- mm/page_alloc.c | 4 ++-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index a6f637342740..63873b93deaa 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -17,11 +17,11 @@ #define CMA_MAX_NAME 64 /* - * TODO: once the buddy -- especially pageblock merging and alloc_contig_range() + * the buddy -- especially pageblock merging and alloc_contig_range() * -- can deal with only some pageblocks of a higher-order page being * MIGRATE_CMA, we can use pageblock_nr_pages. */ -#define CMA_MIN_ALIGNMENT_PAGES MAX_ORDER_NR_PAGES +#define CMA_MIN_ALIGNMENT_PAGES pageblock_nr_pages #define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES) struct cma; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 46ffab808f03..aab70355d64f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -54,10 +54,7 @@ enum migratetype { * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by - * __free_pageblock_cma() function. What is important though - * is that a range of pageblocks must be aligned to - * MAX_ORDER_NR_PAGES should biggest page be bigger than - * a single pageblock. + * __free_pageblock_cma() function. */ MIGRATE_CMA, #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3d7de8a96143..732e01e9d0e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9046,8 +9046,8 @@ int __alloc_contig_migrate_range(struct compact_control *cc, * be either of the two. * @gfp_mask: GFP mask to use during compaction * - * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES - * aligned. The PFN range must belong to a single zone. + * The PFN range does not have to be pageblock aligned. The PFN range must + * belong to a single zone. * * The first thing this routine does is attempt to MIGRATE_ISOLATE all * pageblocks in the range. Once isolated, the pageblocks should not -- cgit v1.2.3 From 448b8ec3bf115b38c82c31393a1debab0e0692c5 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 12 May 2022 20:22:58 -0700 Subject: drivers: virtio_mem: use pageblock size as the minimum virtio_mem size. alloc_contig_range() now only needs to be aligned to pageblock_nr_pages, drop virtio_mem size requirement that it needs to be MAX_ORDER_NR_PAGES. Link: https://lkml.kernel.org/r/20220425143118.2850746-7-zi.yan@sent.com Signed-off-by: Zi Yan Cc: Christophe Leroy Cc: David Hildenbrand Cc: Eric Ren Cc: kernel test robot Cc: Mel Gorman Cc: Mike Rapoport Cc: Minchan Kim Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/virtio/virtio_mem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index e7d6b679596d..e07486f01999 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2476,10 +2476,10 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm) VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); /* - * TODO: once alloc_contig_range() works reliably with pageblock - * granularity on ZONE_NORMAL, use pageblock_nr_pages instead. 
+ * alloc_contig_range() works reliably with pageblock + * granularity on ZONE_NORMAL, use pageblock_nr_pages. */ - sb_size = PAGE_SIZE * MAX_ORDER_NR_PAGES; + sb_size = PAGE_SIZE * pageblock_nr_pages; sb_size = max_t(uint64_t, vm->device_block_size, sb_size); if (sb_size < memory_block_size_bytes() && !force_bbm) { -- cgit v1.2.3 From d8ff6fde8e88d801b62328883680c202510ed518 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:22:58 -0700 Subject: mm/vmscan: take min_slab_pages into account when try to call shrink_node Since commit 6b4f7799c6a5 ("mm: vmscan: invoke slab shrinkers from shrink_zone()"), slab reclaim and lru page reclaim are done together in the shrink_node. So we should take min_slab_pages into account when try to call shrink_node. Link: https://lkml.kernel.org/r/20220425112118.20924-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Huang Ying Cc: Christoph Hellwig Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a9761b04564c..5ac0a71dc0df 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4713,7 +4713,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(p, &sc.reclaim_state); - if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { + if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || + node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { /* * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. -- cgit v1.2.3 From 0d6ea3ac94ca77c5273b064524ac5079312052a0 Mon Sep 17 00:00:00 2001 From: Jagdish Gediya Date: Thu, 12 May 2022 20:22:59 -0700 Subject: lib/kstrtox.c: add "false"/"true" support to kstrtobool() At many places in kernel, It is necessary to convert sysfs input to corresponding bool value e.g. "false" or "0" need to be converted to bool false, "true" or "1" need to be converted to bool true, places where such conversion is needed currently check the input string manually, kstrtobool() can be utilized at such places but currently it doesn't have support to accept "false"/"true". Add support to accept "false"/"true" as valid string in kstrtobool(). [akpm@linux-foundation.org: undo s/iff/if/, per Matthew] Link: https://lkml.kernel.org/r/20220426180203.70782-1-jvgediya@linux.ibm.com Signed-off-by: Jagdish Gediya Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Andy Shevchenko Cc: "Huang, Ying" Cc: Dave Hansen Cc: Jonathan Cameron Cc: Alexey Dobriyan Cc: Richard Fitzgerald Cc: Petr Mladek Signed-off-by: Andrew Morton --- lib/kstrtox.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/kstrtox.c b/lib/kstrtox.c index 886510d248e5..08c14019841a 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -340,7 +340,7 @@ EXPORT_SYMBOL(kstrtos8); * @s: input string * @res: result * - * This routine returns 0 iff the first character is one of 'Yy1Nn0', or + * This routine returns 0 iff the first character is one of 'YyTt1NnFf0', or * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value * pointed to by res is updated upon finding a match. 
*/ @@ -353,11 +353,15 @@ int kstrtobool(const char *s, bool *res) switch (s[0]) { case 'y': case 'Y': + case 't': + case 'T': case '1': *res = true; return 0; case 'n': case 'N': + case 'f': + case 'F': case '0': *res = false; return 0; -- cgit v1.2.3 From 717aeab42943efa7cfa876b3b687c6ff36eae867 Mon Sep 17 00:00:00 2001 From: Jagdish Gediya Date: Thu, 12 May 2022 20:22:59 -0700 Subject: mm: convert sysfs input to bool using kstrtobool() Sysfs input conversion to corrosponding bool value e.g. "false" or "0" to false, "true" or "1" to true are currently handled through strncmp at multiple places. Use kstrtobool() to convert sysfs input to bool value. [akpm@linux-foundation.org: propagate kstrtobool() return value, per Andy] Link: https://lkml.kernel.org/r/20220426180203.70782-2-jvgediya@linux.ibm.com Signed-off-by: Jagdish Gediya Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Andy Shevchenko Cc: Alexey Dobriyan Cc: Dave Hansen Cc: "Huang, Ying" Cc: Jonathan Cameron Cc: Petr Mladek Cc: Richard Fitzgerald Signed-off-by: Andrew Morton --- mm/migrate.c | 11 +++++------ mm/swap_state.c | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index b2678279eb43..ab46ad93af9f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2523,12 +2523,11 @@ static ssize_t numa_demotion_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) - numa_demotion_enabled = true; - else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) - numa_demotion_enabled = false; - else - return -EINVAL; + ssize_t ret; + + ret = kstrtobool(buf, &numa_demotion_enabled); + if (ret) + return ret; return count; } diff --git a/mm/swap_state.c b/mm/swap_state.c index d41746a572a2..27c4e28f795f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -874,12 +874,11 @@ static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) - enable_vma_readahead = true; - else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) - enable_vma_readahead = false; - else - return -EINVAL; + ssize_t ret; + + ret = kstrtobool(buf, &enable_vma_readahead); + if (ret) + return ret; return count; } -- cgit v1.2.3 From 048f6e1a427ee9cddf62f9b3766372c69846fa4f Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 12 May 2022 20:22:59 -0700 Subject: mm/vmscan: not necessary to re-init the list for each iteration node_page_list is defined with LIST_HEAD and be cleaned until list_empty. So it is not necessary to re-init it again. 
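A sketch of why the re-initialization is redundant (illustration only, not the patch): draining a list head back to empty leaves it in exactly the state that LIST_HEAD()/INIT_LIST_HEAD() produce.

	LIST_HEAD(node_page_list);		/* next == prev == &node_page_list */

	/* ... entries are moved onto the list, then drained until list_empty() ... */

	/*
	 * At this point the head again satisfies next == prev == &node_page_list,
	 * so a second INIT_LIST_HEAD(&node_page_list) would be a no-op.
	 */
	INIT_LIST_HEAD(&node_page_list);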
[akpm@linux-foundation.org: remove unneeded braces] Link: https://lkml.kernel.org/r/20220426021743.21007-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5ac0a71dc0df..726f5ce366da 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2553,10 +2553,8 @@ unsigned long reclaim_pages(struct list_head *page_list) while (!list_empty(page_list)) { page = lru_to_page(page_list); - if (nid == NUMA_NO_NODE) { + if (nid == NUMA_NO_NODE) nid = page_to_nid(page); - INIT_LIST_HEAD(&node_page_list); - } if (nid == page_to_nid(page)) { ClearPageActive(page); -- cgit v1.2.3 From 32a331a72f3eec30f65fd929aeb4dfc514eca28f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:22:59 -0700 Subject: mm/vmscan: add a comment about MADV_FREE pages check in folio_check_dirty_writeback Patch series "A few cleanup and fixup patches for vmscan This series contains a few patches to remove obsolete comment, introduce helper to remove duplicated code and so no. Also we take all base pages of THP into account in rare race condition. More details can be found in the respective changelogs. This patch (of 6): The MADV_FREE pages check in folio_check_dirty_writeback is a bit hard to follow. Add a comment to make the code clear. Link: https://lkml.kernel.org/r/20220425111232.23182-2-linmiaohe@huawei.com Suggested-by: Huang, Ying Signed-off-by: Miaohe Lin Reviewed-by: Christoph Hellwig Reviewed-by: Oscar Salvador Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/vmscan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 726f5ce366da..4ea23807d23f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1435,7 +1435,10 @@ static void folio_check_dirty_writeback(struct folio *folio, /* * Anonymous pages are not handled by flushers and must be written - * from reclaim context. Do not stall reclaim based on them + * from reclaim context. Do not stall reclaim based on them. + * MADV_FREE anonymous pages are put into inactive file list too. + * They could be mistakenly treated as file lru. So further anon + * test is needed. */ if (!folio_is_file_lru(folio) || (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { -- cgit v1.2.3 From 1fe47c0beb2df2739b07b8e051425b6400abce5b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:22:59 -0700 Subject: mm/vmscan: introduce helper function reclaim_page_list() Introduce helper function reclaim_page_list() to eliminate the duplicated code of doing shrink_page_list() and putback_lru_page. Also we can separate node reclaim from node page list operation this way. No functional change intended. 
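Spelled out as a sketch (treat_as_anon is a made-up name for illustration; the real check is in the hunk below): an MADV_FREE folio is folio_test_anon() && !folio_test_swapbacked(), yet it lives on the inactive *file* LRU, so folio_is_file_lru() alone cannot tell it apart from a real file folio.

	bool treat_as_anon = !folio_is_file_lru(folio) ||
			     (folio_test_anon(folio) && !folio_test_swapbacked(folio));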
Link: https://lkml.kernel.org/r/20220425111232.23182-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Christoph Hellwig Cc: Huang, Ying Cc: Joonsoo Kim Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/vmscan.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4ea23807d23f..68d4004e0397 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2536,14 +2536,12 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } -unsigned long reclaim_pages(struct list_head *page_list) +static unsigned int reclaim_page_list(struct list_head *page_list, + struct pglist_data *pgdat) { - int nid = NUMA_NO_NODE; - unsigned int nr_reclaimed = 0; - LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; - struct page *page; - unsigned int noreclaim_flag; + unsigned int nr_reclaimed; + struct folio *folio; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_writepage = 1, @@ -2552,6 +2550,24 @@ unsigned long reclaim_pages(struct list_head *page_list) .no_demotion = 1, }; + nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false); + while (!list_empty(page_list)) { + folio = lru_to_folio(page_list); + list_del(&folio->lru); + folio_putback_lru(folio); + } + + return nr_reclaimed; +} + +unsigned long reclaim_pages(struct list_head *page_list) +{ + int nid = NUMA_NO_NODE; + unsigned int nr_reclaimed = 0; + LIST_HEAD(node_page_list); + struct page *page; + unsigned int noreclaim_flag; + noreclaim_flag = memalloc_noreclaim_save(); while (!list_empty(page_list)) { @@ -2565,28 +2581,12 @@ unsigned long reclaim_pages(struct list_head *page_list) continue; } - nr_reclaimed += shrink_page_list(&node_page_list, - NODE_DATA(nid), - &sc, &dummy_stat, false); - while (!list_empty(&node_page_list)) { - page = lru_to_page(&node_page_list); - list_del(&page->lru); - putback_lru_page(page); - } - + nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); nid = NUMA_NO_NODE; } - if (!list_empty(&node_page_list)) { - nr_reclaimed += shrink_page_list(&node_page_list, - NODE_DATA(nid), - &sc, &dummy_stat, false); - while (!list_empty(&node_page_list)) { - page = lru_to_page(&node_page_list); - list_del(&page->lru); - putback_lru_page(page); - } - } + if (!list_empty(&node_page_list)) + nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); -- cgit v1.2.3 From 9aafcffc18785fcdd9295640eb2ed927960b31a1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/vmscan: take all base pages of THP into account when race with speculative reference If the page has buffers, shrink_page_list will try to free the buffer mappings associated with the page and try to free the page as well. In the rare race with speculative reference, the page will be freed shortly by speculative reference. But nr_reclaimed is not incremented correctly when we come across the THP. We need to account all the base pages in this case. 
Link: https://lkml.kernel.org/r/20220425111232.23182-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Christoph Hellwig Cc: Huang, Ying Cc: Joonsoo Kim Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 68d4004e0397..61688f4a9d1e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1884,7 +1884,7 @@ retry: * increment nr_reclaimed here (and * leave it off the LRU). */ - nr_reclaimed++; + nr_reclaimed += nr_pages; continue; } } -- cgit v1.2.3 From 4355e4b265ccb55dd6625b82f0e2016f42f2956c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/vmscan: remove obsolete comment in kswapd_run Since commit 6b700b5b3c59 ("mm/vmscan.c: remove cpu online notification for now"), cpu online notification is removed. So kswapd won't move to proper cpus if cpus are hot-added. Remove this obsolete comment. Link: https://lkml.kernel.org/r/20220425111232.23182-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Christoph Hellwig Cc: Huang, Ying Cc: Joonsoo Kim Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/vmscan.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 61688f4a9d1e..5c20efe0c82a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4567,7 +4567,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) /* * This kswapd start function will be called by init and node-hot-add. - * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ void kswapd_run(int nid) { -- cgit v1.2.3 From f19a27e399c4354b91d608dd77f33877f613224a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/vmscan: use helper folio_is_file_lru() Use helper folio_is_file_lru() to check whether folio is file lru. Minor readability improvement. [linmiaohe@huawei.com: use folio_is_file_lru()] Link: https://lkml.kernel.org/r/20220428105802.21389-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220425111232.23182-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Christoph Hellwig Cc: Huang, Ying Cc: Joonsoo Kim Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5c20efe0c82a..9ff3cc2a941a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1414,14 +1414,14 @@ static enum page_references folio_check_references(struct folio *folio, /* * Activate file-backed executable folios after first usage. */ - if ((vm_flags & VM_EXEC) && !folio_test_swapbacked(folio)) + if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; } /* Reclaim if clean, defer dirty folios to writeback */ - if (referenced_folio && !folio_test_swapbacked(folio)) + if (referenced_folio && folio_is_file_lru(folio)) return PAGEREF_RECLAIM_CLEAN; return PAGEREF_RECLAIM; -- cgit v1.2.3 From 1ae65e2749b0a3d236fc17d0eca5ea5f6f2c0032 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/vmscan: filter empty page_list at the beginning node_page_list would always be !empty on finishing the loop, except page_list is empty. Let's handle empty page_list before doing any real work including touching PF_MEMALLOC flag. 
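For illustration, the control flow this patch aims for looks roughly like the userspace sketch below: return early on an empty list so the expensive flag save/restore is never touched. The helpers here are invented stand-ins for memalloc_noreclaim_save()/restore(), not the real API.

#include <stdbool.h>
#include <stddef.h>

struct list_like { struct node *head; };

static bool list_is_empty(const struct list_like *l) { return l->head == NULL; }

/* invented stand-ins for memalloc_noreclaim_save()/restore() */
static unsigned int enter_noreclaim(void) { return 1; }
static void leave_noreclaim(unsigned int flag) { (void)flag; }

unsigned long process_list(struct list_like *l)
{
	unsigned long done = 0;
	unsigned int flag;

	if (list_is_empty(l))
		return 0;	/* nothing to do: never touch the task flag */

	flag = enter_noreclaim();
	do {
		l->head = NULL;	/* pretend one pass drains the whole list */
		done++;
	} while (!list_is_empty(l));
	leave_noreclaim(flag);

	return done;
}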
Link: https://lkml.kernel.org/r/20220429014426.29223-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/vmscan.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ff3cc2a941a..43883ff89c1a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2568,9 +2568,12 @@ unsigned long reclaim_pages(struct list_head *page_list) struct page *page; unsigned int noreclaim_flag; + if (list_empty(page_list)) + return nr_reclaimed; + noreclaim_flag = memalloc_noreclaim_save(); - while (!list_empty(page_list)) { + do { page = lru_to_page(page_list); if (nid == NUMA_NO_NODE) nid = page_to_nid(page); @@ -2583,10 +2586,9 @@ unsigned long reclaim_pages(struct list_head *page_list) nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); nid = NUMA_NO_NODE; - } + } while (!list_empty(page_list)); - if (!list_empty(&node_page_list)) - nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); -- cgit v1.2.3 From ed657e5568c5fc877c517b654d65fbdaeb628539 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/vmscan: don't use NUMA_NO_NODE as indicator of page on different node Now we are sure there is at least one page on page_list, so it is safe to get the nid of it. This means it is not necessary to use NUMA_NO_NODE as an indicator for the beginning of iteration or a page on different node. Link: https://lkml.kernel.org/r/20220429014426.29223-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/vmscan.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 43883ff89c1a..9cfce1449af4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2562,7 +2562,7 @@ static unsigned int reclaim_page_list(struct list_head *page_list, unsigned long reclaim_pages(struct list_head *page_list) { - int nid = NUMA_NO_NODE; + int nid; unsigned int nr_reclaimed = 0; LIST_HEAD(node_page_list); struct page *page; @@ -2573,10 +2573,9 @@ unsigned long reclaim_pages(struct list_head *page_list) noreclaim_flag = memalloc_noreclaim_save(); + nid = page_to_nid(lru_to_page(page_list)); do { page = lru_to_page(page_list); - if (nid == NUMA_NO_NODE) - nid = page_to_nid(page); if (nid == page_to_nid(page)) { ClearPageActive(page); @@ -2585,7 +2584,7 @@ unsigned long reclaim_pages(struct list_head *page_list) } nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); - nid = NUMA_NO_NODE; + nid = page_to_nid(lru_to_page(page_list)); } while (!list_empty(page_list)); nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); -- cgit v1.2.3 From 54943a1a4d2a3fed23d31e96a91aa9d18ea74500 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/shmem: remove duplicate include in memory.c Fix following checkincludes.pl warning: mm/memory.c: linux/mm_inline.h is included more than once. The include is in line 44. Remove the duplicated here. 
Link: https://lkml.kernel.org/r/20220427064717.803019-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Signed-off-by: Andrew Morton --- mm/memory.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index f4161fb07ffa..f4b2c05707b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -74,7 +74,6 @@ #include #include #include -#include #include -- cgit v1.2.3 From dfe98499ef288ac730662a70110bcadaa67c0ee0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:01 -0700 Subject: shmem: convert shmem_alloc_hugepage() to use vma_alloc_folio() Patch series "Folio patches for 5.19", v2. This patch (of 26): For now, return the head page of the folio, but remove use of the old alloc_pages_vma() API. Link: https://lkml.kernel.org/r/20220504182857.4013401-1-willy@infradead.org Link: https://lkml.kernel.org/r/20220504182857.4013401-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- mm/shmem.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4a4a2e628289..7994421588a3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1528,7 +1528,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, struct vm_area_struct pvma; struct address_space *mapping = info->vfs_inode.i_mapping; pgoff_t hindex; - struct page *page; + struct folio *folio; hindex = round_down(index, HPAGE_PMD_NR); if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, @@ -1536,13 +1536,11 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); - if (page) - prep_transhuge_page(page); - else + if (!folio) count_vm_event(THP_FILE_FALLBACK); - return page; + return &folio->page; } static struct page *shmem_alloc_page(gfp_t gfp, -- cgit v1.2.3 From cb196ee1ef390bea17eb9dc81039abad9b3627ca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:01 -0700 Subject: mm/huge_memory: convert do_huge_pmd_anonymous_page() to use vma_alloc_folio() Remove the use of this old API, eliminating a call to prep_transhuge_page(). 
Link: https://lkml.kernel.org/r/20220504182857.4013401-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/huge_memory.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6f37f77eb48c..c37c870c526f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -726,7 +726,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; gfp_t gfp; - struct page *page; + struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; if (!transhuge_vma_suitable(vma, haddr)) @@ -775,13 +775,12 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } gfp = vma_thp_gfp_mask(vma); - page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); - if (unlikely(!page)) { + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(vmf, page, gfp); + return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.2.3 From f9c668d281aa20e38c9bda3b7b0adeb8891aa15e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:01 -0700 Subject: alpha: fix alloc_zeroed_user_highpage_movable() Due to a typo, the final argument to alloc_page_vma() didn't refer to a real variable. This only affected CONFIG_NUMA, which was marked BROKEN in 2006 and removed from alpha in 2021. Found due to a refactoring patch. Link: https://lkml.kernel.org/r/20220504182857.4013401-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reported-by: kernel test robot Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- arch/alpha/include/asm/page.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 18f48a6f2ff6..8f3f5eecba28 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -18,7 +18,7 @@ extern void clear_page(void *page); #define clear_user_page(page, vaddr, pg) clear_page(page) #define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vmaddr) + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE extern void copy_page(void * _to, void * _from); -- cgit v1.2.3 From adf88aa8ea7ff143825a2a8a7193f92e0e6fc79b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:01 -0700 Subject: mm: remove alloc_pages_vma() All callers have now been converted to use vma_alloc_folio(), so convert the body of alloc_pages_vma() to allocate folios instead. 
Link: https://lkml.kernel.org/r/20220504182857.4013401-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/gfp.h | 18 +++++++----------- mm/mempolicy.c | 51 +++++++++++++++++++++++++-------------------------- 2 files changed, 32 insertions(+), 37 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3e3d36fc2109..2a08a3c4ba95 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -613,13 +613,8 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages(gfp_t gfp, unsigned int order); struct folio *folio_alloc(gfp_t gfp, unsigned order); -struct page *alloc_pages_vma(gfp_t gfp_mask, int order, - struct vm_area_struct *vma, unsigned long addr, - bool hugepage); struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, true) #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -629,16 +624,17 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } -#define alloc_pages_vma(gfp_mask, order, vma, addr, hugepage) \ - alloc_pages(gfp_mask, order) #define vma_alloc_folio(gfp, order, vma, addr, hugepage) \ folio_alloc(gfp, order) -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -#define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, false) +static inline struct page *alloc_page_vma(gfp_t gfp, + struct vm_area_struct *vma, unsigned long addr) +{ + struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false); + + return &folio->page; +} extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3934476fb708..ed055e34d20c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2136,44 +2136,55 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, } /** - * alloc_pages_vma - Allocate a page for a VMA. + * vma_alloc_folio - Allocate a folio for a VMA. * @gfp: GFP flags. - * @order: Order of the GFP allocation. + * @order: Order of the folio. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual address of the allocation. Must be inside @vma. * @hugepage: For hugepages try only the preferred node if possible. * - * Allocate a page for a specific address in @vma, using the appropriate + * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock * of the mm_struct of the VMA to prevent it from going away. Should be - * used for all allocations for pages that will be mapped into user space. + * used for all allocations for folios that will be mapped into user space. * - * Return: The page on success or NULL if allocation fails. + * Return: The folio on success or NULL if allocation fails. 
*/ -struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, +struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage) { struct mempolicy *pol; int node = numa_node_id(); - struct page *page; + struct folio *folio; int preferred_nid; nodemask_t *nmask; pol = get_vma_policy(vma, addr); if (pol->mode == MPOL_INTERLEAVE) { + struct page *page; unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); + gfp |= __GFP_COMP; page = alloc_page_interleave(gfp, order, nid); + if (page && order > 1) + prep_transhuge_page(page); + folio = (struct folio *)page; goto out; } if (pol->mode == MPOL_PREFERRED_MANY) { + struct page *page; + node = policy_node(gfp, pol, node); + gfp |= __GFP_COMP; page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); + if (page && order > 1) + prep_transhuge_page(page); + folio = (struct folio *)page; goto out; } @@ -2200,8 +2211,8 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - page = __alloc_pages_node(hpage_node, - gfp | __GFP_THISNODE | __GFP_NORETRY, order); + folio = __folio_alloc_node(gfp | __GFP_THISNODE | + __GFP_NORETRY, order, hpage_node); /* * If hugepage allocations are configured to always @@ -2209,8 +2220,9 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ - if (!page && (gfp & __GFP_DIRECT_RECLAIM)) - page = __alloc_pages(gfp, order, hpage_node, nmask); + if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) + folio = __folio_alloc(gfp, order, hpage_node, + nmask); goto out; } @@ -2218,25 +2230,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); - page = __alloc_pages(gfp, order, preferred_nid, nmask); + folio = __folio_alloc(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: - return page; -} -EXPORT_SYMBOL(alloc_pages_vma); - -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) -{ - struct folio *folio; - - folio = (struct folio *)alloc_pages_vma(gfp, order, vma, addr, - hugepage); - if (folio && order > 1) - prep_transhuge_page(&folio->page); - return folio; } +EXPORT_SYMBOL(vma_alloc_folio); /** * alloc_pages - Allocate pages. -- cgit v1.2.3 From 1bee2c1677bcb57dadd374cafb59be86ad1a1a82 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:01 -0700 Subject: vmscan: use folio_mapped() in shrink_page_list() Remove some legacy function calls. 
Link: https://lkml.kernel.org/r/20220504182857.4013401-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9cfce1449af4..e25bca89c3c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1572,7 +1572,7 @@ retry: if (unlikely(!page_evictable(page))) goto activate_locked; - if (!sc->may_unmap && page_mapped(page)) + if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; /* @@ -1761,21 +1761,21 @@ retry: } /* - * The page is mapped into the page tables of one or more + * The folio is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page)) { + if (folio_mapped(folio)) { enum ttu_flags flags = TTU_BATCH_FLUSH; - bool was_swapbacked = PageSwapBacked(page); + bool was_swapbacked = folio_test_swapbacked(folio); - if (PageTransHuge(page) && - thp_order(page) >= HPAGE_PMD_ORDER) + if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; try_to_unmap(folio, flags); - if (page_mapped(page)) { + if (folio_mapped(folio)) { stat->nr_unmap_fail += nr_pages; - if (!was_swapbacked && PageSwapBacked(page)) + if (!was_swapbacked && + folio_test_swapbacked(folio)) stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } -- cgit v1.2.3 From d33e4e1412c8b618f5f2f251ab9ddcfdf9f4adf3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:02 -0700 Subject: vmscan: convert the writeback handling in shrink_page_list() to folios Slightly more efficient due to fewer calls to compound_head(). Link: https://lkml.kernel.org/r/20220504182857.4013401-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 78 +++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index e25bca89c3c1..2837e7e3677c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1598,40 +1598,42 @@ retry: stat->nr_congested += nr_pages; /* - * If a page at the tail of the LRU is under writeback, there + * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * - * 1) If reclaim is encountering an excessive number of pages - * under writeback and this page is both under writeback and - * PageReclaim then it indicates that pages are being queued - * for IO but are being recycled through the LRU before the - * IO can complete. Waiting on the page itself risks an - * indefinite stall if it is impossible to writeback the - * page due to IO error or disconnected storage so instead - * note that the LRU is being scanned too quickly and the - * caller can stall after page list has been processed. + * 1) If reclaim is encountering an excessive number of folios + * under writeback and this folio is both under + * writeback and has the reclaim flag set then it + * indicates that folios are being queued for I/O but + * are being recycled through the LRU before the I/O + * can complete. Waiting on the folio itself risks an + * indefinite stall if it is impossible to writeback + * the folio due to I/O error or disconnected storage + * so instead note that the LRU is being scanned too + * quickly and the caller can stall after the folio + * list has been processed. 
* - * 2) Global or new memcg reclaim encounters a page that is + * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, - * not to fs). In this case mark the page for immediate + * not to fs). In this case mark the folio for immediate * reclaim and continue scanning. * * Require may_enter_fs() because we would wait on fs, which - * may not have submitted IO yet. And the loop driver might - * enter reclaim, and deadlock if it waits on a page for + * may not have submitted I/O yet. And the loop driver might + * enter reclaim, and deadlock if it waits on a folio for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * - * 3) Legacy memcg encounters a page that is already marked - * PageReclaim. memcg does not have any dirty pages + * 3) Legacy memcg encounters a folio that already has the + * reclaim flag set. memcg does not have any dirty folio * throttling so we could easily OOM just because too many - * pages are in writeback and there is nothing else to + * folios are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * - * In cases 1) and 2) we activate the pages to get them out of - * the way while we continue scanning for clean pages on the + * In cases 1) and 2) we activate the folios to get them out of + * the way while we continue scanning for clean folios on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. @@ -1639,38 +1641,42 @@ retry: * memory pressure on the cache working set any longer than it * takes to write them to disk. */ - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { /* Case 1 above */ if (current_is_kswapd() && - PageReclaim(page) && + folio_test_reclaim(folio) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { stat->nr_immediate += nr_pages; goto activate_locked; /* Case 2 above */ } else if (writeback_throttling_sane(sc) || - !PageReclaim(page) || !may_enter_fs(page, sc->gfp_mask)) { + !folio_test_reclaim(folio) || + !may_enter_fs(page, sc->gfp_mask)) { /* - * This is slightly racy - end_page_writeback() - * might have just cleared PageReclaim, then - * setting PageReclaim here end up interpreted - * as PageReadahead - but that does not matter - * enough to care. What we do want is for this - * page to have PageReclaim set next time memcg - * reclaim reaches the tests above, so it will - * then wait_on_page_writeback() to avoid OOM; - * and it's also appropriate in global reclaim. + * This is slightly racy - + * folio_end_writeback() might have just + * cleared the reclaim flag, then setting + * reclaim here ends up interpreted as + * the readahead flag - but that does + * not matter enough to care. What we + * do want is for this folio to have + * the reclaim flag set next time memcg + * reclaim reaches the tests above, so + * it will then folio_wait_writeback() + * to avoid OOM; and it's also appropriate + * in global reclaim. 
*/ - SetPageReclaim(page); + folio_set_reclaim(folio); stat->nr_writeback += nr_pages; goto activate_locked; /* Case 3 above */ } else { - unlock_page(page); - wait_on_page_writeback(page); - /* then go back and try same page again */ - list_add_tail(&page->lru, page_list); + folio_unlock(folio); + folio_wait_writeback(folio); + /* then go back and try same folio again */ + list_add_tail(&folio->lru, page_list); continue; } } -- cgit v1.2.3 From e2e3fdc7d4afdb8e7ba981eba7827993f2d390a8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:02 -0700 Subject: swap: turn get_swap_page() into folio_alloc_swap() This removes an assumption that a large folio is HPAGE_PMD_NR pages in size. Link: https://lkml.kernel.org/r/20220504182857.4013401-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/swap.h | 13 +++++++------ mm/memcontrol.c | 16 ++++++++-------- mm/shmem.c | 3 ++- mm/swap_slots.c | 14 +++++++------- mm/swap_state.c | 3 ++- mm/swapfile.c | 17 +++++++++-------- 6 files changed, 35 insertions(+), 31 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 5553189d0215..252c23a6667f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -470,7 +470,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -extern swp_entry_t get_swap_page(struct page *page); +swp_entry_t folio_alloc_swap(struct folio *folio); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); @@ -583,7 +583,7 @@ static inline int try_to_free_swap(struct page *page) return 0; } -static inline swp_entry_t get_swap_page(struct page *page) +static inline swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; entry.val = 0; @@ -643,12 +643,13 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) #ifdef CONFIG_MEMCG_SWAP void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); -extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); +static inline int mem_cgroup_try_charge_swap(struct folio *folio, + swp_entry_t entry) { if (mem_cgroup_disabled()) return 0; - return __mem_cgroup_try_charge_swap(page, entry); + return __mem_cgroup_try_charge_swap(folio, entry); } extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); @@ -666,7 +667,7 @@ static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { } -static inline int mem_cgroup_try_charge_swap(struct page *page, +static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6e74ca98c862..e1b5823ac060 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7162,17 +7162,17 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) } /** - * __mem_cgroup_try_charge_swap - try charging swap space for a page - * @page: page being added to swap + * __mem_cgroup_try_charge_swap - try charging swap space for a folio + * @folio: folio being added to swap * @entry: swap entry to charge * - * Try to charge @page's memcg for the swap space at @entry. 
+ * Try to charge @folio's memcg for the swap space at @entry. * * Returns 0 on success, -ENOMEM on failure. */ -int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { - unsigned int nr_pages = thp_nr_pages(page); + unsigned int nr_pages = folio_nr_pages(folio); struct page_counter *counter; struct mem_cgroup *memcg; unsigned short oldid; @@ -7180,9 +7180,9 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; - memcg = page_memcg(page); + memcg = folio_memcg(folio); - VM_WARN_ON_ONCE_PAGE(!memcg, page); + VM_WARN_ON_ONCE_FOLIO(!memcg, folio); if (!memcg) return 0; @@ -7205,7 +7205,7 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (nr_pages > 1) mem_cgroup_id_get_many(memcg, nr_pages - 1); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); - VM_BUG_ON_PAGE(oldid, page); + VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); return 0; diff --git a/mm/shmem.c b/mm/shmem.c index 7994421588a3..7cfe76347f63 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1313,6 +1313,7 @@ int shmem_unuse(unsigned int type) */ static int shmem_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); struct shmem_inode_info *info; struct address_space *mapping; struct inode *inode; @@ -1386,7 +1387,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) SetPageUptodate(page); } - swap = get_swap_page(page); + swap = folio_alloc_swap(folio); if (!swap.val) goto redirty; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 2b5531840583..0218ec1cd24c 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -117,7 +117,7 @@ static int alloc_swap_slot_cache(unsigned int cpu) /* * Do allocation outside swap_slots_cache_mutex - * as kvzalloc could trigger reclaim and get_swap_page, + * as kvzalloc could trigger reclaim and folio_alloc_swap, * which can lock swap_slots_cache_mutex. 
*/ slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), @@ -213,7 +213,7 @@ static void __drain_swap_slots_cache(unsigned int type) * this function can be invoked in the cpu * hot plug path: * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback - * -> memory allocation -> direct reclaim -> get_swap_page + * -> memory allocation -> direct reclaim -> folio_alloc_swap * -> drain_swap_slots_cache * * Hence the loop over current online cpu below could miss cpu that @@ -301,16 +301,16 @@ direct_free: return 0; } -swp_entry_t get_swap_page(struct page *page) +swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; struct swap_slots_cache *cache; entry.val = 0; - if (PageTransHuge(page)) { + if (folio_test_large(folio)) { if (IS_ENABLED(CONFIG_THP_SWAP)) - get_swap_pages(1, &entry, HPAGE_PMD_NR); + get_swap_pages(1, &entry, folio_nr_pages(folio)); goto out; } @@ -344,8 +344,8 @@ repeat: get_swap_pages(1, &entry, 1); out: - if (mem_cgroup_try_charge_swap(page, entry)) { - put_swap_page(page, entry); + if (mem_cgroup_try_charge_swap(folio, entry)) { + put_swap_page(&folio->page, entry); entry.val = 0; } return entry; diff --git a/mm/swap_state.c b/mm/swap_state.c index 27c4e28f795f..2dc0d02f8d79 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -184,13 +184,14 @@ void __delete_from_swap_cache(struct page *page, */ int add_to_swap(struct page *page) { + struct folio *folio = page_folio(page); swp_entry_t entry; int err; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); - entry = get_swap_page(page); + entry = folio_alloc_swap(folio); if (!entry.val) return 0; diff --git a/mm/swapfile.c b/mm/swapfile.c index 9398e915b36b..bfa41bf5e190 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -77,9 +77,9 @@ static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs * protected with swap_avail_lock, ordered by priority. - * This is used by get_swap_page() instead of swap_active_head + * This is used by folio_alloc_swap() instead of swap_active_head * because swap_active_head includes all swap_info_structs, - * but get_swap_page() doesn't need to look at full ones. + * but folio_alloc_swap() doesn't need to look at full ones. * This uses its own lock instead of swap_lock because when a * swap_info_struct changes between not-full/full, it needs to * add/remove itself to/from this list, but the swap_info_struct->lock @@ -2109,11 +2109,12 @@ retry: * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. * - * Limit the number of retries? No: when mmget_not_zero() above fails, - * that mm is likely to be freeing swap from exit_mmap(), which proceeds - * at its own independent pace; and even shmem_writepage() could have - * been preempted after get_swap_page(), temporarily hiding that swap. - * It's easy and robust (though cpu-intensive) just to keep retrying. + * Limit the number of retries? No: when mmget_not_zero() + * above fails, that mm is likely to be freeing swap from + * exit_mmap(), which proceeds at its own independent pace; + * and even shmem_writepage() could have been preempted after + * folio_alloc_swap(), temporarily hiding that swap. It's easy + * and robust (though cpu-intensive) just to keep retrying. */ if (READ_ONCE(si->inuse_pages)) { if (!signal_pending(current)) @@ -2327,7 +2328,7 @@ static void _enable_swap_info(struct swap_info_struct *p) * which on removal of any swap_info_struct with an auto-assigned * (i.e. 
negative) priority increments the auto-assigned priority * of any lower-priority swap_info_structs. - * swap_avail_head needs to be priority ordered for get_swap_page(), + * swap_avail_head needs to be priority ordered for folio_alloc_swap(), * which allocates swap pages from the highest available priority * swap_info_struct. */ -- cgit v1.2.3 From 09c02e56327bdaf9cdbb2742a35fb8c6a6f9a6c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:02 -0700 Subject: swap: convert add_to_swap() to take a folio The only caller already has a folio available, so this saves a conversion. Also convert the return type to boolean. Link: https://lkml.kernel.org/r/20220504182857.4013401-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/swap.h | 6 +++--- mm/swap_state.c | 47 +++++++++++++++++++++++++---------------------- mm/vmscan.c | 6 +++--- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index a6da8f612904..0193797b0c92 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -32,7 +32,7 @@ extern struct address_space *swapper_spaces[]; >> SWAP_ADDRESS_SPACE_SHIFT]) void show_swap_cache_info(void); -int add_to_swap(struct page *page); +bool add_to_swap(struct folio *folio); void *get_shadow_from_swap_cache(swp_entry_t entry); int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp, void **shadowp); @@ -119,9 +119,9 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) return find_get_page(mapping, index); } -static inline int add_to_swap(struct page *page) +static inline bool add_to_swap(struct folio *folio) { - return 0; + return false; } static inline void *get_shadow_from_swap_cache(swp_entry_t entry) diff --git a/mm/swap_state.c b/mm/swap_state.c index 2dc0d02f8d79..416aaaa8a7ed 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -176,24 +176,26 @@ void __delete_from_swap_cache(struct page *page, } /** - * add_to_swap - allocate swap space for a page - * @page: page we want to move to swap + * add_to_swap - allocate swap space for a folio + * @folio: folio we want to move to swap * - * Allocate swap space for the page and add the page to the - * swap cache. Caller needs to hold the page lock. + * Allocate swap space for the folio and add the folio to the + * swap cache. + * + * Context: Caller needs to hold the folio lock. + * Return: Whether the folio was added to the swap cache. */ -int add_to_swap(struct page *page) +bool add_to_swap(struct folio *folio) { - struct folio *folio = page_folio(page); swp_entry_t entry; int err; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageUptodate(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); entry = folio_alloc_swap(folio); if (!entry.val) - return 0; + return false; /* * XArray node allocations from PF_MEMALLOC contexts could @@ -206,7 +208,7 @@ int add_to_swap(struct page *page) /* * Add it to the swap cache. */ - err = add_to_swap_cache(page, entry, + err = add_to_swap_cache(&folio->page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); if (err) /* @@ -215,22 +217,23 @@ int add_to_swap(struct page *page) */ goto fail; /* - * Normally the page will be dirtied in unmap because its pte should be - * dirty. A special case is MADV_FREE page. 
The page's pte could have - * dirty bit cleared but the page's SwapBacked bit is still set because - * clearing the dirty bit and SwapBacked bit has no lock protected. For - * such page, unmap will not set dirty bit for it, so page reclaim will - * not write the page out. This can cause data corruption when the page - * is swap in later. Always setting the dirty bit for the page solves - * the problem. + * Normally the folio will be dirtied in unmap because its + * pte should be dirty. A special case is MADV_FREE page. The + * page's pte could have dirty bit cleared but the folio's + * SwapBacked flag is still set because clearing the dirty bit + * and SwapBacked flag has no lock protected. For such folio, + * unmap will not set dirty bit for it, so folio reclaim will + * not write the folio out. This can cause data corruption when + * the folio is swapped in later. Always setting the dirty flag + * for the folio solves the problem. */ - set_page_dirty(page); + folio_mark_dirty(folio); - return 1; + return true; fail: - put_swap_page(page, entry); - return 0; + put_swap_page(&folio->page, entry); + return false; } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 2837e7e3677c..f2fc5aa38cb8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1731,8 +1731,8 @@ retry: page_list)) goto activate_locked; } - if (!add_to_swap(page)) { - if (!PageTransHuge(page)) + if (!add_to_swap(folio)) { + if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, @@ -1741,7 +1741,7 @@ retry: #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_vm_event(THP_SWPOUT_FALLBACK); #endif - if (!add_to_swap(page)) + if (!add_to_swap(folio)) goto activate_locked_split; } -- cgit v1.2.3 From 49bd2bf9679f4a5b30236546fca61e66b989ce96 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:02 -0700 Subject: vmscan: convert dirty page handling to folios Mostly this just eliminates calls to compound_head(), but NR_VMSCAN_IMMEDIATE was being incremented by 1 instead of by nr_pages. Link: https://lkml.kernel.org/r/20220504182857.4013401-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index f2fc5aa38cb8..773b88c8e7f8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1787,28 +1787,31 @@ retry: } } - if (PageDirty(page)) { + if (folio_test_dirty(folio)) { /* - * Only kswapd can writeback filesystem pages + * Only kswapd can writeback filesystem folios * to avoid risk of stack overflow. But avoid - * injecting inefficient single-page IO into + * injecting inefficient single-folio I/O into * flusher writeback as much as possible: only - * write pages when we've encountered many - * dirty pages, and when we've already scanned - * the rest of the LRU for clean pages and see - * the same dirty pages again (PageReclaim). + * write folios when we've encountered many + * dirty folios, and when we've already scanned + * the rest of the LRU for clean folios and see + * the same dirty folios again (with the reclaim + * flag set). */ - if (page_is_file_lru(page) && - (!current_is_kswapd() || !PageReclaim(page) || + if (folio_is_file_lru(folio) && + (!current_is_kswapd() || + !folio_test_reclaim(folio) || !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. 
- * Similar in principal to deactivate_page() - * except we already have the page isolated + * Similar in principle to deactivate_page() + * except we already have the folio isolated * and know it's dirty */ - inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); - SetPageReclaim(page); + node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, + nr_pages); + folio_set_reclaim(folio); goto activate_locked; } @@ -1821,8 +1824,8 @@ retry: goto keep_locked; /* - * Page is dirty. Flush the TLB if a writable entry - * potentially exists to avoid CPU writes after IO + * Folio is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after I/O * starts and then write it out here. */ try_to_unmap_flush_dirty(); @@ -1834,23 +1837,24 @@ retry: case PAGE_SUCCESS: stat->nr_pageout += nr_pages; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) goto keep; - if (PageDirty(page)) + if (folio_test_dirty(folio)) goto keep; /* * A synchronous write - probably a ramdisk. Go - * ahead and try to reclaim the page. + * ahead and try to reclaim the folio. */ - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto keep; - if (PageDirty(page) || PageWriteback(page)) + if (folio_test_dirty(folio) || + folio_test_writeback(folio)) goto keep_locked; - mapping = page_mapping(page); + mapping = folio_mapping(folio); fallthrough; case PAGE_CLEAN: - ; /* try to free the page below */ + ; /* try to free the folio below */ } } -- cgit v1.2.3 From 0a36111c8c20b2edc7c83f084bdba2be9d42c1e9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:02 -0700 Subject: vmscan: convert page buffer handling to use folios This mostly just removes calls to compound_head() although nr_reclaimed should be incremented by the number of pages, not just 1. Link: https://lkml.kernel.org/r/20220504182857.4013401-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 773b88c8e7f8..00b985caf40d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1859,38 +1859,40 @@ retry: } /* - * If the page has buffers, try to free the buffer mappings - * associated with this page. If we succeed we try to free - * the page as well. + * If the folio has buffers, try to free the buffer + * mappings associated with this folio. If we succeed + * we try to free the folio as well. * - * We do this even if the page is PageDirty(). - * try_to_release_page() does not perform I/O, but it is - * possible for a page to have PageDirty set, but it is actually - * clean (all its buffers are clean). This happens if the - * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. - * try_to_release_page() will discover that cleanness and will - * drop the buffers and mark the page clean - it can be freed. + * We do this even if the folio is dirty. + * filemap_release_folio() does not perform I/O, but it + * is possible for a folio to have the dirty flag set, + * but it is actually clean (all its buffers are clean). + * This happens if the buffers were written out directly, + * with submit_bh(). ext3 will do this, as well as + * the blockdev mapping. filemap_release_folio() will + * discover that cleanness and will drop the buffers + * and mark the folio clean - it can be freed. * - * Rarely, pages can have buffers and no ->mapping. 
These are - * the pages which were not successfully invalidated in - * truncate_cleanup_page(). We try to drop those buffers here - * and if that worked, and the page is no longer mapped into - * process address space (page_count == 1) it can be freed. - * Otherwise, leave the page on the LRU so it is swappable. + * Rarely, folios can have buffers and no ->mapping. + * These are the folios which were not successfully + * invalidated in truncate_cleanup_folio(). We try to + * drop those buffers here and if that worked, and the + * folio is no longer mapped into process address space + * (refcount == 1) it can be freed. Otherwise, leave + * the folio on the LRU so it is swappable. */ - if (page_has_private(page)) { - if (!try_to_release_page(page, sc->gfp_mask)) + if (folio_has_private(folio)) { + if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; - if (!mapping && page_count(page) == 1) { - unlock_page(page); - if (put_page_testzero(page)) + if (!mapping && folio_ref_count(folio) == 1) { + folio_unlock(folio); + if (folio_put_testzero(folio)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free - * this page shortly, so we may + * this folio shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU). */ -- cgit v1.2.3 From 64daa5d818ae3430f0785206b0af13ef528cb9ef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: vmscan: convert lazy freeing to folios Remove a hidden call to compound_head(), and account nr_pages instead of a single page. This matches the code in lru_lazyfree_fn() that accounts nr_pages to PGLAZYFREE. Link: https://lkml.kernel.org/r/20220504182857.4013401-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 ++++++++++++++ mm/vmscan.c | 18 +++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fe580cb96683..8ea4b541c31e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1057,6 +1057,15 @@ static inline void count_memcg_page_event(struct page *page, count_memcg_events(memcg, idx, 1); } +static inline void count_memcg_folio_events(struct folio *folio, + enum vm_event_item idx, unsigned long nr) +{ + struct mem_cgroup *memcg = folio_memcg(folio); + + if (memcg) + count_memcg_events(memcg, idx, nr); +} + static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { @@ -1494,6 +1503,11 @@ static inline void count_memcg_page_event(struct page *page, { } +static inline void count_memcg_folio_events(struct folio *folio, + enum vm_event_item idx, unsigned long nr) +{ +} + static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 00b985caf40d..972557e2fb92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1902,20 +1902,20 @@ retry: } } - if (PageAnon(page) && !PageSwapBacked(page)) { + if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { /* follow __remove_mapping for reference */ - if (!page_ref_freeze(page, 1)) + if (!folio_ref_freeze(folio, 1)) goto keep_locked; /* - * The page has only one reference left, which is + * The folio has only one reference left, which is * from the isolation. After the caller puts the - * page back on lru and drops the reference, the - * page will be freed anyway. 
It doesn't matter - * which lru it goes. So we don't bother checking - * PageDirty here. + * folio back on the lru and drops the reference, the + * folio will be freed anyway. It doesn't matter + * which lru it goes on. So we don't bother checking + * the dirty flag here. */ - count_vm_event(PGLAZYFREED); - count_memcg_page_event(page, PGLAZYFREED); + count_vm_events(PGLAZYFREED, nr_pages); + count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); } else if (!mapping || !__remove_mapping(mapping, folio, true, sc->target_mem_cgroup)) goto keep_locked; -- cgit v1.2.3 From 5441d4902f969293b5bfe057e26038db1a8b342e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: vmscan: move initialisation of mapping down Now that we don't interrogate the BDI for congestion, we can delay looking up the folio's mapping until we've got further through the function, reducing register pressure and saving a call to folio_mapping for folios we're adding to the swap cache. Link: https://lkml.kernel.org/r/20220504182857.4013401-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 972557e2fb92..725a291e9dad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1588,12 +1588,11 @@ retry: stat->nr_unqueued_dirty += nr_pages; /* - * Treat this page as congested if the underlying BDI is or if + * Treat this page as congested if * pages are cycling through the LRU so quickly that the * pages marked for immediate reclaim are making it to the * end of the LRU a second time. */ - mapping = page_mapping(page); if (writeback && PageReclaim(page)) stat->nr_congested += nr_pages; @@ -1744,9 +1743,6 @@ retry: if (!add_to_swap(folio)) goto activate_locked_split; } - - /* Adding to swap updated mapping */ - mapping = page_mapping(page); } } else if (PageSwapBacked(page) && PageTransHuge(page)) { /* Split shmem THP */ @@ -1787,6 +1783,7 @@ retry: } } + mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { /* * Only kswapd can writeback filesystem folios -- cgit v1.2.3 From 246b648038096c6024a812aac354d27e8da987a2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: vmscan: convert the activate_locked portion of shrink_page_list to folios This accounts the number of pages activated correctly for large folios. Link: https://lkml.kernel.org/r/20220504182857.4013401-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 725a291e9dad..0a527cebdff2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1946,15 +1946,16 @@ activate_locked_split: } activate_locked: /* Not a candidate for swapping, so reclaim swap space. 
*/ - if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || - PageMlocked(page))) - try_to_free_swap(page); - VM_BUG_ON_PAGE(PageActive(page), page); - if (!PageMlocked(page)) { - int type = page_is_file_lru(page); - SetPageActive(page); + if (folio_test_swapcache(folio) && + (mem_cgroup_swap_full(&folio->page) || + folio_test_mlocked(folio))) + try_to_free_swap(&folio->page); + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); + if (!folio_test_mlocked(folio)) { + int type = folio_is_file_lru(folio); + folio_set_active(folio); stat->nr_activate[type] += nr_pages; - count_memcg_page_event(page, PGACTIVATE); + count_memcg_folio_events(folio, PGACTIVATE, nr_pages); } keep_locked: unlock_page(page); -- cgit v1.2.3 From dc786690a6a1d9a680e6872821291ad7ca9f520d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: mm: allow can_split_folio() to be called when THP are disabled The call to can_split_folio() in vmscan is currently guarded by a test of PageTransHuge() so the BUILD_BUG() is eliminated if THP are disabled. The next patch replaces that test with folio_test_large() which may be true, even when THP are disabled. However, if THP are disabled, we cannot split, so an unconditional return of false is appropriate. Link: https://lkml.kernel.org/r/20220504182857.4013401-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9a26bd10e083..fbf36bb1be22 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -348,7 +348,6 @@ static inline void prep_transhuge_page(struct page *page) {} static inline bool can_split_folio(struct folio *folio, int *pextra_pins) { - BUILD_BUG(); return false; } static inline int -- cgit v1.2.3 From c28a0e9695b724fbaa58b1f5bbf0a03c5a79d721 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: vmscan: remove remaining uses of page in shrink_page_list These are all straightforward conversions to the folio API. Link: https://lkml.kernel.org/r/20220504182857.4013401-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 122 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 60 insertions(+), 62 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0a527cebdff2..24dbe04520cb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1507,11 +1507,11 @@ static unsigned int demote_page_list(struct list_head *demote_pages, return nr_succeeded; } -static bool may_enter_fs(struct page *page, gfp_t gfp_mask) +static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) { if (gfp_mask & __GFP_FS) return true; - if (!PageSwapCache(page) || !(gfp_mask & __GFP_IO)) + if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) return false; /* * We can "enter_fs" for swap-cache with only __GFP_IO @@ -1520,7 +1520,7 @@ static bool may_enter_fs(struct page *page, gfp_t gfp_mask) * but that will never affect SWP_FS_OPS, so the data_race * is safe. 
*/ - return !data_race(page_swap_flags(page) & SWP_FS_OPS); + return !data_race(page_swap_flags(&folio->page) & SWP_FS_OPS); } /* @@ -1547,7 +1547,6 @@ static unsigned int shrink_page_list(struct list_head *page_list, retry: while (!list_empty(page_list)) { struct address_space *mapping; - struct page *page; struct folio *folio; enum page_references references = PAGEREF_RECLAIM; bool dirty, writeback; @@ -1557,19 +1556,18 @@ retry: folio = lru_to_folio(page_list); list_del(&folio->lru); - page = &folio->page; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto keep; - VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); - /* Account the number of base pages even though THP */ + /* Account the number of base pages */ sc->nr_scanned += nr_pages; - if (unlikely(!page_evictable(page))) + if (unlikely(!folio_evictable(folio))) goto activate_locked; if (!sc->may_unmap && folio_mapped(folio)) @@ -1578,7 +1576,7 @@ retry: /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing - * pages if the tail of the LRU is all dirty unqueued pages. + * folios if the tail of the LRU is all dirty unqueued folios. */ folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback) @@ -1588,21 +1586,21 @@ retry: stat->nr_unqueued_dirty += nr_pages; /* - * Treat this page as congested if - * pages are cycling through the LRU so quickly that the - * pages marked for immediate reclaim are making it to the - * end of the LRU a second time. + * Treat this folio as congested if folios are cycling + * through the LRU so quickly that the folios marked + * for immediate reclaim are making it to the end of + * the LRU a second time. */ - if (writeback && PageReclaim(page)) + if (writeback && folio_test_reclaim(folio)) stat->nr_congested += nr_pages; /* * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * - * 1) If reclaim is encountering an excessive number of folios - * under writeback and this folio is both under - * writeback and has the reclaim flag set then it + * 1) If reclaim is encountering an excessive number + * of folios under writeback and this folio has both + * the writeback and reclaim flags set, then it * indicates that folios are being queued for I/O but * are being recycled through the LRU before the I/O * can complete. Waiting on the folio itself risks an @@ -1651,19 +1649,19 @@ retry: /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || - !may_enter_fs(page, sc->gfp_mask)) { + !may_enter_fs(folio, sc->gfp_mask)) { /* * This is slightly racy - - * folio_end_writeback() might have just - * cleared the reclaim flag, then setting - * reclaim here ends up interpreted as - * the readahead flag - but that does - * not matter enough to care. What we - * do want is for this folio to have - * the reclaim flag set next time memcg - * reclaim reaches the tests above, so - * it will then folio_wait_writeback() - * to avoid OOM; and it's also appropriate + * folio_end_writeback() might have + * just cleared the reclaim flag, then + * setting the reclaim flag here ends up + * interpreted as the readahead flag - but + * that does not matter enough to care. 
+ * What we do want is for this folio to + * have the reclaim flag set next time + * memcg reclaim reaches the tests above, + * so it will then wait for writeback to + * avoid OOM; and it's also appropriate * in global reclaim. */ folio_set_reclaim(folio); @@ -1691,37 +1689,37 @@ retry: goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: - ; /* try to reclaim the page below */ + ; /* try to reclaim the folio below */ } /* - * Before reclaiming the page, try to relocate + * Before reclaiming the folio, try to relocate * its contents to another node. */ if (do_demote_pass && - (thp_migration_supported() || !PageTransHuge(page))) { - list_add(&page->lru, &demote_pages); - unlock_page(page); + (thp_migration_supported() || !folio_test_large(folio))) { + list_add(&folio->lru, &demote_pages); + folio_unlock(folio); continue; } /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. - * Lazyfree page could be freed directly + * Lazyfree folio could be freed directly */ - if (PageAnon(page) && PageSwapBacked(page)) { - if (!PageSwapCache(page)) { + if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { + if (!folio_test_swapcache(folio)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (folio_maybe_dma_pinned(folio)) goto keep_locked; - if (PageTransHuge(page)) { - /* cannot split THP, skip it */ + if (folio_test_large(folio)) { + /* cannot split folio, skip it */ if (!can_split_folio(folio, NULL)) goto activate_locked; /* - * Split pages without a PMD map right + * Split folios without a PMD map right * away. Chances are some or all of the * tail pages can be freed without IO. */ @@ -1744,20 +1742,19 @@ retry: goto activate_locked_split; } } - } else if (PageSwapBacked(page) && PageTransHuge(page)) { - /* Split shmem THP */ + } else if (folio_test_swapbacked(folio) && + folio_test_large(folio)) { + /* Split shmem folio */ if (split_folio_to_list(folio, page_list)) goto keep_locked; } /* - * THP may get split above, need minus tail pages and update - * nr_pages to avoid accounting tail pages twice. - * - * The tail pages that are added into swap cache successfully - * reach here. + * If the folio was split above, the tail pages will make + * their own pass through this function and be accounted + * then. */ - if ((nr_pages > 1) && !PageTransHuge(page)) { + if ((nr_pages > 1) && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } @@ -1815,7 +1812,7 @@ retry: if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; - if (!may_enter_fs(page, sc->gfp_mask)) + if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked; @@ -1917,11 +1914,11 @@ retry: sc->target_mem_cgroup)) goto keep_locked; - unlock_page(page); + folio_unlock(folio); free_it: /* - * THP may get swapped out in a whole, need account - * all base pages. + * Folio may get swapped out as a whole, need to account + * all pages in it. */ nr_reclaimed += nr_pages; @@ -1929,10 +1926,10 @@ free_it: * Is there need to periodically free_page_list? 
It would * appear not as the counts should be low */ - if (unlikely(PageTransHuge(page))) - destroy_compound_page(page); + if (unlikely(folio_test_large(folio))) + destroy_compound_page(&folio->page); else - list_add(&page->lru, &free_pages); + list_add(&folio->lru, &free_pages); continue; activate_locked_split: @@ -1958,18 +1955,19 @@ activate_locked: count_memcg_folio_events(folio, PGACTIVATE, nr_pages); } keep_locked: - unlock_page(page); + folio_unlock(folio); keep: - list_add(&page->lru, &ret_pages); - VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); + list_add(&folio->lru, &ret_pages); + VM_BUG_ON_FOLIO(folio_test_lru(folio) || + folio_test_unevictable(folio), folio); } /* 'page_list' is always empty here */ - /* Migrate pages selected for demotion */ + /* Migrate folios selected for demotion */ nr_reclaimed += demote_page_list(&demote_pages, pgdat); - /* Pages that could not be demoted are still in @demote_pages */ + /* Folios that could not be demoted are still in @demote_pages */ if (!list_empty(&demote_pages)) { - /* Pages which failed to demoted go back on @page_list for retry: */ + /* Folios which weren't demoted go back on @page_list for retry: */ list_splice_init(&demote_pages, page_list); do_demote_pass = false; goto retry; -- cgit v1.2.3 From 0562457186752b6a65eeca9f5ed0fb9b1e2572e3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: mm/shmem: use a folio in shmem_unused_huge_shrink When calling split_huge_page() we usually have to find the precise page, but that's not necessary here because we only need to unlock and put the folio afterwards. Saves 231 bytes of text (20% of this function). Link: https://lkml.kernel.org/r/20220504182857.4013401-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/shmem.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7cfe76347f63..6cba5fd9b694 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -554,7 +554,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, LIST_HEAD(to_remove); struct inode *inode; struct shmem_inode_info *info; - struct page *page; + struct folio *folio; unsigned long batch = sc ? sc->nr_to_scan : 128; int split = 0; @@ -598,6 +598,7 @@ next: list_for_each_safe(pos, next, &list) { int ret; + pgoff_t index; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; @@ -605,14 +606,14 @@ next: if (nr_to_split && split >= nr_to_split) goto move_back; - page = find_get_page(inode->i_mapping, - (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); - if (!page) + index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; + folio = filemap_get_folio(inode->i_mapping, index); + if (!folio) goto drop; /* No huge page at the end of the file: nothing to split */ - if (!PageTransHuge(page)) { - put_page(page); + if (!folio_test_large(folio)) { + folio_put(folio); goto drop; } @@ -623,14 +624,14 @@ next: * Waiting for the lock may lead to deadlock in the * reclaim path. 
*/ - if (!trylock_page(page)) { - put_page(page); + if (!folio_trylock(folio)) { + folio_put(folio); goto move_back; } - ret = split_huge_page(page); - unlock_page(page); - put_page(page); + ret = split_huge_page(&folio->page); + folio_unlock(folio); + folio_put(folio); /* If split failed move the inode on the list back to shrinklist */ if (ret) -- cgit v1.2.3 From 039bc1240165c99b932fb4be2f784948d1038eea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:04 -0700 Subject: mm/swap: add folio_throttle_swaprate The only use of the page argument to cgroup_throttle_swaprate() is to get the node ID, and this will be the same for all pages in the folio, so just pass in the first page of the folio. Link: https://lkml.kernel.org/r/20220504182857.4013401-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/swap.h b/include/linux/swap.h index 252c23a6667f..6bf1506e5080 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -640,6 +640,10 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { } #endif +static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) +{ + cgroup_throttle_swaprate(&folio->page, gfp); +} #ifdef CONFIG_MEMCG_SWAP void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); -- cgit v1.2.3 From b7dd44a12cf26603712e60f1e1449c87d5f8cb79 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:04 -0700 Subject: mm/shmem: convert shmem_add_to_page_cache to take a folio Shrinks shmem_add_to_page_cache() by 16 bytes. All the callers grow, but this is temporary as they will all be converted to folios soon. Link: https://lkml.kernel.org/r/20220504182857.4013401-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/shmem.c | 57 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 6cba5fd9b694..2e800d4e0d03 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -696,36 +696,35 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* * Like add_to_page_cache_locked, but error if expected item has gone. 
*/ -static int shmem_add_to_page_cache(struct page *page, +static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t index, void *expected, gfp_t gfp, struct mm_struct *charge_mm) { - XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); - unsigned long nr = compound_nr(page); + XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); + long nr = folio_nr_pages(folio); int error; - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(index != round_down(index, nr), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - VM_BUG_ON(expected && PageTransHuge(page)); + VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); + VM_BUG_ON(expected && folio_test_large(folio)); - page_ref_add(page, nr); - page->mapping = mapping; - page->index = index; + folio_ref_add(folio, nr); + folio->mapping = mapping; + folio->index = index; - if (!PageSwapCache(page)) { - error = mem_cgroup_charge(page_folio(page), charge_mm, gfp); + if (!folio_test_swapcache(folio)) { + error = mem_cgroup_charge(folio, charge_mm, gfp); if (error) { - if (PageTransHuge(page)) { + if (folio_test_pmd_mappable(folio)) { count_vm_event(THP_FILE_FALLBACK); count_vm_event(THP_FILE_FALLBACK_CHARGE); } goto error; } } - cgroup_throttle_swaprate(page, gfp); + folio_throttle_swaprate(folio, gfp); do { xas_lock_irq(&xas); @@ -737,16 +736,16 @@ static int shmem_add_to_page_cache(struct page *page, xas_set_err(&xas, -EEXIST); goto unlock; } - xas_store(&xas, page); + xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (PageTransHuge(page)) { + if (folio_test_pmd_mappable(folio)) { count_vm_event(THP_FILE_ALLOC); - __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); } mapping->nrpages += nr; - __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); - __mod_lruvec_page_state(page, NR_SHMEM, nr); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -758,8 +757,8 @@ unlock: return 0; error: - page->mapping = NULL; - page_ref_sub(page, nr); + folio->mapping = NULL; + folio_ref_sub(folio, nr); return error; } @@ -1691,7 +1690,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? 
vma->vm_mm : NULL; - struct page *page; + struct page *page = NULL; + struct folio *folio; swp_entry_t swap; int error; @@ -1741,7 +1741,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, goto failed; } - error = shmem_add_to_page_cache(page, mapping, index, + folio = page_folio(page); + error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp, charge_mm); if (error) @@ -1792,6 +1793,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; + struct folio *folio; struct page *page; pgoff_t hindex = index; gfp_t huge_gfp; @@ -1906,7 +1908,8 @@ alloc_nohuge: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = shmem_add_to_page_cache(page, mapping, hindex, + folio = page_folio(page); + error = shmem_add_to_page_cache(folio, mapping, hindex, NULL, gfp & GFP_RECLAIM_MASK, charge_mm); if (error) @@ -2328,6 +2331,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); void *page_kaddr; + struct folio *folio; struct page *page; int ret; pgoff_t max_off; @@ -2386,7 +2390,8 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(pgoff >= max_off)) goto out_release; - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, + folio = page_folio(page); + ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) goto out_release; -- cgit v1.2.3 From 069d849cde3a02c075548ac68a43ed97fc95ee34 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:04 -0700 Subject: mm/shmem: turn shmem_should_replace_page into shmem_should_replace_folio This is a straightforward conversion. Link: https://lkml.kernel.org/r/20220504182857.4013401-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/shmem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 2e800d4e0d03..d11dc37d332f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1601,9 +1601,9 @@ failed: * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); * but for now it is a simple matter of zone. */ -static bool shmem_should_replace_page(struct page *page, gfp_t gfp) +static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) { - return page_zonenum(page) > gfp_zone(gfp); + return folio_zonenum(folio) > gfp_zone(gfp); } static int shmem_replace_page(struct page **pagep, gfp_t gfp, @@ -1735,13 +1735,13 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, */ arch_swap_restore(swap, page); - if (shmem_should_replace_page(page, gfp)) { + folio = page_folio(page); + if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_page(&page, gfp, info, index); if (error) goto failed; } - folio = page_folio(page); error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp, charge_mm); -- cgit v1.2.3 From 0c023ef52d769ea064df2c86bcdf29cbedb0a9b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:04 -0700 Subject: mm/shmem: add shmem_alloc_folio() Call vma_alloc_folio() directly instead of alloc_page_vma(). Add a shmem_alloc_page() wrapper to avoid changing the callers. 
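A rough sketch of the resulting call pattern (both helper names come from the diff below; the calls themselves are illustrative rather than taken from the patch):

	/* Folio-aware callers can ask for a folio directly... */
	struct folio *folio = shmem_alloc_folio(gfp, info, index);

	/* ...while unconverted callers keep the page-based interface. */
	struct page *page = shmem_alloc_page(gfp, info, index);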
Link: https://lkml.kernel.org/r/20220504182857.4013401-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/shmem.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index d11dc37d332f..668d054728bc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1544,17 +1544,23 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return &folio->page; } -static struct page *shmem_alloc_page(gfp_t gfp, +static struct folio *shmem_alloc_folio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct page *page; + struct folio *folio; shmem_pseudo_vma_init(&pvma, info, index); - page = alloc_page_vma(gfp, &pvma, 0); + folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); shmem_pseudo_vma_destroy(&pvma); - return page; + return folio; +} + +static struct page *shmem_alloc_page(gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + return &shmem_alloc_folio(gfp, info, index)->page; } static struct page *shmem_alloc_and_acct_page(gfp_t gfp, -- cgit v1.2.3 From 72827e5c2bcb86d56f8a8aa78fde0085d8535567 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:04 -0700 Subject: mm/shmem: convert shmem_alloc_and_acct_page to use a folio Convert shmem_alloc_hugepage() to return the folio that it uses and use a folio throughout shmem_alloc_and_acct_page(). Continue to return a page from shmem_alloc_and_acct_page() for now. Link: https://lkml.kernel.org/r/20220504182857.4013401-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/shmem.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 668d054728bc..d5b23932357d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1523,7 +1523,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) return result; } -static struct page *shmem_alloc_hugepage(gfp_t gfp, +static struct folio *shmem_alloc_hugefolio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; @@ -1541,7 +1541,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_destroy(&pvma); if (!folio) count_vm_event(THP_FILE_FALLBACK); - return &folio->page; + return folio; } static struct folio *shmem_alloc_folio(gfp_t gfp, @@ -1568,7 +1568,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, pgoff_t index, bool huge) { struct shmem_inode_info *info = SHMEM_I(inode); - struct page *page; + struct folio *folio; int nr; int err = -ENOSPC; @@ -1580,13 +1580,13 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, goto failed; if (huge) - page = shmem_alloc_hugepage(gfp, info, index); + folio = shmem_alloc_hugefolio(gfp, info, index); else - page = shmem_alloc_page(gfp, info, index); - if (page) { - __SetPageLocked(page); - __SetPageSwapBacked(page); - return page; + folio = shmem_alloc_folio(gfp, info, index); + if (folio) { + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + return &folio->page; } err = -ENOMEM; -- cgit v1.2.3 From b1d0ec3a9a250b2d5ddd790fdaa2245432a903a3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:05 -0700 Subject: mm/shmem: convert shmem_getpage_gfp to use a folio Rename shmem_alloc_and_acct_page() to shmem_alloc_and_acct_folio() and have it return a folio, then use a folio throughout shmem_getpage_gfp(). It continues to return a struct page.
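The page-based contract is kept by converting back at the very end; a minimal sketch of that step, using folio_page() as in the hunk below:

	/* Work on the folio internally, hand the precise subpage back out. */
	*pagep = folio_page(folio, index - hindex);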
Link: https://lkml.kernel.org/r/20220504182857.4013401-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/shmem.c | 95 ++++++++++++++++++++++++++++---------------------------------- 1 file changed, 43 insertions(+), 52 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index d5b23932357d..edcaba7c67ff 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1563,8 +1563,7 @@ static struct page *shmem_alloc_page(gfp_t gfp, return &shmem_alloc_folio(gfp, info, index)->page; } -static struct page *shmem_alloc_and_acct_page(gfp_t gfp, - struct inode *inode, +static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, pgoff_t index, bool huge) { struct shmem_inode_info *info = SHMEM_I(inode); @@ -1586,7 +1585,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, if (folio) { __folio_set_locked(folio); __folio_set_swapbacked(folio); - return &folio->page; + return folio; } err = -ENOMEM; @@ -1800,7 +1799,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct folio *folio; - struct page *page; pgoff_t hindex = index; gfp_t huge_gfp; int error; @@ -1818,19 +1816,18 @@ repeat: sbinfo = SHMEM_SB(inode->i_sb); charge_mm = vma ? vma->vm_mm : NULL; - page = pagecache_get_page(mapping, index, - FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0); - - if (page && vma && userfaultfd_minor(vma)) { - if (!xa_is_value(page)) { - unlock_page(page); - put_page(page); + folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0); + if (folio && vma && userfaultfd_minor(vma)) { + if (!xa_is_value(folio)) { + folio_unlock(folio); + folio_put(folio); } *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); return 0; } - if (xa_is_value(page)) { + if (xa_is_value(folio)) { + struct page *page = &folio->page; error = shmem_swapin_page(inode, index, &page, sgp, gfp, vma, fault_type); if (error == -EEXIST) @@ -1840,17 +1837,17 @@ repeat: return error; } - if (page) { - hindex = page->index; + if (folio) { + hindex = folio->index; if (sgp == SGP_WRITE) - mark_page_accessed(page); - if (PageUptodate(page)) + folio_mark_accessed(folio); + if (folio_test_uptodate(folio)) goto out; /* fallocated page */ if (sgp != SGP_READ) goto clear; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } /* @@ -1877,17 +1874,16 @@ repeat: huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); - page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); - if (IS_ERR(page)) { + folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true); + if (IS_ERR(folio)) { alloc_nohuge: - page = shmem_alloc_and_acct_page(gfp, inode, - index, false); + folio = shmem_alloc_and_acct_folio(gfp, inode, index, false); } - if (IS_ERR(page)) { + if (IS_ERR(folio)) { int retry = 5; - error = PTR_ERR(page); - page = NULL; + error = PTR_ERR(folio); + folio = NULL; if (error != -ENOSPC) goto unlock; /* @@ -1906,30 +1902,26 @@ alloc_nohuge: goto unlock; } - if (PageTransHuge(page)) - hindex = round_down(index, HPAGE_PMD_NR); - else - hindex = index; + hindex = round_down(index, folio_nr_pages(folio)); if (sgp == SGP_WRITE) - __SetPageReferenced(page); + __folio_set_referenced(folio); - folio = page_folio(page); error = shmem_add_to_page_cache(folio, mapping, hindex, NULL, gfp & GFP_RECLAIM_MASK, charge_mm); if (error) goto unacct; - lru_cache_add(page); + folio_add_lru(folio); spin_lock_irq(&info->lock); - info->alloced += compound_nr(page); - 
inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); + info->alloced += folio_nr_pages(folio); + inode->i_blocks += BLOCKS_PER_PAGE << folio_order(folio); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); alloced = true; - if (PageTransHuge(page) && + if (folio_test_pmd_mappable(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < hindex + HPAGE_PMD_NR - 1) { /* @@ -1960,22 +1952,21 @@ clear: * but SGP_FALLOC on a page fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ - if (sgp != SGP_WRITE && !PageUptodate(page)) { - int i; + if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { + long i, n = folio_nr_pages(folio); - for (i = 0; i < compound_nr(page); i++) { - clear_highpage(page + i); - flush_dcache_page(page + i); - } - SetPageUptodate(page); + for (i = 0; i < n; i++) + clear_highpage(folio_page(folio, i)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { if (alloced) { - ClearPageDirty(page); - delete_from_page_cache(page); + folio_clear_dirty(folio); + filemap_remove_folio(folio); spin_lock_irq(&info->lock); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); @@ -1984,24 +1975,24 @@ clear: goto unlock; } out: - *pagep = page + index - hindex; + *pagep = folio_page(folio, index - hindex); return 0; /* * Error recovery. */ unacct: - shmem_inode_unacct_blocks(inode, compound_nr(page)); + shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); - if (PageTransHuge(page)) { - unlock_page(page); - put_page(page); + if (folio_test_large(folio)) { + folio_unlock(folio); + folio_put(folio); goto alloc_nohuge; } unlock: - if (page) { - unlock_page(page); - put_page(page); + if (folio) { + folio_unlock(folio); + folio_put(folio); } if (error == -ENOSPC && !once++) { spin_lock_irq(&info->lock); -- cgit v1.2.3 From da08e9b7932345aff0a748c55f8a25709505f2a3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:05 -0700 Subject: mm/shmem: convert shmem_swapin_page() to shmem_swapin_folio() shmem_swapin_page() only brings in order-0 pages, which are folios by definition. 
Link: https://lkml.kernel.org/r/20220504182857.4013401-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 6 +-- include/linux/pgtable.h | 2 +- mm/shmem.c | 110 ++++++++++++++++++--------------------- 3 files changed, 55 insertions(+), 63 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 3278c6c0d873..8ebf1cec5d90 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -981,10 +981,10 @@ static inline void arch_swap_invalidate_area(int type) } #define __HAVE_ARCH_SWAP_RESTORE -static inline void arch_swap_restore(swp_entry_t entry, struct page *page) +static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { - if (system_supports_mte() && mte_restore_tags(entry, page)) - set_bit(PG_mte_tagged, &page->flags); + if (system_supports_mte() && mte_restore_tags(entry, &folio->page)) + set_bit(PG_mte_tagged, &folio->flags); } #endif /* CONFIG_ARM64_MTE */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 530b1817b58c..39fa93e94b80 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -758,7 +758,7 @@ static inline void arch_swap_invalidate_area(int type) #endif #ifndef __HAVE_ARCH_SWAP_RESTORE -static inline void arch_swap_restore(swp_entry_t entry, struct page *page) +static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { } #endif diff --git a/mm/shmem.c b/mm/shmem.c index edcaba7c67ff..29701be579f8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -135,8 +135,8 @@ static unsigned long shmem_default_max_inodes(void) } #endif -static int shmem_swapin_page(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, +static int shmem_swapin_folio(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, @@ -1159,69 +1159,64 @@ static void shmem_evict_inode(struct inode *inode) } static int shmem_find_swap_entries(struct address_space *mapping, - pgoff_t start, unsigned int nr_entries, - struct page **entries, pgoff_t *indices, - unsigned int type) + pgoff_t start, struct folio_batch *fbatch, + pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; swp_entry_t entry; unsigned int ret = 0; - if (!nr_entries) - return 0; - rcu_read_lock(); - xas_for_each(&xas, page, ULONG_MAX) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) continue; - if (!xa_is_value(page)) + if (!xa_is_value(folio)) continue; - entry = radix_to_swp_entry(page); + entry = radix_to_swp_entry(folio); if (swp_type(entry) != type) continue; indices[ret] = xas.xa_index; - entries[ret] = page; + if (!folio_batch_add(fbatch, folio)) + break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } - if (++ret == nr_entries) - break; } rcu_read_unlock(); - return ret; + return xas.xa_index; } /* * Move the swapped pages for an inode to page cache. Returns the count * of pages swapped in, or the error in case of failure. 
*/ -static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, - pgoff_t *indices) +static int shmem_unuse_swap_entries(struct inode *inode, + struct folio_batch *fbatch, pgoff_t *indices) { int i = 0; int ret = 0; int error = 0; struct address_space *mapping = inode->i_mapping; - for (i = 0; i < pvec.nr; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; - if (!xa_is_value(page)) + if (!xa_is_value(folio)) continue; - error = shmem_swapin_page(inode, indices[i], - &page, SGP_CACHE, + error = shmem_swapin_folio(inode, indices[i], + &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ret++; } if (error == -ENOMEM) @@ -1238,26 +1233,23 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; int ret = 0; - pagevec_init(&pvec); do { - unsigned int nr_entries = PAGEVEC_SIZE; - - pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, - pvec.pages, indices, type); - if (pvec.nr == 0) { + folio_batch_init(&fbatch); + shmem_find_swap_entries(mapping, start, &fbatch, indices, type); + if (folio_batch_count(&fbatch) == 0) { ret = 0; break; } - ret = shmem_unuse_swap_entries(inode, pvec, indices); + ret = shmem_unuse_swap_entries(inode, &fbatch, indices); if (ret < 0) break; - start = indices[pvec.nr - 1]; + start = indices[folio_batch_count(&fbatch) - 1]; } while (true); return ret; @@ -1687,22 +1679,22 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * Returns 0 and the page in pagep if success. On failure, returns the * error code and NULL in *pagep. */ -static int shmem_swapin_page(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, +static int shmem_swapin_folio(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; - struct page *page = NULL; - struct folio *folio; + struct page *page; + struct folio *folio = NULL; swp_entry_t swap; int error; - VM_BUG_ON(!*pagep || !xa_is_value(*pagep)); - swap = radix_to_swp_entry(*pagep); - *pagep = NULL; + VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); + swap = radix_to_swp_entry(*foliop); + *foliop = NULL; /* Look it up and read it in.. */ page = lookup_swap_cache(swap, NULL, 0); @@ -1720,27 +1712,28 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, goto failed; } } + folio = page_folio(page); /* We have to do this with page locked to prevent races */ - lock_page(page); - if (!PageSwapCache(page) || page_private(page) != swap.val || + folio_lock(folio); + if (!folio_test_swapcache(folio) || + folio_swap_entry(folio).val != swap.val || !shmem_confirm_swap(mapping, index, swap)) { error = -EEXIST; goto unlock; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { error = -EIO; goto failed; } - wait_on_page_writeback(page); + folio_wait_writeback(folio); /* * Some architectures may have to restore extra metadata to the - * physical page after reading from swap. + * folio after reading from swap. 
*/ - arch_swap_restore(swap, page); + arch_swap_restore(swap, folio); - folio = page_folio(page); if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_page(&page, gfp, info, index); if (error) @@ -1759,21 +1752,21 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, spin_unlock_irq(&info->lock); if (sgp == SGP_WRITE) - mark_page_accessed(page); + folio_mark_accessed(folio); - delete_from_swap_cache(page); - set_page_dirty(page); + delete_from_swap_cache(&folio->page); + folio_mark_dirty(folio); swap_free(swap); - *pagep = page; + *foliop = folio; return 0; failed: if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; unlock: - if (page) { - unlock_page(page); - put_page(page); + if (folio) { + folio_unlock(folio); + folio_put(folio); } return error; @@ -1827,13 +1820,12 @@ repeat: } if (xa_is_value(folio)) { - struct page *page = &folio->page; - error = shmem_swapin_page(inode, index, &page, + error = shmem_swapin_folio(inode, index, &folio, sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; - *pagep = page; + *pagep = &folio->page; return error; } -- cgit v1.2.3 From a9595b305c0f6cbbf9505325509d95637f6a7062 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:05 -0700 Subject: mm: add folio_mapping_flags() This is the equivalent of PageMappingFlags and is needed for converting mm/migrate.c to folios. Link: https://lkml.kernel.org/r/20220504182857.4013401-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b70124b9c7c1..97eb88dbcbbc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -650,6 +650,11 @@ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) +static __always_inline bool folio_mapping_flags(struct folio *folio) +{ + return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; +} + static __always_inline int PageMappingFlags(struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; -- cgit v1.2.3 From 8b463be3a0241558071e6ba9bf11e4bf42ccc856 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:05 -0700 Subject: mm: add folio_test_movable() This is the folio equivalent of PageMovable() which is needed to convert mm/migrate.c to folios. 
Link: https://lkml.kernel.org/r/20220504182857.4013401-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/migrate.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 2707bfd43a0d..069a89e847f3 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -105,6 +105,11 @@ static inline void __ClearPageMovable(struct page *page) } #endif +static inline bool folio_test_movable(struct folio *folio) +{ + return PageMovable(&folio->page); +} + #ifdef CONFIG_NUMA_BALANCING extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); -- cgit v1.2.3 From e7e3ffeb274f1ff5bc68bb9135128e1ba14a7d53 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:05 -0700 Subject: mm/migrate: convert move_to_new_page() into move_to_new_folio() Pass in the folios that we already have in each caller. Saves a lot of calls to compound_head(). Link: https://lkml.kernel.org/r/20220504182857.4013401-27-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/migrate.c | 58 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index ab46ad93af9f..1415ea25f7aa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -843,21 +843,21 @@ static int fallback_migrate_page(struct address_space *mapping, * < 0 - error code * MIGRATEPAGE_SUCCESS - success */ -static int move_to_new_page(struct page *newpage, struct page *page, +static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { struct address_space *mapping; int rc = -EAGAIN; - bool is_lru = !__PageMovable(page); + bool is_lru = !__PageMovable(&src->page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_FOLIO(!folio_test_locked(src), src); + VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); - mapping = page_mapping(page); + mapping = folio_mapping(src); if (likely(is_lru)) { if (!mapping) - rc = migrate_page(mapping, newpage, page, mode); + rc = migrate_page(mapping, &dst->page, &src->page, mode); else if (mapping->a_ops->migratepage) /* * Most pages have a mapping and most filesystems @@ -866,54 +866,54 @@ static int move_to_new_page(struct page *newpage, struct page *page, * migratepage callback. This is the most common path * for page migration. */ - rc = mapping->a_ops->migratepage(mapping, newpage, - page, mode); + rc = mapping->a_ops->migratepage(mapping, &dst->page, + &src->page, mode); else - rc = fallback_migrate_page(mapping, newpage, - page, mode); + rc = fallback_migrate_page(mapping, &dst->page, + &src->page, mode); } else { /* * In case of non-lru page, it could be released after * isolation step. In that case, we shouldn't try migration. 
*/ - VM_BUG_ON_PAGE(!PageIsolated(page), page); - if (!PageMovable(page)) { + VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); + if (!folio_test_movable(src)) { rc = MIGRATEPAGE_SUCCESS; - ClearPageIsolated(page); + folio_clear_isolated(src); goto out; } - rc = mapping->a_ops->migratepage(mapping, newpage, - page, mode); + rc = mapping->a_ops->migratepage(mapping, &dst->page, + &src->page, mode); WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && - !PageIsolated(page)); + !folio_test_isolated(src)); } /* - * When successful, old pagecache page->mapping must be cleared before - * page is freed; but stats require that PageAnon be left as PageAnon. + * When successful, old pagecache src->mapping must be cleared before + * src is freed; but stats require that PageAnon be left as PageAnon. */ if (rc == MIGRATEPAGE_SUCCESS) { - if (__PageMovable(page)) { - VM_BUG_ON_PAGE(!PageIsolated(page), page); + if (__PageMovable(&src->page)) { + VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); /* * We clear PG_movable under page_lock so any compactor * cannot try to migrate this page. */ - ClearPageIsolated(page); + folio_clear_isolated(src); } /* - * Anonymous and movable page->mapping will be cleared by + * Anonymous and movable src->mapping will be cleared by * free_pages_prepare so don't reset it here for keeping * the type to work PageAnon, for example. */ - if (!PageMappingFlags(page)) - page->mapping = NULL; + if (!folio_mapping_flags(src)) + src->mapping = NULL; - if (likely(!is_zone_device_page(newpage))) - flush_dcache_folio(page_folio(newpage)); + if (likely(!folio_is_zone_device(dst))) + flush_dcache_folio(dst); } out: return rc; @@ -1001,7 +1001,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, goto out_unlock; if (unlikely(!is_lru)) { - rc = move_to_new_page(newpage, page, mode); + rc = move_to_new_folio(dst, folio, mode); goto out_unlock_both; } @@ -1032,7 +1032,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, mode); + rc = move_to_new_folio(dst, folio, mode); /* * When successful, push newpage to LRU immediately: so that if it @@ -1261,7 +1261,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, mode); + rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) remove_migration_ptes(src, -- cgit v1.2.3 From 92fb05242a1b1ecfcb39d9b1421a165adf344a3c Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Thu, 12 May 2022 20:23:06 -0700 Subject: mm: page_table_check: using PxD_SIZE instead of PxD_PAGE_SIZE Patch series "mm: page_table_check: add support on arm64 and riscv", v7. Page table check performs extra verifications at the time when new pages become accessible from the userspace by getting their page table entries (PTEs PMDs etc.) added into the table. It is supported on X86[1]. This patchset made some simple changes and make it easier to support new architecture, then we support this feature on ARM64 and RISCV. [1]https://lore.kernel.org/lkml/20211123214814.3756047-1-pasha.tatashin@soleen.com/ This patch (of 6): Compared with PxD_PAGE_SIZE, which is defined and used only on X86, PxD_SIZE is more common in each architecture. Therefore, it is more reasonable to use PxD_SIZE instead of PxD_PAGE_SIZE in page_table_check.c. At the same time, it is easier to support page table check in other architectures. The substitution has no functional impact on the x86. 
Link: https://lkml.kernel.org/r/20220507110114.4128854-1-tongtiangen@huawei.com Link: https://lkml.kernel.org/r/20220507110114.4128854-2-tongtiangen@huawei.com Signed-off-by: Tong Tiangen Suggested-by: Anshuman Khandual Acked-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Catalin Marinas Cc: Will Deacon Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/page_table_check.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2458281bff89..eb0d0b71cdf6 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -177,7 +177,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, if (pmd_user_accessible_page(pmd)) { page_table_check_clear(mm, addr, pmd_pfn(pmd), - PMD_PAGE_SIZE >> PAGE_SHIFT); + PMD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pmd_clear); @@ -190,7 +190,7 @@ void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, if (pud_user_accessible_page(pud)) { page_table_check_clear(mm, addr, pud_pfn(pud), - PUD_PAGE_SIZE >> PAGE_SHIFT); + PUD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pud_clear); @@ -219,7 +219,7 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, __page_table_check_pmd_clear(mm, addr, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(mm, addr, pmd_pfn(pmd), - PMD_PAGE_SIZE >> PAGE_SHIFT, + PMD_SIZE >> PAGE_SHIFT, pmd_write(pmd)); } } @@ -234,7 +234,7 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, __page_table_check_pud_clear(mm, addr, *pudp); if (pud_user_accessible_page(pud)) { page_table_check_set(mm, addr, pud_pfn(pud), - PUD_PAGE_SIZE >> PAGE_SHIFT, + PUD_SIZE >> PAGE_SHIFT, pud_write(pud)); } } -- cgit v1.2.3 From e5a554014618308f046af99ab9c950165ed6cb11 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 12 May 2022 20:23:06 -0700 Subject: mm: page_table_check: move pxx_user_accessible_page into x86 The pxx_user_accessible_page() checks the PTE bit, it's architecture-specific code, move them into x86's pgtable.h. These helpers are being moved out to make the page table check framework platform independent. Link: https://lkml.kernel.org/r/20220507110114.4128854-3-tongtiangen@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Tong Tiangen Acked-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 17 +++++++++++++++++ mm/page_table_check.c | 17 ----------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0821f87d495f..46fa65d818bd 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1447,6 +1447,23 @@ static inline bool arch_faults_on_old_pte(void) return false; } +#ifdef CONFIG_PAGE_TABLE_CHECK +static inline bool pte_user_accessible_page(pte_t pte) +{ + return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && (pmd_val(pmd) & _PAGE_USER); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && (pud_val(pud) & _PAGE_USER); +} +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/mm/page_table_check.c b/mm/page_table_check.c index eb0d0b71cdf6..3692bea2ea2c 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -52,23 +52,6 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext) return (void *)(page_ext) + page_table_check_ops.offset; } -static inline bool pte_user_accessible_page(pte_t pte) -{ - return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); -} - -static inline bool pmd_user_accessible_page(pmd_t pmd) -{ - return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && - (pmd_val(pmd) & _PAGE_USER); -} - -static inline bool pud_user_accessible_page(pud_t pud) -{ - return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && - (pud_val(pud) & _PAGE_USER); -} - /* * An enty is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. -- cgit v1.2.3 From de8c8e52836d0082188508548d0b939f49f7f0e6 Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Thu, 12 May 2022 20:23:06 -0700 Subject: mm: page_table_check: add hooks to public helpers Move ptep_clear() to the include/linux/pgtable.h and add page table check relate hooks to some helpers, it's prepare for support page table check feature on new architecture. Optimize the implementation of ptep_clear(), page table hooks added page table check stubs, the interface control should be at stubs, there is no rationale for doing a IS_ENABLED() check here. For architectures that do not enable CONFIG_PAGE_TABLE_CHECK, they will call a fallback page table check stubs[1] when getting their page table helpers[2] in include/linux/pgtable.h. [1] page table check stubs defined in include/linux/page_table_check.h [2] ptep_clear() ptep_get_and_clear() pmdp_huge_get_and_clear() pudp_huge_get_and_clear() Link: https://lkml.kernel.org/r/20220507110114.4128854-4-tongtiangen@huawei.com Signed-off-by: Tong Tiangen Acked-by: Pasha Tatashin Cc: Anshuman Khandual Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Kefeng Wang Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 10 ---------- include/linux/pgtable.h | 23 +++++++++++++++-------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 46fa65d818bd..44e2d6f1dbaa 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1072,16 +1072,6 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, return pte; } -#define __HAVE_ARCH_PTEP_CLEAR -static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK)) - ptep_get_and_clear(mm, addr, ptep); - else - pte_clear(mm, addr, ptep); -} - #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 39fa93e94b80..e0a449b477c5 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -12,6 +12,7 @@ #include #include #include +#include #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS @@ -259,14 +260,6 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif -#ifndef __HAVE_ARCH_PTEP_CLEAR -static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pte_clear(mm, addr, ptep); -} -#endif - #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, @@ -274,10 +267,19 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = *ptep; pte_clear(mm, address, ptep); + page_table_check_pte_clear(mm, address, pte); return pte; } #endif +#ifndef __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + ptep_get_and_clear(mm, addr, ptep); +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET static inline pte_t ptep_get(pte_t *ptep) { @@ -347,7 +349,10 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_t *pmdp) { pmd_t pmd = *pmdp; + pmd_clear(pmdp); + page_table_check_pmd_clear(mm, address, pmd); + return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ @@ -359,6 +364,8 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_t pud = *pudp; pud_clear(pudp); + page_table_check_pud_clear(mm, address, pud); + return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ -- cgit v1.2.3 From 2e7dc2b632a324c670ff11dd6c84d711043c683f Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Thu, 12 May 2022 20:23:06 -0700 Subject: mm: remove __HAVE_ARCH_PTEP_CLEAR in pgtable.h Currently, there is no architecture definition __HAVE_ARCH_PTEP_CLEAR, Generic ptep_clear() is the only definition for all architecture, So drop the "#ifndef __HAVE_ARCH_PTEP_CLEAR". Link: https://lkml.kernel.org/r/20220507110114.4128854-5-tongtiangen@huawei.com Signed-off-by: Tong Tiangen Suggested-by: Anshuman Khandual Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Kefeng Wang Cc: Palmer Dabbelt Cc: Pasha Tatashin Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e0a449b477c5..ecb4c762d760 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -272,13 +272,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #endif -#ifndef __HAVE_ARCH_PTEP_CLEAR static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep_get_and_clear(mm, addr, ptep); } -#endif #ifndef __HAVE_ARCH_PTEP_GET static inline pte_t ptep_get(pte_t *ptep) -- cgit v1.2.3 From 42b2547137f5c974bb1bfd657c869fe96b96d86f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 12 May 2022 20:23:06 -0700 Subject: arm64/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table check") , enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on arm64. Add additional page table check stubs for page table helpers, these stubs can be used to check the existing page table entries. Link: https://lkml.kernel.org/r/20220507110114.4128854-6-tongtiangen@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Tong Tiangen Reviewed-by: Pasha Tatashin Acked-by: Catalin Marinas Cc: Anshuman Khandual Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable.h | 61 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index cc0498652bae..ec97d3e17e88 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -92,6 +92,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8ebf1cec5d90..4e61cde27f9f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE @@ -96,6 +97,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_young(pte) (!!(pte_val(pte) & PTE_AF)) #define pte_special(pte) (!!(pte_val(pte) & PTE_SPECIAL)) #define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE)) +#define pte_user(pte) (!!(pte_val(pte) & PTE_USER)) #define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN)) #define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT)) #define pte_devmap(pte) (!!(pte_val(pte) & PTE_DEVMAP)) @@ -312,8 +314,8 @@ static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep, __func__, pte_val(old_pte), pte_val(pte)); } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) { if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) __sync_icache_dcache(pte); @@ -343,6 +345,13 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, pte); } +static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t 
*ptep, pte_t pte) +{ + page_table_check_pte_set(mm, addr, ptep, pte); + return __set_pte_at(mm, addr, ptep, pte); +} + /* * Huge pte definitions. */ @@ -454,6 +463,8 @@ static inline int pmd_trans_huge(pmd_t pmd) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_valid(pmd) pte_valid(pmd_pte(pmd)) +#define pmd_user(pmd) pte_user(pmd_pte(pmd)) +#define pmd_user_exec(pmd) pte_user_exec(pmd_pte(pmd)) #define pmd_cont(pmd) pte_cont(pmd_pte(pmd)) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) @@ -501,8 +512,19 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) -#define set_pud_at(mm, addr, pudp, pud) set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)) +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + page_table_check_pmd_set(mm, addr, pmdp, pmd); + return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); +} + +static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + page_table_check_pud_set(mm, addr, pudp, pud); + return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); +} #define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d)) #define __phys_to_p4d_val(phys) __phys_to_pte_val(phys) @@ -643,6 +665,24 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define pud_present(pud) pte_present(pud_pte(pud)) #define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) #define pud_valid(pud) pte_valid(pud_pte(pud)) +#define pud_user(pud) pte_user(pud_pte(pud)) + +#ifdef CONFIG_PAGE_TABLE_CHECK +static inline bool pte_user_accessible_page(pte_t pte) +{ + return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte)); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_present(pud) && pud_user(pud); +} +#endif static inline void set_pud(pud_t *pudp, pud_t pud) { @@ -876,7 +916,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - return __pte(xchg_relaxed(&pte_val(*ptep), 0)); + pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); + + page_table_check_pte_clear(mm, address, pte); + + return pte; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -884,7 +928,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp)); + pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0)); + + page_table_check_pmd_clear(mm, address, pmd); + + return pmd; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -918,6 +966,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); } #endif -- cgit v1.2.3 From 3fee229a8eb936b96933c6b2cd02d2e87a4cc997 Mon Sep 17 00:00:00 2001 
From: Tong Tiangen Date: Thu, 12 May 2022 20:23:06 -0700 Subject: riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv. Add additional page table check stubs for page table helpers, these stubs can be used to check the existing page table entries. Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com Signed-off-by: Tong Tiangen Reviewed-by: Pasha Tatashin Cc: Anshuman Khandual Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kefeng Wang Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable.h | 71 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 66 insertions(+), 6 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 00fd9c548f26..a2285067f2fc 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -38,6 +38,7 @@ config RISCV select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU select ARCH_SUPPORTS_HUGETLBFS if MMU + select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_USE_MEMTEST select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 046b44225623..62e733c85836 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -114,6 +114,8 @@ #include #endif /* CONFIG_64BIT */ +#include + #ifdef CONFIG_XIP_KERNEL #define XIP_FIXUP(addr) ({ \ uintptr_t __a = (uintptr_t)(addr); \ @@ -315,6 +317,11 @@ static inline int pte_exec(pte_t pte) return pte_val(pte) & _PAGE_EXEC; } +static inline int pte_user(pte_t pte) +{ + return pte_val(pte) & _PAGE_USER; +} + static inline int pte_huge(pte_t pte) { return pte_present(pte) && (pte_val(pte) & _PAGE_LEAF); @@ -446,7 +453,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) void flush_icache_pte(pte_t pte); -static inline void set_pte_at(struct mm_struct *mm, +static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { if (pte_present(pteval) && pte_exec(pteval)) @@ -455,10 +462,17 @@ static inline void set_pte_at(struct mm_struct *mm, set_pte(ptep, pteval); } +static inline void set_pte_at(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pteval) +{ + page_table_check_pte_set(mm, addr, ptep, pteval); + __set_pte_at(mm, addr, ptep, pteval); +} + static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - set_pte_at(mm, addr, ptep, __pte(0)); + __set_pte_at(mm, addr, ptep, __pte(0)); } #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS @@ -479,7 +493,11 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - return __pte(atomic_long_xchg((atomic_long_t *)ptep, 0)); + pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0)); + + page_table_check_pte_clear(mm, address, pte); + + return pte; } #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG @@ -546,6 +564,13 @@ static inline unsigned long pmd_pfn(pmd_t pmd) return ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT); } +#define __pud_to_phys(pud) (pud_val(pud) >> _PAGE_PFN_SHIFT << PAGE_SHIFT) + +static inline unsigned long pud_pfn(pud_t pud) +{ + return ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT); +} + static inline pmd_t pmd_modify(pmd_t pmd, 
pgprot_t newprot) { return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); @@ -567,6 +592,11 @@ static inline int pmd_young(pmd_t pmd) return pte_young(pmd_pte(pmd)); } +static inline int pmd_user(pmd_t pmd) +{ + return pte_user(pmd_pte(pmd)); +} + static inline pmd_t pmd_mkold(pmd_t pmd) { return pte_pmd(pte_mkold(pmd_pte(pmd))); @@ -600,15 +630,39 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - return set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); + page_table_check_pmd_set(mm, addr, pmdp, pmd); + return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); +} + +static inline int pud_user(pud_t pud) +{ + return pte_user(pud_pte(pud)); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - return set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); + page_table_check_pud_set(mm, addr, pudp, pud); + return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); +} + +#ifdef CONFIG_PAGE_TABLE_CHECK +static inline bool pte_user_accessible_page(pte_t pte) +{ + return pte_present(pte) && pte_user(pte); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && pmd_user(pmd); } +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && pud_user(pud); +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { @@ -634,7 +688,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp)); + pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0)); + + page_table_check_pmd_clear(mm, address, pmd); + + return pmd; } #define __HAVE_ARCH_PMDP_SET_WRPROTECT @@ -648,6 +706,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd))); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit v1.2.3 From c8db8c2628afc7088a43de3f7cfbcc2ef1f182f7 Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Thu, 12 May 2022 20:23:07 -0700 Subject: mm: functions may simplify the use of return values p4d_clear_huge may be optimized for void return type and function usage. vunmap_p4d_range function saves a few steps here. Link: https://lkml.kernel.org/r/20220507150630.90399-1-kunyu@nfschina.com Signed-off-by: Li kunyu Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton --- arch/x86/mm/pgtable.c | 3 +-- include/linux/pgtable.h | 12 +++--------- mm/vmalloc.c | 7 ++----- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index f16059e9a85e..a932d7712d85 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -686,9 +686,8 @@ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) * * No 512GB pages yet -- always return 0 */ -int p4d_clear_huge(p4d_t *p4d) +void p4d_clear_huge(p4d_t *p4d) { - return 0; } #endif diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index ecb4c762d760..3cdc16cfd867 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1448,16 +1448,13 @@ static inline int pmd_protnone(pmd_t pmd) #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); -int p4d_clear_huge(p4d_t *p4d); +void p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } -static inline int p4d_clear_huge(p4d_t *p4d) -{ - return 0; -} +static inline void p4d_clear_huge(p4d_t *p4d) { } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); @@ -1480,10 +1477,7 @@ static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } -static inline int p4d_clear_huge(p4d_t *p4d) -{ - return 0; -} +static inline void p4d_clear_huge(p4d_t *p4d) { } static inline int pud_clear_huge(pud_t *pud) { return 0; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c3af9b28e54f..5b59ccafcd46 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -389,18 +389,15 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, { p4d_t *p4d; unsigned long next; - int cleared; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - cleared = p4d_clear_huge(p4d); - if (cleared || p4d_bad(*p4d)) + p4d_clear_huge(p4d); + if (p4d_bad(*p4d)) *mask |= PGTBL_P4D_MODIFIED; - if (cleared) - continue; if (p4d_none_or_clear_bad(p4d)) continue; vunmap_pud_range(p4d, addr, next, mask); -- cgit v1.2.3 From 2e14a8d3bbccf22a807c012ff6e7c2f307600e4a Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 12 May 2022 20:23:07 -0700 Subject: mm/damon/reclaim: use resource_size function on resource object Fix the following coccicheck warnings: ./mm/damon/reclaim.c:241:30-33: WARNING: Suspicious code. resource_size is maybe missing with res. 
Link: https://lkml.kernel.org/r/20220507032512.129598-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Jiapeng Chong Reported-by: Abaci Robot Reviewed-by: SeongJae Park Cc: "Boehme, Markus" Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index f37c5d4b27fa..8efbfb24f3a1 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -238,7 +238,7 @@ static int walk_system_ram(struct resource *res, void *arg) { struct damon_reclaim_ram_walk_arg *a = arg; - if (a->end - a->start < res->end - res->start) { + if (a->end - a->start < resource_size(res)) { a->start = res->start; a->end = res->end; } -- cgit v1.2.3 From d1ed51fcdbd69be3729f6e249b61cc73fb3b2dd8 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Thu, 12 May 2022 20:23:07 -0700 Subject: docs: vm/page_owner: tweak literal block in STANDARD FORMAT SPECIFIERS A semantic conflict between commit 5603f9bdea68 ("docs: vm/page_owner: use literal blocks for param description") and a change queued for v5.19 authored by Jiajian Ye ("tools/vm/page_owner_sort.c: support sorting blocks by multiple keys") results in a warning from "make htmldocs" saying: [...]/vm/page_owner.rst:176: WARNING: Literal block expected; none found. This is because a literal block in ReST ends at a line which has the same indent as the paragraph preceding it. In this case the one with no indent. Indent the two "For --xxxx option:" lines by two columns and make the whole section a literal block. While at it, fix indents by white spaces of "ator" keys. Link: https://lkml.kernel.org/r/fdfecc82-d41e-6d8a-738d-4beb6faa27fb@gmail.com Signed-of-by: Akira Yokosawa Reported-by: Shenghong Han Cc: Jiajian Ye Cc: Chongxi Zhao Cc: Yinan Zhang Cc: Yixuan Cao Cc: Yongqiang Liu Cc: Yuhong Feng Cc: Haowen Bai Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/vm/page_owner.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 25622c715823..f5c954afe97c 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -173,7 +173,7 @@ STANDARD FORMAT SPECIFIERS ========================== :: -For --sort option: + For --sort option: KEY LONG DESCRIPTION p pid process ID @@ -183,9 +183,9 @@ For --sort option: T txt full text of block ft free_ts timestamp of the page when it was released at alloc_ts timestamp of the page when it was allocated - ator allocator memory allocator for pages + ator allocator memory allocator for pages -For --curl option: + For --curl option: KEY LONG DESCRIPTION p pid process ID @@ -193,4 +193,4 @@ For --curl option: n name task command name f free whether the page has been released or not st stacktrace stack trace of the page allocation - ator allocator memory allocator for pages + ator allocator memory allocator for pages -- cgit v1.2.3 From f67bed134a053663852a1a3ab1b3223bfc2104a2 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 12 May 2022 20:23:07 -0700 Subject: percpu: improve percpu_alloc_percpu event trace Add call_site, bytes_alloc and gfp_flags fields to the output of the percpu_alloc_percpu ftrace event: mkdir-4393 [001] 169.334788: percpu_alloc_percpu: call_site=mem_cgroup_css_alloc+0xa6 reserved=0 is_atomic=0 size=2408 align=8 base_addr=0xffffc7117fc00000 off=402176 ptr=0x3dc867a62300 bytes_alloc=14448 gfp_flags=GFP_KERNEL_ACCOUNT This is required to track memcg-accounted percpu allocations. 
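Editor's sketch (not part of the patch): with call_site, bytes_alloc and gfp_flags in the event, a tracepoint probe can attribute accounted per-CPU allocations to their callers. The parameter list mirrors the TP_PROTO in the diff below; the probe body, the includes and the register_trace_<event>() naming follow the usual generated-helper convention and should be treated as assumptions, not as code from this series.

	#include <linux/module.h>
	#include <linux/gfp.h>
	#include <trace/events/percpu.h>

	/* Probe matching the extended prototype; the leading void *data
	 * argument is the usual tracepoint-probe convention.
	 */
	static void probe_percpu_alloc(void *data, unsigned long call_site,
				       bool reserved, bool is_atomic,
				       size_t size, size_t align,
				       void *base_addr, int off,
				       void __percpu *ptr,
				       size_t bytes_alloc, gfp_t gfp_flags)
	{
		if (gfp_flags & __GFP_ACCOUNT)	/* memcg-accounted allocation */
			pr_debug("%pS allocated %zu accounted bytes\n",
				 (void *)call_site, bytes_alloc);
	}

	/* In module init/exit (assuming the tracepoint is reachable there):
	 *	register_trace_percpu_alloc_percpu(probe_percpu_alloc, NULL);
	 *	unregister_trace_percpu_alloc_percpu(probe_percpu_alloc, NULL);
	 */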
Link: https://lkml.kernel.org/r/a07be858-c8a3-7851-9086-e3262cbcf707@openvz.org Signed-off-by: Vasily Averin Acked-by: Roman Gushchin Cc: Shakeel Butt Cc: Steven Rostedt Cc: Ingo Molnar Cc: Vlastimil Babka Cc: Michal Hocko Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Signed-off-by: Andrew Morton --- include/trace/events/percpu.h | 23 +++++++++++++++++------ mm/percpu-internal.h | 8 ++++---- mm/percpu.c | 5 +++-- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h index df112a64f6c9..e989cefc0def 100644 --- a/include/trace/events/percpu.h +++ b/include/trace/events/percpu.h @@ -6,15 +6,20 @@ #define _TRACE_PERCPU_H #include +#include TRACE_EVENT(percpu_alloc_percpu, - TP_PROTO(bool reserved, bool is_atomic, size_t size, - size_t align, void *base_addr, int off, void __percpu *ptr), + TP_PROTO(unsigned long call_site, + bool reserved, bool is_atomic, size_t size, + size_t align, void *base_addr, int off, + void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags), - TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr), + TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off, + ptr, bytes_alloc, gfp_flags), TP_STRUCT__entry( + __field( unsigned long, call_site ) __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) @@ -22,9 +27,11 @@ TRACE_EVENT(percpu_alloc_percpu, __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) ), - TP_fast_assign( + __entry->call_site = call_site; __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; @@ -32,12 +39,16 @@ TRACE_EVENT(percpu_alloc_percpu, __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; ), - TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p", + TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s", + (void *)__entry->call_site, __entry->reserved, __entry->is_atomic, __entry->size, __entry->align, - __entry->base_addr, __entry->off, __entry->ptr) + __entry->base_addr, __entry->off, __entry->ptr, + __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags)) ); TRACE_EVENT(percpu_free_percpu, diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 411d1593ef23..70b1ea23f4d2 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -113,7 +113,6 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } -#ifdef CONFIG_MEMCG_KMEM /** * pcpu_obj_full_size - helper to calculate size of each accounted object * @size: size of area to allocate in bytes @@ -123,13 +122,14 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) */ static inline size_t pcpu_obj_full_size(size_t size) { - size_t extra_size; + size_t extra_size = 0; - extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); +#ifdef CONFIG_MEMCG_KMEM + extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); +#endif return size * num_possible_cpus() + extra_size; } -#endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_PERCPU_STATS diff --git a/mm/percpu.c b/mm/percpu.c index ea28db283044..3633eeefaa0d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1884,8 +1884,9 @@ area_found: ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size, 
gfp); - trace_percpu_alloc_percpu(reserved, is_atomic, size, align, - chunk->base_addr, off, ptr); + trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align, + chunk->base_addr, off, ptr, + pcpu_obj_full_size(size), gfp); pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); -- cgit v1.2.3 From e7be8d1dd983156bbdd22c0319b71119a8fbb697 Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Thu, 12 May 2022 20:23:07 -0700 Subject: zram: remove double compression logic The 2nd trial allocation under per-cpu presumption has been used to prevent regression of allocation failure. However, it makes trouble for maintenance without significant benefit. The slowpath branch is executed extremely rarely: getting there is problematic. Therefore, we delete this branch. Since b09ab054b69b ("zram: support BDI_CAP_STABLE_WRITES"), zram has used QUEUE_FLAG_STABLE_WRITES to prevent buffer change between 1st and 2nd memory allocations. Since we remove second trial memory allocation logic, we could remove the STABLE_WRITES flag because there is no change buffer to be modified under us. Link: https://lkml.kernel.org/r/20220505094443.11728-1-avromanov@sberdevices.ru Signed-off-by: Alexey Romanov Signed-off-by: Dmitry Rokosov Acked-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 42 ++++++++++-------------------------------- drivers/block/zram/zram_drv.h | 1 - 2 files changed, 10 insertions(+), 33 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8562a7cce558..7e849653775c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1144,15 +1144,14 @@ static ssize_t bd_stat_show(struct device *dev, static ssize_t debug_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { - int version = 1; + int version = 2; struct zram *zram = dev_to_zram(dev); ssize_t ret; down_read(&zram->init_lock); ret = scnprintf(buf, PAGE_SIZE, - "version: %d\n%8llu %8llu\n", + "version: %d\n%8llu\n", version, - (u64)atomic64_read(&zram->stats.writestall), (u64)atomic64_read(&zram->stats.miss_free)); up_read(&zram->init_lock); @@ -1368,7 +1367,6 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, } kunmap_atomic(mem); -compress_again: zstrm = zcomp_stream_get(zram->comp); src = kmap_atomic(page); ret = zcomp_compress(zstrm, src, &comp_len); @@ -1377,39 +1375,20 @@ compress_again: if (unlikely(ret)) { zcomp_stream_put(zram->comp); pr_err("Compression failed! err=%d\n", ret); - zs_free(zram->mem_pool, handle); return ret; } if (comp_len >= huge_class_size) comp_len = PAGE_SIZE; - /* - * handle allocation has 2 paths: - * a) fast path is executed with preemption disabled (for - * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear, - * since we can't sleep; - * b) slow path enables preemption and attempts to allocate - * the page with __GFP_DIRECT_RECLAIM bit set. we have to - * put per-cpu compression stream and, thus, to re-do - * the compression once handle is allocated. - * - * if we have a 'non-null' handle here then we are coming - * from the slow path and handle has already been allocated. 
- */ - if (!handle) - handle = zs_malloc(zram->mem_pool, comp_len, - __GFP_KSWAPD_RECLAIM | - __GFP_NOWARN | - __GFP_HIGHMEM | - __GFP_MOVABLE); - if (!handle) { + + handle = zs_malloc(zram->mem_pool, comp_len, + __GFP_KSWAPD_RECLAIM | + __GFP_NOWARN | + __GFP_HIGHMEM | + __GFP_MOVABLE); + + if (unlikely(!handle)) { zcomp_stream_put(zram->comp); - atomic64_inc(&zram->stats.writestall); - handle = zs_malloc(zram->mem_pool, comp_len, - GFP_NOIO | __GFP_HIGHMEM | - __GFP_MOVABLE); - if (handle) - goto compress_again; return -ENOMEM; } @@ -1967,7 +1946,6 @@ static int zram_add(void) if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 80c3b43b4828..158c91e54850 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -81,7 +81,6 @@ struct zram_stats { atomic64_t huge_pages_since; /* no. of huge pages since zram set up */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ - atomic64_t writestall; /* no. of write slow paths */ atomic64_t miss_free; /* no. of missed free */ #ifdef CONFIG_ZRAM_WRITEBACK atomic64_t bd_count; /* no. of pages in backing device */ -- cgit v1.2.3 From fe573327ffb1deba802c91dd1d4ff41dafa97a0e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 12 May 2022 20:23:08 -0700 Subject: tracing: incorrect gfp_t conversion Fixes the following sparse warnings: include/trace/events/*: sparse: cast to restricted gfp_t include/trace/events/*: sparse: restricted gfp_t degrades to integer gfp_t type is bitwise and requires __force attributes for any casts. 
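Editor's sketch (not part of the patch): gfp_t is declared __bitwise, so sparse rejects implicit conversions between it and plain integers; a (__force ...) cast is the idiomatic way to mark the conversion as intentional. The stub macros below exist only so the fragment builds outside the kernel, where the annotations normally come from <linux/compiler_types.h> and <linux/types.h>; the __GFP_IO value is for illustration only.

	#ifdef __CHECKER__			/* building under sparse */
	#define __bitwise	__attribute__((bitwise))
	#define __force		__attribute__((force))
	#else
	#define __bitwise
	#define __force
	#endif

	typedef unsigned int __bitwise gfp_t;
	#define __GFP_IO	((__force gfp_t)0x40u)	/* illustrative value */

	static unsigned long gfp_flags_for_trace(gfp_t gfp)
	{
		/* return gfp;   <- sparse: "restricted gfp_t degrades to integer" */
		return (__force unsigned long)gfp;	/* explicit, warning-free */
	}

This is the pattern the patch applies throughout the trace headers: store the flags as unsigned long in the event and perform the __force cast once in TP_fast_assign().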
Link: https://lkml.kernel.org/r/331d88fe-f4f7-657c-02a2-d977f15fbff6@openvz.org Signed-off-by: Vasily Averin Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- include/trace/events/btrfs.h | 4 +- include/trace/events/compaction.h | 4 +- include/trace/events/kmem.h | 12 +++--- include/trace/events/mmflags.h | 78 +++++++++++++++++++-------------------- include/trace/events/vmscan.h | 16 ++++---- mm/compaction.c | 2 +- 7 files changed, 59 insertions(+), 59 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2a08a3c4ba95..2d2ccae933c2 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -367,7 +367,7 @@ static inline int gfp_migratetype(const gfp_t gfp_flags) return MIGRATE_UNMOVABLE; /* Group based on mobility */ - return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; + return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #undef GFP_MOVABLE_MASK #undef GFP_MOVABLE_SHIFT diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index f068ff30d654..ed92c80331ea 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1344,13 +1344,13 @@ TRACE_EVENT(alloc_extent_state, TP_STRUCT__entry( __field(const struct extent_state *, state) - __field(gfp_t, mask) + __field(unsigned long, mask) __field(const void*, ip) ), TP_fast_assign( __entry->state = state, - __entry->mask = mask, + __entry->mask = (__force unsigned long)mask, __entry->ip = (const void *)IP ), diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index c6d5d70dc7a5..3313eb83c117 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -162,13 +162,13 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages, TP_STRUCT__entry( __field(int, order) - __field(gfp_t, gfp_mask) + __field(unsigned long, gfp_mask) __field(int, prio) ), TP_fast_assign( __entry->order = order; - __entry->gfp_mask = gfp_mask; + __entry->gfp_mask = (__force unsigned long)gfp_mask; __entry->prio = prio; ), diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index ddc8c944f417..71c141804222 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -24,7 +24,7 @@ DECLARE_EVENT_CLASS(kmem_alloc, __field( const void *, ptr ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) + __field( unsigned long, gfp_flags ) ), TP_fast_assign( @@ -32,7 +32,7 @@ DECLARE_EVENT_CLASS(kmem_alloc, __entry->ptr = ptr; __entry->bytes_req = bytes_req; __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; + __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s", @@ -75,7 +75,7 @@ DECLARE_EVENT_CLASS(kmem_alloc_node, __field( const void *, ptr ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) + __field( unsigned long, gfp_flags ) __field( int, node ) ), @@ -84,7 +84,7 @@ DECLARE_EVENT_CLASS(kmem_alloc_node, __entry->ptr = ptr; __entry->bytes_req = bytes_req; __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; + __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; ), @@ -208,14 +208,14 @@ TRACE_EVENT(mm_page_alloc, TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) - __field( gfp_t, gfp_flags ) + __field( unsigned long, gfp_flags ) __field( int, migratetype ) ), TP_fast_assign( __entry->pfn = page ? 
page_to_pfn(page) : -1UL; __entry->order = order; - __entry->gfp_flags = gfp_flags; + __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->migratetype = migratetype; ), diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 6532119a6bf1..fcb1ffb2cc8d 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -14,48 +14,48 @@ */ #define __def_gfpflag_names \ - {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ - {(unsigned long)GFP_TRANSHUGE_LIGHT, "GFP_TRANSHUGE_LIGHT"}, \ - {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ - {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ - {(unsigned long)GFP_USER, "GFP_USER"}, \ - {(unsigned long)GFP_KERNEL_ACCOUNT, "GFP_KERNEL_ACCOUNT"}, \ - {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \ - {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \ - {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ - {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \ - {(unsigned long)GFP_NOWAIT, "GFP_NOWAIT"}, \ - {(unsigned long)GFP_DMA, "GFP_DMA"}, \ - {(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \ - {(unsigned long)GFP_DMA32, "GFP_DMA32"}, \ - {(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \ - {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ - {(unsigned long)__GFP_IO, "__GFP_IO"}, \ - {(unsigned long)__GFP_FS, "__GFP_FS"}, \ - {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ - {(unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \ - {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ - {(unsigned long)__GFP_NORETRY, "__GFP_NORETRY"}, \ - {(unsigned long)__GFP_COMP, "__GFP_COMP"}, \ - {(unsigned long)__GFP_ZERO, "__GFP_ZERO"}, \ - {(unsigned long)__GFP_NOMEMALLOC, "__GFP_NOMEMALLOC"}, \ - {(unsigned long)__GFP_MEMALLOC, "__GFP_MEMALLOC"}, \ - {(unsigned long)__GFP_HARDWALL, "__GFP_HARDWALL"}, \ - {(unsigned long)__GFP_THISNODE, "__GFP_THISNODE"}, \ - {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ - {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ - {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ - {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ - {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ - {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ - {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ + {(__force unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ + {(__force unsigned long)GFP_TRANSHUGE_LIGHT, "GFP_TRANSHUGE_LIGHT"}, \ + {(__force unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ + {(__force unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ + {(__force unsigned long)GFP_USER, "GFP_USER"}, \ + {(__force unsigned long)GFP_KERNEL_ACCOUNT, "GFP_KERNEL_ACCOUNT"}, \ + {(__force unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \ + {(__force unsigned long)GFP_NOFS, "GFP_NOFS"}, \ + {(__force unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ + {(__force unsigned long)GFP_NOIO, "GFP_NOIO"}, \ + {(__force unsigned long)GFP_NOWAIT, "GFP_NOWAIT"}, \ + {(__force unsigned long)GFP_DMA, "GFP_DMA"}, \ + {(__force unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \ + {(__force unsigned long)GFP_DMA32, "GFP_DMA32"}, \ + {(__force unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \ + {(__force unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ + {(__force unsigned long)__GFP_IO, "__GFP_IO"}, \ + {(__force unsigned long)__GFP_FS, "__GFP_FS"}, \ + {(__force unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ + {(__force unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \ + {(__force unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ + {(__force unsigned 
long)__GFP_NORETRY, "__GFP_NORETRY"}, \ + {(__force unsigned long)__GFP_COMP, "__GFP_COMP"}, \ + {(__force unsigned long)__GFP_ZERO, "__GFP_ZERO"}, \ + {(__force unsigned long)__GFP_NOMEMALLOC, "__GFP_NOMEMALLOC"}, \ + {(__force unsigned long)__GFP_MEMALLOC, "__GFP_MEMALLOC"}, \ + {(__force unsigned long)__GFP_HARDWALL, "__GFP_HARDWALL"}, \ + {(__force unsigned long)__GFP_THISNODE, "__GFP_THISNODE"}, \ + {(__force unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ + {(__force unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ + {(__force unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ + {(__force unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ + {(__force unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ + {(__force unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ + {(__force unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ + {(__force unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ #ifdef CONFIG_KASAN_HW_TAGS #define __def_gfpflag_names_kasan , \ - {(unsigned long)__GFP_SKIP_ZERO, "__GFP_SKIP_ZERO"}, \ - {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ - {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} + {(__force unsigned long)__GFP_SKIP_ZERO, "__GFP_SKIP_ZERO"}, \ + {(__force unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ + {(__force unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} #else #define __def_gfpflag_names_kasan #endif diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index de136dbd623a..408c86244d64 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -96,14 +96,14 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd, __field( int, nid ) __field( int, zid ) __field( int, order ) - __field( gfp_t, gfp_flags ) + __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->nid = nid; __entry->zid = zid; __entry->order = order; - __entry->gfp_flags = gfp_flags; + __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("nid=%d order=%d gfp_flags=%s", @@ -120,12 +120,12 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, TP_STRUCT__entry( __field( int, order ) - __field( gfp_t, gfp_flags ) + __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->order = order; - __entry->gfp_flags = gfp_flags; + __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("order=%d gfp_flags=%s", @@ -210,7 +210,7 @@ TRACE_EVENT(mm_shrink_slab_start, __field(void *, shrink) __field(int, nid) __field(long, nr_objects_to_shrink) - __field(gfp_t, gfp_flags) + __field(unsigned long, gfp_flags) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) @@ -222,7 +222,7 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->shrink = shr->scan_objects; __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; - __entry->gfp_flags = sc->gfp_mask; + __entry->gfp_flags = (__force unsigned long)sc->gfp_mask; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; @@ -446,13 +446,13 @@ TRACE_EVENT(mm_vmscan_node_reclaim_begin, TP_STRUCT__entry( __field(int, nid) __field(int, order) - __field(gfp_t, gfp_flags) + __field(unsigned long, gfp_flags) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; - __entry->gfp_flags = gfp_flags; + __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("nid=%d order=%d gfp_flags=%s", diff --git a/mm/compaction.c b/mm/compaction.c index 65970107b789..0e0ef9e44530 
100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2569,7 +2569,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, struct page **capture) { - int may_perform_io = gfp_mask & __GFP_IO; + int may_perform_io = (__force int)(gfp_mask & __GFP_IO); struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; -- cgit v1.2.3 From 3f80492001aa64ac585016050ace8680611c2e20 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 12 May 2022 20:23:08 -0700 Subject: mm/vmalloc: use raw_cpu_ptr() for vmap_block_queue access The per-CPU resource vmap_block_queue is accessed via get_cpu_var(). That macro disables preemption and then loads the pointer from the current CPU. This doesn't work on PREEMPT_RT because a spinlock_t is later accessed within the preempt-disable section. There is no need to disable preemption while accessing the per-CPU struct vmap_block_queue because the list is protected with a spinlock_t. The per-CPU struct is also accessed cross-CPU in purge_fragmented_blocks(). It is possible that by using raw_cpu_ptr() the code migrates to another CPU and uses struct from another CPU. This is fine because the list is locked and the locked section is very short. Use raw_cpu_ptr() to access vmap_block_queue. Link: https://lkml.kernel.org/r/YnKx3duAB53P7ojN@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Cc: Uladzislau Rezki (Sony) Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- mm/vmalloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5b59ccafcd46..07db42455dd4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1938,11 +1938,10 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) return ERR_PTR(err); } - vbq = &get_cpu_var(vmap_block_queue); + vbq = raw_cpu_ptr(&vmap_block_queue); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); - put_cpu_var(vmap_block_queue); return vaddr; } @@ -2021,7 +2020,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) order = get_order(size); rcu_read_lock(); - vbq = &get_cpu_var(vmap_block_queue); + vbq = raw_cpu_ptr(&vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; @@ -2044,7 +2043,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) break; } - put_cpu_var(vmap_block_queue); rcu_read_unlock(); /* Allocate new block if nothing was found */ -- cgit v1.2.3 From fc0e5b91dfe4420a3f26bde36ee232aa8c6ded5d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 12 May 2022 20:23:08 -0700 Subject: kasan: clean up comments in internal kasan.h Clean up comments in mm/kasan/kasan.h: clarify, unify styles, fix punctuation, etc. 
Link: https://lkml.kernel.org/r/a0680ff30035b56cb7bdd5f59fd400e71712ceb5.1652111464.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 74 +++++++++++++++++++++++++------------------------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b01b4bbe0409..fed4f7a00d33 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -42,6 +42,7 @@ static inline bool kasan_sync_fault_possible(void) { return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM; } + #else static inline bool kasan_stack_collection_enabled(void) @@ -73,47 +74,41 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT) #ifdef CONFIG_KASAN_GENERIC -#define KASAN_FREE_PAGE 0xFF /* page was freed */ -#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ -#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ -#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ -#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ +#define KASAN_FREE_PAGE 0xFF /* freed page */ +#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocation */ +#define KASAN_KMALLOC_REDZONE 0xFC /* redzone for slab object */ +#define KASAN_KMALLOC_FREE 0xFB /* freed slab object */ +#define KASAN_VMALLOC_INVALID 0xF8 /* inaccessible space in vmap area */ #else #define KASAN_FREE_PAGE KASAN_TAG_INVALID #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_FREE KASAN_TAG_INVALID -#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only for SW_TAGS */ +#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only used for SW_TAGS */ #endif #ifdef CONFIG_KASAN_GENERIC -#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ +#define KASAN_KMALLOC_FREETRACK 0xFA /* freed slab object with free track */ #define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ -/* - * Stack redzone shadow values - * (Those are compiler's ABI, don't change them) - */ +/* Stack redzone shadow values. Compiler ABI, do not change. */ #define KASAN_STACK_LEFT 0xF1 #define KASAN_STACK_MID 0xF2 #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 -/* - * alloca redzone shadow values - */ +/* alloca redzone shadow values. */ #define KASAN_ALLOCA_LEFT 0xCA #define KASAN_ALLOCA_RIGHT 0xCB +/* alloca redzone size. Compiler ABI, do not change. */ #define KASAN_ALLOCA_REDZONE_SIZE 32 -/* - * Stack frame marker (compiler ABI). - */ +/* Stack frame marker. Compiler ABI, do not change. */ #define KASAN_CURRENT_STACK_FRAME_MAGIC 0x41B58AB3 -/* Don't break randconfig/all*config builds */ +/* Dummy value to avoid breaking randconfig/all*config builds. */ #ifndef KASAN_ABI_VERSION #define KASAN_ABI_VERSION 1 #endif @@ -141,21 +136,21 @@ struct kasan_report_info { unsigned long ip; }; -/* The layout of struct dictated by compiler */ +/* Do not change the struct layout: compiler ABI. */ struct kasan_source_location { const char *filename; int line_no; int column_no; }; -/* The layout of struct dictated by compiler */ +/* Do not change the struct layout: compiler ABI. */ struct kasan_global { const void *beg; /* Address of the beginning of the global variable. */ size_t size; /* Size of the global variable. 
*/ - size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */ + size_t size_with_redzone; /* Size of the variable + size of the redzone. 32 bytes aligned. */ const void *name; const void *module_name; /* Name of the module where the global variable is declared. */ - unsigned long has_dynamic_init; /* This needed for C++ */ + unsigned long has_dynamic_init; /* This is needed for C++. */ #if KASAN_ABI_VERSION >= 4 struct kasan_source_location *location; #endif @@ -164,9 +159,7 @@ struct kasan_global { #endif }; -/** - * Structures to keep alloc and free tracks * - */ +/* Structures for keeping alloc and free tracks. */ #define KASAN_STACK_DEPTH 64 @@ -183,11 +176,8 @@ struct kasan_track { struct kasan_alloc_meta { struct kasan_track alloc_track; + /* Generic mode stores free track in kasan_free_meta. */ #ifdef CONFIG_KASAN_GENERIC - /* - * The auxiliary stack is stored into struct kasan_alloc_meta. - * The free stack is stored into struct kasan_free_meta. - */ depot_stack_handle_t aux_stack[2]; #else struct kasan_track free_track[KASAN_NR_FREE_STACKS]; @@ -203,18 +193,18 @@ struct qlist_node { }; /* - * Generic mode either stores free meta in the object itself or in the redzone - * after the object. In the former case free meta offset is 0, in the latter - * case it has some sane value smaller than INT_MAX. Use INT_MAX as free meta - * offset when free meta isn't present. + * Free meta is stored either in the object itself or in the redzone after the + * object. In the former case, free meta offset is 0. In the latter case, the + * offset is between 0 and INT_MAX. INT_MAX marks that free meta is not present. */ #define KASAN_NO_FREE_META INT_MAX +/* + * Free meta is only used by Generic mode while the object is in quarantine. + * After that, slab allocator stores the freelist pointer in the object. + */ struct kasan_free_meta { #ifdef CONFIG_KASAN_GENERIC - /* This field is used while the object is in the quarantine. - * Otherwise it might be used for the allocator freelist. - */ struct qlist_node quarantine_link; struct kasan_track free_track; #endif @@ -417,9 +407,10 @@ static inline void kasan_unpoison(const void *addr, size_t size, bool init) return; /* * Explicitly initialize the memory with the precise object size to - * avoid overwriting the SLAB redzone. This disables initialization in - * the arch code and may thus lead to performance penalty. The penalty - * is accepted since SLAB redzones aren't enabled in production builds. + * avoid overwriting the slab redzone. This disables initialization in + * the arch code and may thus lead to performance penalty. This penalty + * does not affect production builds, as slab redzones are not enabled + * there. */ if (__slub_debug_enabled() && init && ((unsigned long)size & KASAN_GRANULE_MASK)) { @@ -503,8 +494,9 @@ void kasan_restore_multi_shot(bool enabled); /* * Exported functions for interfaces called from assembly or from generated - * code. Declarations here to avoid warning about missing declarations. + * code. Declared here to avoid warnings about missing declarations. 
*/ + asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); void __asan_register_globals(struct kasan_global *globals, size_t size); void __asan_unregister_globals(struct kasan_global *globals, size_t size); @@ -573,4 +565,4 @@ void __hwasan_storeN_noabort(unsigned long addr, size_t size); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size); -#endif +#endif /* __MM_KASAN_KASAN_H */ -- cgit v1.2.3 From 83f8e4a8b47012c299f59e301c64763727cc5f81 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 12 May 2022 20:23:08 -0700 Subject: kasan: use tabs to align shadow values Consistently use tabs instead of spaces to shadow value definitions. Link: https://lkml.kernel.org/r/00e7e66b5fc375d58200dc1489949b3edcd096b7.1652111464.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fed4f7a00d33..a60ed636e899 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -74,29 +74,29 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT) #ifdef CONFIG_KASAN_GENERIC -#define KASAN_FREE_PAGE 0xFF /* freed page */ -#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocation */ -#define KASAN_KMALLOC_REDZONE 0xFC /* redzone for slab object */ -#define KASAN_KMALLOC_FREE 0xFB /* freed slab object */ -#define KASAN_VMALLOC_INVALID 0xF8 /* inaccessible space in vmap area */ +#define KASAN_FREE_PAGE 0xFF /* freed page */ +#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocation */ +#define KASAN_KMALLOC_REDZONE 0xFC /* redzone for slab object */ +#define KASAN_KMALLOC_FREE 0xFB /* freed slab object */ +#define KASAN_VMALLOC_INVALID 0xF8 /* inaccessible space in vmap area */ #else -#define KASAN_FREE_PAGE KASAN_TAG_INVALID -#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID -#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID -#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID -#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only used for SW_TAGS */ +#define KASAN_FREE_PAGE KASAN_TAG_INVALID +#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID +#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID +#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID +#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only used for SW_TAGS */ #endif #ifdef CONFIG_KASAN_GENERIC -#define KASAN_KMALLOC_FREETRACK 0xFA /* freed slab object with free track */ -#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ +#define KASAN_KMALLOC_FREETRACK 0xFA /* freed slab object with free track */ +#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ /* Stack redzone shadow values. Compiler ABI, do not change. */ -#define KASAN_STACK_LEFT 0xF1 -#define KASAN_STACK_MID 0xF2 -#define KASAN_STACK_RIGHT 0xF3 -#define KASAN_STACK_PARTIAL 0xF4 +#define KASAN_STACK_LEFT 0xF1 +#define KASAN_STACK_MID 0xF2 +#define KASAN_STACK_RIGHT 0xF3 +#define KASAN_STACK_PARTIAL 0xF4 /* alloca redzone shadow values. */ #define KASAN_ALLOCA_LEFT 0xCA -- cgit v1.2.3 From 06bc4cf6cdde69079c82330aa8f0939d680b6c84 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 12 May 2022 20:23:08 -0700 Subject: kasan: give better names to shadow values Rename KASAN_KMALLOC_* shadow values to KASAN_SLAB_*, as they are used for all slab allocations, not only for kmalloc. 
Also rename KASAN_FREE_PAGE to KASAN_PAGE_FREE to be consistent with KASAN_PAGE_REDZONE and KASAN_SLAB_FREE. Link: https://lkml.kernel.org/r/bebcaf4eafdb0cabae0401a69c0af956aa87fcaa.1652111464.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/kasan/common.c | 12 ++++++------ mm/kasan/generic.c | 6 +++--- mm/kasan/kasan.h | 14 +++++++------- mm/kasan/quarantine.c | 2 +- mm/kasan/report_generic.c | 8 ++++---- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d9079ec11f31..c40c0e7b3b5f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -117,7 +117,7 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init) { if (likely(!PageHighMem(page))) kasan_poison(page_address(page), PAGE_SIZE << order, - KASAN_FREE_PAGE, init); + KASAN_PAGE_FREE, init); } /* @@ -254,7 +254,7 @@ void __kasan_poison_slab(struct slab *slab) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); kasan_poison(page_address(page), page_size(page), - KASAN_KMALLOC_REDZONE, false); + KASAN_SLAB_REDZONE, false); } void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) @@ -265,7 +265,7 @@ void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_REDZONE, false); + KASAN_SLAB_REDZONE, false); } /* @@ -357,7 +357,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, } kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_FREE, init); + KASAN_SLAB_FREE, init); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; @@ -414,7 +414,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) if (unlikely(!folio_test_slab(folio))) { if (____kasan_kfree_large(ptr, ip)) return; - kasan_poison(ptr, folio_size(folio), KASAN_FREE_PAGE, false); + kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false); } else { struct slab *slab = folio_slab(folio); @@ -505,7 +505,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, redzone_end = round_up((unsigned long)(object + cache->object_size), KASAN_GRANULE_SIZE); kasan_poison((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE, false); + KASAN_SLAB_REDZONE, false); /* * Save alloc info (if possible) for kmalloc() allocations. diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index a25ad4090615..437fcc7e77cf 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -369,14 +369,14 @@ void kasan_set_free_info(struct kmem_cache *cache, kasan_set_track(&free_meta->free_track, GFP_NOWAIT); /* The object was freed and has free track set. */ - *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK; + *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; } struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag) { - if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK) + if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) return NULL; - /* Free meta must be present with KASAN_KMALLOC_FREETRACK. */ + /* Free meta must be present with KASAN_SLAB_FREETRACK. 
*/ return &kasan_get_free_meta(cache, object)->free_track; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index a60ed636e899..610d60d6e5b8 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -74,22 +74,22 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT) #ifdef CONFIG_KASAN_GENERIC -#define KASAN_FREE_PAGE 0xFF /* freed page */ +#define KASAN_PAGE_FREE 0xFF /* freed page */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocation */ -#define KASAN_KMALLOC_REDZONE 0xFC /* redzone for slab object */ -#define KASAN_KMALLOC_FREE 0xFB /* freed slab object */ +#define KASAN_SLAB_REDZONE 0xFC /* redzone for slab object */ +#define KASAN_SLAB_FREE 0xFB /* freed slab object */ #define KASAN_VMALLOC_INVALID 0xF8 /* inaccessible space in vmap area */ #else -#define KASAN_FREE_PAGE KASAN_TAG_INVALID +#define KASAN_PAGE_FREE KASAN_TAG_INVALID #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID -#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID -#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID +#define KASAN_SLAB_REDZONE KASAN_TAG_INVALID +#define KASAN_SLAB_FREE KASAN_TAG_INVALID #define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only used for SW_TAGS */ #endif #ifdef CONFIG_KASAN_GENERIC -#define KASAN_KMALLOC_FREETRACK 0xFA /* freed slab object with free track */ +#define KASAN_SLAB_FREETRACK 0xFA /* freed slab object with free track */ #define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ /* Stack redzone shadow values. Compiler ABI, do not change. */ diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 8ef098f52622..75585077eb6d 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -163,7 +163,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) * As the object now gets freed from the quarantine, assume that its * free track is no longer valid. */ - *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE; + *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE; ___cache_free(cache, object, _THIS_IP_); diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index efc5e79a103f..6689fb9a919b 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -66,7 +66,7 @@ static const char *get_shadow_bug_type(struct kasan_report_info *info) bug_type = "out-of-bounds"; break; case KASAN_PAGE_REDZONE: - case KASAN_KMALLOC_REDZONE: + case KASAN_SLAB_REDZONE: bug_type = "slab-out-of-bounds"; break; case KASAN_GLOBAL_REDZONE: @@ -78,9 +78,9 @@ static const char *get_shadow_bug_type(struct kasan_report_info *info) case KASAN_STACK_PARTIAL: bug_type = "stack-out-of-bounds"; break; - case KASAN_FREE_PAGE: - case KASAN_KMALLOC_FREE: - case KASAN_KMALLOC_FREETRACK: + case KASAN_PAGE_FREE: + case KASAN_SLAB_FREE: + case KASAN_SLAB_FREETRACK: bug_type = "use-after-free"; break; case KASAN_ALLOCA_LEFT: -- cgit v1.2.3 From c2ec0c8f687711e671d865b0a36344440c1e8b1f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 12 May 2022 20:23:09 -0700 Subject: kasan: update documentation Do assorted clean-ups and improvements to KASAN documentation, including: - Describe each mode in a dedicated paragraph. - Split out a Support section that describes in details which compilers, architectures and memory types each mode requires/supports. - Capitalize the first letter in the names of each KASAN mode. 
[andreyknvl@google.com: rewording, per Marco] Link: https://lkml.kernel.org/r/896b2d914d6b50d677fd7b38f76967cc705c01ba.1652203271.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/5bd58ebebf066593ce0e1d265d60278b5f5a1874.1652123204.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 150 +++++++++++++++++++++++--------------- 1 file changed, 90 insertions(+), 60 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 7614a1fc30fa..2ed0b77d1db6 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -4,39 +4,76 @@ The Kernel Address Sanitizer (KASAN) Overview -------- -KernelAddressSANitizer (KASAN) is a dynamic memory safety error detector -designed to find out-of-bound and use-after-free bugs. KASAN has three modes: +Kernel Address Sanitizer (KASAN) is a dynamic memory safety error detector +designed to find out-of-bounds and use-after-free bugs. -1. generic KASAN (similar to userspace ASan), -2. software tag-based KASAN (similar to userspace HWASan), -3. hardware tag-based KASAN (based on hardware memory tagging). +KASAN has three modes: -Generic KASAN is mainly used for debugging due to a large memory overhead. -Software tag-based KASAN can be used for dogfood testing as it has a lower -memory overhead that allows using it with real workloads. Hardware tag-based -KASAN comes with low memory and performance overheads and, therefore, can be -used in production. Either as an in-field memory bug detector or as a security -mitigation. +1. Generic KASAN +2. Software Tag-Based KASAN +3. Hardware Tag-Based KASAN -Software KASAN modes (#1 and #2) use compile-time instrumentation to insert -validity checks before every memory access and, therefore, require a compiler -version that supports that. +Generic KASAN, enabled with CONFIG_KASAN_GENERIC, is the mode intended for +debugging, similar to userspace ASan. This mode is supported on many CPU +architectures, but it has significant performance and memory overheads. -Generic KASAN is supported in GCC and Clang. With GCC, it requires version -8.3.0 or later. Any supported Clang version is compatible, but detection of -out-of-bounds accesses for global variables is only supported since Clang 11. +Software Tag-Based KASAN or SW_TAGS KASAN, enabled with CONFIG_KASAN_SW_TAGS, +can be used for both debugging and dogfood testing, similar to userspace HWASan. +This mode is only supported for arm64, but its moderate memory overhead allows +using it for testing on memory-restricted devices with real workloads. -Software tag-based KASAN mode is only supported in Clang. +Hardware Tag-Based KASAN or HW_TAGS KASAN, enabled with CONFIG_KASAN_HW_TAGS, +is the mode intended to be used as an in-field memory bug detector or as a +security mitigation. This mode only works on arm64 CPUs that support MTE +(Memory Tagging Extension), but it has low memory and performance overheads and +thus can be used in production. -The hardware KASAN mode (#3) relies on hardware to perform the checks but -still requires a compiler version that supports memory tagging instructions. -This mode is supported in GCC 10+ and Clang 12+. +For details about the memory and performance impact of each KASAN mode, see the +descriptions of the corresponding Kconfig options. 
-Both software KASAN modes work with SLUB and SLAB memory allocators, -while the hardware tag-based KASAN currently only supports SLUB. +The Generic and the Software Tag-Based modes are commonly referred to as the +software modes. The Software Tag-Based and the Hardware Tag-Based modes are +referred to as the tag-based modes. -Currently, generic KASAN is supported for the x86_64, arm, arm64, xtensa, s390, -and riscv architectures, and tag-based KASAN modes are supported only for arm64. +Support +------- + +Architectures +~~~~~~~~~~~~~ + +Generic KASAN is supported on x86_64, arm, arm64, powerpc, riscv, s390, and +xtensa, and the tag-based KASAN modes are supported only on arm64. + +Compilers +~~~~~~~~~ + +Software KASAN modes use compile-time instrumentation to insert validity checks +before every memory access and thus require a compiler version that provides +support for that. The Hardware Tag-Based mode relies on hardware to perform +these checks but still requires a compiler version that supports the memory +tagging instructions. + +Generic KASAN requires GCC version 8.3.0 or later +or any Clang version supported by the kernel. + +Software Tag-Based KASAN requires GCC 11+ +or any Clang version supported by the kernel. + +Hardware Tag-Based KASAN requires GCC 10+ or Clang 12+. + +Memory types +~~~~~~~~~~~~ + +Generic KASAN supports finding bugs in all of slab, page_alloc, vmap, vmalloc, +stack, and global memory. + +Software Tag-Based KASAN supports slab, page_alloc, vmalloc, and stack memory. + +Hardware Tag-Based KASAN supports slab, page_alloc, and non-executable vmalloc +memory. + +For slab, both software KASAN modes support SLUB and SLAB allocators, while +Hardware Tag-Based KASAN only supports SLUB. Usage ----- @@ -45,13 +82,13 @@ To enable KASAN, configure the kernel with:: CONFIG_KASAN=y -and choose between ``CONFIG_KASAN_GENERIC`` (to enable generic KASAN), -``CONFIG_KASAN_SW_TAGS`` (to enable software tag-based KASAN), and -``CONFIG_KASAN_HW_TAGS`` (to enable hardware tag-based KASAN). +and choose between ``CONFIG_KASAN_GENERIC`` (to enable Generic KASAN), +``CONFIG_KASAN_SW_TAGS`` (to enable Software Tag-Based KASAN), and +``CONFIG_KASAN_HW_TAGS`` (to enable Hardware Tag-Based KASAN). -For software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and +For the software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and ``CONFIG_KASAN_INLINE``. Outline and inline are compiler instrumentation types. -The former produces a smaller binary while the latter is 1.1-2 times faster. +The former produces a smaller binary while the latter is up to 2 times faster. To include alloc and free stack traces of affected slab objects into reports, enable ``CONFIG_STACKTRACE``. To include alloc and free stack traces of affected @@ -146,7 +183,7 @@ is either 8 or 16 aligned bytes depending on KASAN mode. Each number in the memory state section of the report shows the state of one of the memory granules that surround the accessed address. -For generic KASAN, the size of each memory granule is 8. The state of each +For Generic KASAN, the size of each memory granule is 8. The state of each granule is encoded in one shadow byte. Those 8 bytes can be accessible, partially accessible, freed, or be a part of a redzone. KASAN uses the following encoding for each shadow byte: 00 means that all 8 bytes of the corresponding @@ -181,14 +218,14 @@ By default, KASAN prints a bug report only for the first invalid memory access. With ``kasan_multi_shot``, KASAN prints a report on every invalid access. 
This effectively disables ``panic_on_warn`` for KASAN reports. -Alternatively, independent of ``panic_on_warn`` the ``kasan.fault=`` boot +Alternatively, independent of ``panic_on_warn``, the ``kasan.fault=`` boot parameter can be used to control panic and reporting behaviour: - ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN report or also panic the kernel (default: ``report``). The panic happens even if ``kasan_multi_shot`` is enabled. -Hardware tag-based KASAN mode (see the section about various modes below) is +Hardware Tag-Based KASAN mode (see the section about various modes below) is intended for use in production as a security mitigation. Therefore, it supports additional boot parameters that allow disabling KASAN or controlling features: @@ -250,49 +287,46 @@ outline-instrumented kernel. Generic KASAN is the only mode that delays the reuse of freed objects via quarantine (see mm/kasan/quarantine.c for implementation). -Software tag-based KASAN +Software Tag-Based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ -Software tag-based KASAN uses a software memory tagging approach to checking +Software Tag-Based KASAN uses a software memory tagging approach to checking access validity. It is currently only implemented for the arm64 architecture. -Software tag-based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs +Software Tag-Based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs to store a pointer tag in the top byte of kernel pointers. It uses shadow memory to store memory tags associated with each 16-byte memory cell (therefore, it dedicates 1/16th of the kernel memory for shadow memory). -On each memory allocation, software tag-based KASAN generates a random tag, tags +On each memory allocation, Software Tag-Based KASAN generates a random tag, tags the allocated memory with this tag, and embeds the same tag into the returned pointer. -Software tag-based KASAN uses compile-time instrumentation to insert checks +Software Tag-Based KASAN uses compile-time instrumentation to insert checks before each memory access. These checks make sure that the tag of the memory that is being accessed is equal to the tag of the pointer that is used to access -this memory. In case of a tag mismatch, software tag-based KASAN prints a bug +this memory. In case of a tag mismatch, Software Tag-Based KASAN prints a bug report. -Software tag-based KASAN also has two instrumentation modes (outline, which +Software Tag-Based KASAN also has two instrumentation modes (outline, which emits callbacks to check memory accesses; and inline, which performs the shadow memory checks inline). With outline instrumentation mode, a bug report is printed from the function that performs the access check. With inline instrumentation, a ``brk`` instruction is emitted by the compiler, and a dedicated ``brk`` handler is used to print bug reports. -Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through +Software Tag-Based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Software tag-based KASAN currently only supports tagging of slab, page_alloc, -and vmalloc memory. 
- -Hardware tag-based KASAN +Hardware Tag-Based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ -Hardware tag-based KASAN is similar to the software mode in concept but uses +Hardware Tag-Based KASAN is similar to the software mode in concept but uses hardware memory tagging support instead of compiler instrumentation and shadow memory. -Hardware tag-based KASAN is currently only implemented for arm64 architecture +Hardware Tag-Based KASAN is currently only implemented for arm64 architecture and based on both arm64 Memory Tagging Extension (MTE) introduced in ARMv8.5 Instruction Set Architecture and Top Byte Ignore (TBI). @@ -302,21 +336,18 @@ access, hardware makes sure that the tag of the memory that is being accessed is equal to the tag of the pointer that is used to access this memory. In case of a tag mismatch, a fault is generated, and a report is printed. -Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through +Hardware Tag-Based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Hardware tag-based KASAN currently only supports tagging of slab, page_alloc, -and VM_ALLOC-based vmalloc memory. - -If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN +If the hardware does not support MTE (pre ARMv8.5), Hardware Tag-Based KASAN will not be enabled. In this case, all KASAN boot parameters are ignored. Note that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being enabled. Even when ``kasan.mode=off`` is provided or when the hardware does not support MTE (but supports TBI). -Hardware tag-based KASAN only reports the first found bug. After that, MTE tag +Hardware Tag-Based KASAN only reports the first found bug. After that, MTE tag checking gets disabled. Shadow memory @@ -414,19 +445,18 @@ generic ``noinstr`` one. Note that disabling compiler instrumentation (either on a per-file or a per-function basis) makes KASAN ignore the accesses that happen directly in that code for software KASAN modes. It does not help when the accesses happen -indirectly (through calls to instrumented functions) or with the hardware -tag-based mode that does not use compiler instrumentation. +indirectly (through calls to instrumented functions) or with Hardware +Tag-Based KASAN, which does not use compiler instrumentation. For software KASAN modes, to disable KASAN reports in a part of the kernel code for the current task, annotate this part of the code with a ``kasan_disable_current()``/``kasan_enable_current()`` section. This also disables the reports for indirect accesses that happen through function calls. -For tag-based KASAN modes (include the hardware one), to disable access -checking, use ``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. Note that -temporarily disabling access checking via ``page_kasan_tag_reset()`` requires -saving and restoring the per-page KASAN tag via -``page_kasan_tag``/``page_kasan_tag_set``. +For tag-based KASAN modes, to disable access checking, use +``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. Note that temporarily +disabling access checking via ``page_kasan_tag_reset()`` requires saving and +restoring the per-page KASAN tag via ``page_kasan_tag``/``page_kasan_tag_set``. 
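As a minimal illustrative sketch (not part of this patch; ``do_unchecked_read()`` is a made-up helper), a single access that the tag-based modes will not check could look like::

	#include <linux/kasan.h>

	static u8 do_unchecked_read(const u8 *p)
	{
		/* kasan_reset_tag() returns the address with the match-all tag set. */
		const u8 *untagged = kasan_reset_tag(p);

		/* This dereference is not tag-checked by the tag-based KASAN modes. */
		return *untagged;
	}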
Tests ~~~~~ -- cgit v1.2.3 From ca89f2a2e66d0bc0a5929dafaf6270d35374c7a6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 12 May 2022 20:23:09 -0700 Subject: kasan: move boot parameters section in documentation Move the "Boot parameters" section in KASAN documentation next to the section that describes KASAN build options. No content changes. Link: https://lkml.kernel.org/r/870628e1293b4f44edf7cbcb92374ff9eb7503d7.1652203271.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/ec9c923f35e7c5312836c4624a7f317dc1ee2c1c.1652123204.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 82 +++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 2ed0b77d1db6..1772fd457fed 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -94,6 +94,47 @@ To include alloc and free stack traces of affected slab objects into reports, enable ``CONFIG_STACKTRACE``. To include alloc and free stack traces of affected physical pages, enable ``CONFIG_PAGE_OWNER`` and boot with ``page_owner=on``. +Boot parameters +~~~~~~~~~~~~~~~ + +KASAN is affected by the generic ``panic_on_warn`` command line parameter. +When it is enabled, KASAN panics the kernel after printing a bug report. + +By default, KASAN prints a bug report only for the first invalid memory access. +With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This +effectively disables ``panic_on_warn`` for KASAN reports. + +Alternatively, independent of ``panic_on_warn``, the ``kasan.fault=`` boot +parameter can be used to control panic and reporting behaviour: + +- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN + report or also panic the kernel (default: ``report``). The panic happens even + if ``kasan_multi_shot`` is enabled. + +Hardware Tag-Based KASAN mode (see the section about various modes below) is +intended for use in production as a security mitigation. Therefore, it supports +additional boot parameters that allow disabling KASAN or controlling features: + +- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). + +- ``kasan.mode=sync``, ``=async`` or ``=asymm`` controls whether KASAN + is configured in synchronous, asynchronous or asymmetric mode of + execution (default: ``sync``). + Synchronous mode: a bad access is detected immediately when a tag + check fault occurs. + Asynchronous mode: a bad access detection is delayed. When a tag check + fault occurs, the information is stored in hardware (in the TFSR_EL1 + register for arm64). The kernel periodically checks the hardware and + only reports tag faults during these checks. + Asymmetric mode: a bad access is detected synchronously on reads and + asynchronously on writes. + +- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc + allocations (default: ``on``). + +- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack + traces collection (default: ``on``). + Error reports ~~~~~~~~~~~~~ @@ -208,47 +249,6 @@ traces point to places in code that interacted with the object but that are not directly present in the bad access stack trace. Currently, this includes call_rcu() and workqueue queuing. 
-Boot parameters -~~~~~~~~~~~~~~~ - -KASAN is affected by the generic ``panic_on_warn`` command line parameter. -When it is enabled, KASAN panics the kernel after printing a bug report. - -By default, KASAN prints a bug report only for the first invalid memory access. -With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This -effectively disables ``panic_on_warn`` for KASAN reports. - -Alternatively, independent of ``panic_on_warn``, the ``kasan.fault=`` boot -parameter can be used to control panic and reporting behaviour: - -- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN - report or also panic the kernel (default: ``report``). The panic happens even - if ``kasan_multi_shot`` is enabled. - -Hardware Tag-Based KASAN mode (see the section about various modes below) is -intended for use in production as a security mitigation. Therefore, it supports -additional boot parameters that allow disabling KASAN or controlling features: - -- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). - -- ``kasan.mode=sync``, ``=async`` or ``=asymm`` controls whether KASAN - is configured in synchronous, asynchronous or asymmetric mode of - execution (default: ``sync``). - Synchronous mode: a bad access is detected immediately when a tag - check fault occurs. - Asynchronous mode: a bad access detection is delayed. When a tag check - fault occurs, the information is stored in hardware (in the TFSR_EL1 - register for arm64). The kernel periodically checks the hardware and - only reports tag faults during these checks. - Asymmetric mode: a bad access is detected synchronously on reads and - asynchronously on writes. - -- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc - allocations (default: ``on``). - -- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack - traces collection (default: ``on``). - Implementation details ---------------------- -- cgit v1.2.3 From fe30ddca9f18fb56bf1dd161ffd9d8a04bba5713 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 12 May 2022 20:23:09 -0700 Subject: kasan: clean-up kconfig options descriptions Various readability clean-ups of KASAN Kconfig options. No functional changes. Link: https://lkml.kernel.org/r/c160840dd9e4b1ad5529ecfdb0bba35d9a14d826.1652203271.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/47afaecec29221347bee49f58c258ac1ced3b429.1652123204.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton --- lib/Kconfig.kasan | 168 ++++++++++++++++++++++++++---------------------------- 1 file changed, 82 insertions(+), 86 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 1f3e620188a2..f0973da583e0 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only + # This config refers to the generic KASAN mode. config HAVE_ARCH_KASAN bool @@ -15,9 +16,8 @@ config HAVE_ARCH_KASAN_VMALLOC config ARCH_DISABLE_KASAN_INLINE bool help - An architecture might not support inline instrumentation. - When this option is selected, inline and stack instrumentation are - disabled. + Disables both inline and stack instrumentation. Selected by + architectures that do not support these instrumentation types. 
config CC_HAS_KASAN_GENERIC def_bool $(cc-option, -fsanitize=kernel-address) @@ -26,13 +26,13 @@ config CC_HAS_KASAN_SW_TAGS def_bool $(cc-option, -fsanitize=kernel-hwaddress) # This option is only required for software KASAN modes. -# Old GCC versions don't have proper support for no_sanitize_address. +# Old GCC versions do not have proper support for no_sanitize_address. # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89124 for details. config CC_HAS_WORKING_NOSANITIZE_ADDRESS def_bool !CC_IS_GCC || GCC_VERSION >= 80300 menuconfig KASAN - bool "KASAN: runtime memory debugger" + bool "KASAN: dynamic memory safety error detector" depends on (((HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC) || \ (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)) && \ CC_HAS_WORKING_NOSANITIZE_ADDRESS) || \ @@ -40,10 +40,13 @@ menuconfig KASAN depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) select STACKDEPOT_ALWAYS_INIT help - Enables KASAN (KernelAddressSANitizer) - runtime memory debugger, - designed to find out-of-bounds accesses and use-after-free bugs. + Enables KASAN (Kernel Address Sanitizer) - a dynamic memory safety + error detector designed to find out-of-bounds and use-after-free bugs. + See Documentation/dev-tools/kasan.rst for details. + For better error reports, also enable CONFIG_STACKTRACE. + if KASAN choice @@ -51,75 +54,71 @@ choice default KASAN_GENERIC help KASAN has three modes: - 1. generic KASAN (similar to userspace ASan, - x86_64/arm64/xtensa, enabled with CONFIG_KASAN_GENERIC), - 2. software tag-based KASAN (arm64 only, based on software - memory tagging (similar to userspace HWASan), enabled with - CONFIG_KASAN_SW_TAGS), and - 3. hardware tag-based KASAN (arm64 only, based on hardware - memory tagging, enabled with CONFIG_KASAN_HW_TAGS). - All KASAN modes are strictly debugging features. + 1. Generic KASAN (supported by many architectures, enabled with + CONFIG_KASAN_GENERIC, similar to userspace ASan), + 2. Software Tag-Based KASAN (arm64 only, based on software memory + tagging, enabled with CONFIG_KASAN_SW_TAGS, similar to userspace + HWASan), and + 3. Hardware Tag-Based KASAN (arm64 only, based on hardware memory + tagging, enabled with CONFIG_KASAN_HW_TAGS). - For better error reports enable CONFIG_STACKTRACE. + See Documentation/dev-tools/kasan.rst for details about each mode. config KASAN_GENERIC - bool "Generic mode" + bool "Generic KASAN" depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS select SLUB_DEBUG if SLUB select CONSTRUCTORS help - Enables generic KASAN mode. + Enables Generic KASAN. - This mode is supported in both GCC and Clang. With GCC it requires - version 8.3.0 or later. Any supported Clang version is compatible, - but detection of out-of-bounds accesses for global variables is - supported only since Clang 11. + Requires GCC 8.3.0+ or Clang. - This mode consumes about 1/8th of available memory at kernel start - and introduces an overhead of ~x1.5 for the rest of the allocations. + Consumes about 1/8th of available memory at kernel start and adds an + overhead of ~50% for dynamic allocations. The performance slowdown is ~x3. - Currently CONFIG_KASAN_GENERIC doesn't work with CONFIG_DEBUG_SLAB - (the resulting kernel does not boot). + (Incompatible with CONFIG_DEBUG_SLAB: the kernel does not boot.) 
config KASAN_SW_TAGS - bool "Software tag-based mode" + bool "Software Tag-Based KASAN" depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS select SLUB_DEBUG if SLUB select CONSTRUCTORS help - Enables software tag-based KASAN mode. + Enables Software Tag-Based KASAN. - This mode require software memory tagging support in the form of - HWASan-like compiler instrumentation. + Requires GCC 11+ or Clang. - Currently this mode is only implemented for arm64 CPUs and relies on - Top Byte Ignore. This mode requires Clang. + Supported only on arm64 CPUs and relies on Top Byte Ignore. - This mode consumes about 1/16th of available memory at kernel start - and introduces an overhead of ~20% for the rest of the allocations. - This mode may potentially introduce problems relating to pointer - casting and comparison, as it embeds tags into the top byte of each - pointer. + Consumes about 1/16th of available memory at kernel start and + add an overhead of ~20% for dynamic allocations. - Currently CONFIG_KASAN_SW_TAGS doesn't work with CONFIG_DEBUG_SLAB - (the resulting kernel does not boot). + May potentially introduce problems related to pointer casting and + comparison, as it embeds a tag into the top byte of each pointer. + + (Incompatible with CONFIG_DEBUG_SLAB: the kernel does not boot.) config KASAN_HW_TAGS - bool "Hardware tag-based mode" + bool "Hardware Tag-Based KASAN" depends on HAVE_ARCH_KASAN_HW_TAGS depends on SLUB help - Enables hardware tag-based KASAN mode. + Enables Hardware Tag-Based KASAN. + + Requires GCC 10+ or Clang 12+. - This mode requires hardware memory tagging support, and can be used - by any architecture that provides it. + Supported only on arm64 CPUs starting from ARMv8.5 and relies on + Memory Tagging Extension and Top Byte Ignore. - Currently this mode is only implemented for arm64 CPUs starting from - ARMv8.5 and relies on Memory Tagging Extension and Top Byte Ignore. + Consumes about 1/32nd of available memory. + + May potentially introduce problems related to pointer casting and + comparison, as it embeds a tag into the top byte of each pointer. endchoice @@ -131,83 +130,80 @@ choice config KASAN_OUTLINE bool "Outline instrumentation" help - Before every memory access compiler insert function call - __asan_load*/__asan_store*. These functions performs check - of shadow memory. This is slower than inline instrumentation, - however it doesn't bloat size of kernel's .text section so - much as inline does. + Makes the compiler insert function calls that check whether the memory + is accessible before each memory access. Slower than KASAN_INLINE, but + does not bloat the size of the kernel's .text section so much. config KASAN_INLINE bool "Inline instrumentation" depends on !ARCH_DISABLE_KASAN_INLINE help - Compiler directly inserts code checking shadow memory before - memory accesses. This is faster than outline (in some workloads - it gives about x2 boost over outline instrumentation), but - make kernel's .text size much bigger. + Makes the compiler directly insert memory accessibility checks before + each memory access. Faster than KASAN_OUTLINE (gives ~x2 boost for + some workloads), but makes the kernel's .text size much bigger. 
endchoice config KASAN_STACK - bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && !COMPILE_TEST + bool "Stack instrumentation (unsafe)" if CC_IS_CLANG && !COMPILE_TEST depends on KASAN_GENERIC || KASAN_SW_TAGS depends on !ARCH_DISABLE_KASAN_INLINE default y if CC_IS_GCC help - The LLVM stack address sanitizer has a know problem that - causes excessive stack usage in a lot of functions, see - https://bugs.llvm.org/show_bug.cgi?id=38809 - Disabling asan-stack makes it safe to run kernels build - with clang-8 with KASAN enabled, though it loses some of - the functionality. - This feature is always disabled when compile-testing with clang - to avoid cluttering the output in stack overflow warnings, - but clang users can still enable it for builds without - CONFIG_COMPILE_TEST. On gcc it is assumed to always be safe - to use and enabled by default. - If the architecture disables inline instrumentation, stack - instrumentation is also disabled as it adds inline-style - instrumentation that is run unconditionally. + Disables stack instrumentation and thus KASAN's ability to detect + out-of-bounds bugs in stack variables. + + With Clang, stack instrumentation has a problem that causes excessive + stack usage, see https://bugs.llvm.org/show_bug.cgi?id=38809. Thus, + with Clang, this option is deemed unsafe. + + This option is always disabled when compile-testing with Clang to + avoid cluttering the log with stack overflow warnings. + + With GCC, enabling stack instrumentation is assumed to be safe. + + If the architecture disables inline instrumentation via + ARCH_DISABLE_KASAN_INLINE, stack instrumentation gets disabled + as well, as it adds inline-style instrumentation that is run + unconditionally. config KASAN_TAGS_IDENTIFY - bool "Enable memory corruption identification" + bool "Memory corruption type identification" depends on KASAN_SW_TAGS || KASAN_HW_TAGS help - This option enables best-effort identification of bug type - (use-after-free or out-of-bounds) at the cost of increased - memory consumption. + Enables best-effort identification of the bug types (use-after-free + or out-of-bounds) at the cost of increased memory consumption. + Only applicable for the tag-based KASAN modes. config KASAN_VMALLOC bool "Check accesses to vmalloc allocations" depends on HAVE_ARCH_KASAN_VMALLOC help - This mode makes KASAN check accesses to vmalloc allocations for - validity. + Makes KASAN check the validity of accesses to vmalloc allocations. - With software KASAN modes, checking is done for all types of vmalloc - allocations. Enabling this option leads to higher memory usage. + With software KASAN modes, all types vmalloc allocations are + checked. Enabling this option leads to higher memory usage. - With hardware tag-based KASAN, only VM_ALLOC mappings are checked. - There is no additional memory usage. + With Hardware Tag-Based KASAN, only non-executable VM_ALLOC mappings + are checked. There is no additional memory usage. config KASAN_KUNIT_TEST tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS depends on KASAN && KUNIT default KUNIT_ALL_TESTS help - This is a KUnit test suite doing various nasty things like - out of bounds and use after free accesses. It is useful for testing - kernel debugging features like KASAN. + A KUnit-based KASAN test suite. Triggers different kinds of + out-of-bounds and use-after-free accesses. Useful for testing whether + KASAN can detect certain bug types. 
For more information on KUnit and unit tests in general, please refer - to the KUnit documentation in Documentation/dev-tools/kunit. + to the KUnit documentation in Documentation/dev-tools/kunit/. config KASAN_MODULE_TEST tristate "KUnit-incompatible tests of KASAN bug detection capabilities" depends on m && KASAN && !KASAN_HW_TAGS help - This is a part of the KASAN test suite that is incompatible with - KUnit. Currently includes tests that do bad copy_from/to_user - accesses. + A part of the KASAN test suite that is not integrated with KUnit. + Incompatible with Hardware Tag-Based KASAN. endif # KASAN -- cgit v1.2.3 From cd8c1fd8cdd14158f2d8bea2d1bfe8015dccfa3a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 May 2022 20:23:09 -0700 Subject: mm/page_owner: use strscpy() instead of strlcpy() current->comm[] is not a string (no guarantee for a zero byte in it). strlcpy(s1, s2, l) is calling strlen(s2), potentially causing out-of-bound access, as reported by syzbot: detected buffer overflow in __fortify_strlen ------------[ cut here ]------------ kernel BUG at lib/string_helpers.c:980! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 4087 Comm: dhcpcd-run-hooks Not tainted 5.18.0-rc3-syzkaller-01537-g20b87e7c29df #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:fortify_panic+0x18/0x1a lib/string_helpers.c:980 Code: 8c e8 c5 ba e1 fa e9 23 0f bf fa e8 0b 5d 8c f8 eb db 55 48 89 fd e8 e0 49 40 f8 48 89 ee 48 c7 c7 80 f5 26 8a e8 99 09 f1 ff <0f> 0b e8 ca 49 40 f8 48 8b 54 24 18 4c 89 f1 48 c7 c7 00 00 27 8a RSP: 0018:ffffc900000074a8 EFLAGS: 00010286 RAX: 000000000000002c RBX: ffff88801226b728 RCX: 0000000000000000 RDX: ffff8880198e0000 RSI: ffffffff81600458 RDI: fffff52000000e87 RBP: ffffffff89da2aa0 R08: 000000000000002c R09: 0000000000000000 R10: ffffffff815fae2e R11: 0000000000000000 R12: ffff88801226b700 R13: ffff8880198e0830 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f5876ad6ff8 CR3: 000000001a48c000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 Call Trace: __fortify_strlen include/linux/fortify-string.h:128 [inline] strlcpy include/linux/fortify-string.h:143 [inline] __set_page_owner_handle+0x2b1/0x3e0 mm/page_owner.c:171 __set_page_owner+0x3e/0x50 mm/page_owner.c:190 prep_new_page mm/page_alloc.c:2441 [inline] get_page_from_freelist+0xba2/0x3e00 mm/page_alloc.c:4182 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5408 alloc_pages+0x1aa/0x310 mm/mempolicy.c:2272 alloc_slab_page mm/slub.c:1799 [inline] allocate_slab+0x26c/0x3c0 mm/slub.c:1944 new_slab mm/slub.c:2004 [inline] ___slab_alloc+0x8df/0xf20 mm/slub.c:3005 __slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3092 slab_alloc_node mm/slub.c:3183 [inline] slab_alloc mm/slub.c:3225 [inline] __kmem_cache_alloc_lru mm/slub.c:3232 [inline] kmem_cache_alloc+0x360/0x3b0 mm/slub.c:3242 dst_alloc+0x146/0x1f0 net/core/dst.c:92 Link: https://lkml.kernel.org/r/20220509145949.265184-1-eric.dumazet@gmail.com Fixes: 865ed6a32786 ("mm/page_owner: record task command name") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Waiman Long Acked-by: Shakeel Butt Cc: Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index fb3a05fdebdb..19bc559e4904 
100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -168,7 +168,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, page_owner->pid = current->pid; page_owner->tgid = current->tgid; page_owner->ts_nsec = local_clock(); - strlcpy(page_owner->comm, current->comm, + strscpy(page_owner->comm, current->comm, sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); -- cgit v1.2.3 From 60f272f6b09a8f14156df88cccd21447ab394452 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 12 May 2022 20:23:09 -0700 Subject: mm/memory-failure.c: move clear_hwpoisoned_pages Patch series "memory-failure: fix hwpoison_filter", v2. As is well known, the memory failure mechanism handles memory corruption events and tries to send SIGBUS to the user process which uses the corrupted page. For the virtualization case, QEMU catches SIGBUS and tries to inject an MCE into the guest, and the guest handles the memory failure again. Thus the guest gets the minimal effect from hardware memory corruption. The further steps I'm working on: 1, try to modify code to decrease poisoned pages in a single place (mm/memory-failure.c: simplify num_poisoned_pages_dec in this series). 2, try to use page_handle_poison() to handle SetPageHWPoison() and num_poisoned_pages_inc() together. It would be best to call num_poisoned_pages_inc() in a single place too. 3, introduce a memory failure notifier list in memory-failure.c: notify the corrupted PFN to someone who registers this list (a rough sketch follows below). If I can complete parts [1] and [2], [3] will be quite easy (just call the notifier list after increasing the poisoned page count). 4, introduce a memory recover VQ for the memory balloon device, and register the memory failure notifier list. While the guest kernel handles a memory failure, the balloon device gets notified by the memory failure notifier list and tells the host to recover the corrupted PFN (GPA) by the new VQ. 5, the host side remaps the corrupted page (HVA) and tells the guest side to unpoison the PFN (GPA). Then the guest fixes the corrupted page (GPA) dynamically. This patch (of 5): clear_hwpoisoned_pages() clears the HWPoison flag and decreases the number of poisoned pages; this actually works as part of memory failure handling. Move this function from sparse.c to memory-failure.c; after that, there is no CONFIG_MEMORY_FAILURE left in sparse.c.
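As a rough sketch only (nothing below exists in the kernel today; mf_notifier_list and balloon_mf_notify are made-up names), a consumer of the notifier list from step 3 could be built on the standard blocking notifier API:

	#include <linux/notifier.h>
	#include <linux/printk.h>

	/* Hypothetical: a notifier head that memory-failure.c could export. */
	static BLOCKING_NOTIFIER_HEAD(mf_notifier_list);

	/* Hypothetical consumer: e.g. a balloon driver told about a poisoned PFN. */
	static int balloon_mf_notify(struct notifier_block *nb,
				     unsigned long pfn, void *data)
	{
		pr_info("pfn %#lx poisoned, asking the host to recover it\n", pfn);
		return NOTIFY_OK;
	}

	static struct notifier_block balloon_mf_nb = {
		.notifier_call = balloon_mf_notify,
	};

	/* Registration would then be a single call from the driver's init path: */
	/*	blocking_notifier_chain_register(&mf_notifier_list, &balloon_mf_nb); */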
Link: https://lkml.kernel.org/r/20220509105641.491313-1-pizhenwei@bytedance.com Link: https://lkml.kernel.org/r/20220509105641.491313-2-pizhenwei@bytedance.com Signed-off-by: zhenwei pi Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/internal.h | 11 +++++++++++ mm/memory-failure.c | 21 +++++++++++++++++++++ mm/sparse.c | 27 --------------------------- 3 files changed, 32 insertions(+), 27 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index a770029beb08..6d188161b20e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -711,6 +711,9 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) } #endif +/* + * mm/memory-failure.c + */ extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; @@ -720,6 +723,14 @@ extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; +#ifdef CONFIG_MEMORY_FAILURE +void clear_hwpoisoned_pages(struct page *memmap, int nr_pages); +#else +static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ +} +#endif + extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1d117190c350..e7a13e90bd05 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2392,3 +2392,24 @@ retry: return ret; } + +void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ + int i; + + /* + * A further optimization is to have per section refcounted + * num_poisoned_pages. But that would need more space per memmap, so + * for now just do a quick global check to speed up this routine in the + * absence of bad pages. + */ + if (atomic_long_read(&num_poisoned_pages) == 0) + return; + + for (i = 0; i < nr_pages; i++) { + if (PageHWPoison(&memmap[i])) { + num_poisoned_pages_dec(); + ClearPageHWPoison(&memmap[i]); + } + } +} diff --git a/mm/sparse.c b/mm/sparse.c index d2d76d158b39..cb3bfae64036 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -922,33 +922,6 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, return 0; } -#ifdef CONFIG_MEMORY_FAILURE -static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ - int i; - - /* - * A further optimization is to have per section refcounted - * num_poisoned_pages. But that would need more space per memmap, so - * for now just do a quick global check to speed up this routine in the - * absence of bad pages. - */ - if (atomic_long_read(&num_poisoned_pages) == 0) - return; - - for (i = 0; i < nr_pages; i++) { - if (PageHWPoison(&memmap[i])) { - num_poisoned_pages_dec(); - ClearPageHWPoison(&memmap[i]); - } - } -} -#else -static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ -} -#endif - void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) -- cgit v1.2.3 From c8bd84f73fd6215d5b8d0b3cfc914a3671b16d1c Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 12 May 2022 20:23:09 -0700 Subject: mm/memory-failure.c: simplify num_poisoned_pages_dec Don't decrease the number of poisoned pages in page_alloc.c, let the memory-failure.c do inc/dec poisoned pages only. Also simplify unpoison_memory(), only decrease the number of poisoned pages when: - TestClearPageHWPoison() succeed - put_page_back_buddy succeed After decreasing, print necessary log. Finally, remove clear_page_hwpoison() and unpoison_taken_off_page(). 
Link: https://lkml.kernel.org/r/20220509105641.491313-3-pizhenwei@bytedance.com Signed-off-by: zhenwei pi Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 37 +++++++++---------------------------- mm/page_alloc.c | 1 - 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e7a13e90bd05..837975af2152 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2092,28 +2092,6 @@ core_initcall(memory_failure_init); pr_info(fmt, pfn); \ }) -static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p) -{ - if (TestClearPageHWPoison(p)) { - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", - page_to_pfn(p), rs); - num_poisoned_pages_dec(); - return 1; - } - return 0; -} - -static inline int unpoison_taken_off_page(struct ratelimit_state *rs, - struct page *p) -{ - if (put_page_back_buddy(p)) { - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", - page_to_pfn(p), rs); - return 0; - } - return -EBUSY; -} - /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -2131,6 +2109,7 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int ret = -EBUSY; + int freeit = 0; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -2171,18 +2150,15 @@ int unpoison_memory(unsigned long pfn) ret = get_hwpoison_page(p, MF_UNPOISON); if (!ret) { - if (clear_page_hwpoison(&unpoison_rs, page)) - ret = 0; - else - ret = -EBUSY; + ret = TestClearPageHWPoison(page) ? 0 : -EBUSY; } else if (ret < 0) { if (ret == -EHWPOISON) { - ret = unpoison_taken_off_page(&unpoison_rs, p); + ret = put_page_back_buddy(p) ? 0 : -EBUSY; } else unpoison_pr_info("Unpoison: failed to grab page %#lx\n", pfn, &unpoison_rs); } else { - int freeit = clear_page_hwpoison(&unpoison_rs, p); + freeit = !!TestClearPageHWPoison(p); put_page(page); if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { @@ -2193,6 +2169,11 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); + if (!ret || freeit) { + num_poisoned_pages_dec(); + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), &unpoison_rs); + } return ret; } EXPORT_SYMBOL(unpoison_memory); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 732e01e9d0e0..a4a8fbbd6217 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9497,7 +9497,6 @@ bool put_page_back_buddy(struct page *page) ClearPageHWPoisonTakenOff(page); __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); if (TestClearPageHWPoison(page)) { - num_poisoned_pages_dec(); ret = true; } } -- cgit v1.2.3 From 9113eaf331bf44579882c001867773cf1b3364fd Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 12 May 2022 20:23:10 -0700 Subject: mm/memory-failure.c: add hwpoison_filter for soft offline hwpoison_filter is missing in the soft offline path, this leads an issue: after enabling the corrupt filter, the user process still has a chance to inject hwpoison fault by madvise(addr, len, MADV_SOFT_OFFLINE) at PFN which is expected to reject. Also do a minor change in comment of memory_failure(). 
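A rough userspace sketch of that injection path (illustrative only; it assumes CAP_SYS_ADMIN and a kernel built with CONFIG_MEMORY_FAILURE):

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#ifndef MADV_SOFT_OFFLINE
	#define MADV_SOFT_OFFLINE 101	/* value from the kernel UAPI headers */
	#endif

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		memset(p, 0, psz);	/* make sure a page is actually mapped */

		/* Ask the kernel to soft-offline the backing page. */
		if (madvise(p, psz, MADV_SOFT_OFFLINE))
			perror("madvise(MADV_SOFT_OFFLINE)");

		munmap(p, psz);
		return 0;
	}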
Link: https://lkml.kernel.org/r/20220509105641.491313-4-pizhenwei@bytedance.com Signed-off-by: zhenwei pi Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 837975af2152..7ee80350a9a5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1751,7 +1751,7 @@ static DEFINE_MUTEX(mf_mutex); * enabled and no spinlocks hold. * * Return: 0 for successfully handled the memory error, - * -EOPNOTSUPP for memory_filter() filtered the error event, + * -EOPNOTSUPP for hwpoison_filter() filtered the error event, * < 0(except -EOPNOTSUPP) on failure. */ int memory_failure(unsigned long pfn, int flags) @@ -2308,7 +2308,9 @@ static void put_ref_page(struct page *page) * @pfn: pfn to soft-offline * @flags: flags. Same as memory_failure(). * - * Returns 0 on success, otherwise negated errno. + * Returns 0 on success + * -EOPNOTSUPP for hwpoison_filter() filtered the error event + * < 0 otherwise negated errno. * * Soft offline a page, by migration or invalidation, * without killing anything. This is for the case when @@ -2359,6 +2361,16 @@ retry: ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE); put_online_mems(); + if (hwpoison_filter(page)) { + if (ret > 0) + put_page(page); + else + put_ref_page(ref_page); + + mutex_unlock(&mf_mutex); + return -EOPNOTSUPP; + } + if (ret > 0) { ret = soft_offline_in_use_page(page); } else if (ret == 0) { -- cgit v1.2.3 From f0696cb4068a0d79cf9795ad34560d2f6200d42c Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 12 May 2022 20:23:10 -0700 Subject: mm/hwpoison: disable hwpoison filter during removing The hwpoison filter is enabled by the hwpoison-inject module; after removing this module, the hwpoison filter still works. What is worse, the user cannot find the debugfs entries to know this. Disable the hwpoison filter when removing the hwpoison-inject module. Link: https://lkml.kernel.org/r/20220509105641.491313-5-pizhenwei@bytedance.com Signed-off-by: zhenwei pi Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/hwpoison-inject.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index bb0cea5468cb..5c0cddd81505 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -65,6 +65,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { + hwpoison_filter_enable = 0; debugfs_remove_recursive(hwpoison_dir); } -- cgit v1.2.3 From e240ac52f7da5986f9dcbe29d423b7b2f141b41b Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 12 May 2022 20:23:10 -0700 Subject: mm/memory-failure.c: simplify num_poisoned_pages_inc/dec Originally, num_poisoned_pages_inc() is done in the memory failure routine and num_poisoned_pages_dec() is used to roll back the number if the event is filtered or cancelled. As suggested by Naoya, do num_poisoned_pages_inc() only in action_result(); this makes it clear and simple.
Link: https://lkml.kernel.org/r/20220509105641.491313-6-pizhenwei@bytedance.com Signed-off-by: zhenwei pi Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7ee80350a9a5..01f8b63d3621 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1132,6 +1132,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); + num_poisoned_pages_inc(); pr_err("Memory failure: %#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); } @@ -1587,8 +1588,6 @@ retry: goto out; } - num_poisoned_pages_inc(); - /* * Handling free hugepage. The possible race with hugepage allocation * or demotion can be prevented by PageHWPoison flag. @@ -1806,7 +1805,6 @@ try_again: } hpage = compound_head(p); - num_poisoned_pages_inc(); /* * We need/can do nothing about count=0 pages. @@ -1830,7 +1828,6 @@ try_again: /* We lost the race, try again */ if (retry) { ClearPageHWPoison(p); - num_poisoned_pages_dec(); retry = false; goto try_again; } @@ -1906,8 +1903,7 @@ try_again: */ if (PageCompound(p)) { if (retry) { - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); + ClearPageHWPoison(p); unlock_page(p); put_page(p); flags &= ~MF_COUNT_INCREASED; @@ -1929,8 +1925,7 @@ try_again: page_flags = p->flags; if (hwpoison_filter(p)) { - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); + TestClearPageHWPoison(p); unlock_page(p); put_page(p); res = -EOPNOTSUPP; -- cgit v1.2.3 From e7392b4eca84e86718a7b73aea8fa5573b06ba76 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Fri, 13 May 2022 16:48:55 -0700 Subject: mm/highmem: fix kernel-doc warnings in highmem*.h Patch series "Extend and reorganize Highmem's documentation", v4. This series has the purpose to extend and reorganize Highmem's documentation. This is a work in progress because some information should still be moved from highmem.rst to highmem.h and highmem-internal.h. Specifically I'm talking about moving the "how to" information to the relevant headers, as it as been suggested by Ira Weiny (Intel). Also, this is a work in progress because some kdocs in highmem.h and highmem-internal.h should be improved. This patch (of 4): `scripts/kernel-doc -v -none include/linux/highmem*` reports the following warnings: include/linux/highmem.h:160: warning: expecting prototype for kunmap_atomic(). Prototype was for nr_free_highpages() instead include/linux/highmem.h:204: warning: No description found for return value of 'alloc_zeroed_user_highpage_movable' include/linux/highmem-internal.h:256: warning: Function parameter or member '__addr' not described in 'kunmap_atomic' include/linux/highmem-internal.h:256: warning: Excess function parameter 'addr' description in 'kunmap_atomic' Fix these warnings by (1) moving the kernel-doc comments from highmem.h to highmem-internal.h (which is the file were the kunmap_atomic() macro is actually defined), (2) extending and merging it with the comment which was already in highmem-internal.h, and (3) using correct parameter names (4) correcting a few technical inaccuracies in comments, and (5) adding a deprecation notice in kunmap_atomic() for consistency with kmap_atomic(). Link: https://lkml.kernel.org/r/20220428212455.892-1-fmdefrancesco@gmail.com Link: https://lkml.kernel.org/r/20220428212455.892-2-fmdefrancesco@gmail.com Signed-off-by: Fabio M. 
De Francesco Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Ira Weiny Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Catalin Marinas Cc: Will Deacon Cc: Peter Collingbourne Cc: Vlastimil Babka Cc: Jonathan Corbet Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/highmem-internal.h | 18 +++++++++++++++--- include/linux/highmem.h | 22 ++++++++-------------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index a77be5630209..a694ca95c4ed 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -236,9 +236,21 @@ static inline unsigned long totalhigh_pages(void) { return 0UL; } #endif /* CONFIG_HIGHMEM */ -/* - * Prevent people trying to call kunmap_atomic() as if it were kunmap() - * kunmap_atomic() should get the return value of kmap_atomic, not the page. +/** + * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated! + * @__addr: Virtual address to be unmapped + * + * Unmaps an address previously mapped by kmap_atomic() and re-enables + * pagefaults. Depending on PREEMP_RT configuration, re-enables also + * migration and preemption. Users should not count on these side effects. + * + * Mappings should be unmapped in the reverse order that they were mapped. + * See kmap_local_page() for details on nesting. + * + * @__addr can be any address within the mapped page, so there is no need + * to subtract any offset that has been added. In contrast to kunmap(), + * this function takes the address returned from kmap_atomic(), not the + * page passed to it. The compiler will warn you if you pass the page. */ #define kunmap_atomic(__addr) \ do { \ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 67720bfd6ade..c05ae395898a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -37,7 +37,7 @@ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() - * @addr: Virtual address to be unmapped + * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. @@ -138,24 +138,16 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); * * Returns: The virtual address of the mapping * - * Effectively a wrapper around kmap_local_page() which disables pagefaults - * and preemption. + * In fact a wrapper around kmap_local_page() which also disables pagefaults + * and, depending on PREEMPT_RT configuration, also CPU migration and + * preemption. Therefore users should not count on the latter two side effects. + * + * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. */ static inline void *kmap_atomic(struct page *page); -/** - * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - * @addr: Virtual address to be unmapped - * - * Counterpart to kmap_atomic(). - * - * Effectively a wrapper around kunmap_local() which additionally undoes - * the side effects of kmap_atomic(), i.e. reenabling pagefaults and - * preemption. 
- */ - /* Highmem related interfaces for management code */ static inline unsigned int nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); @@ -191,6 +183,8 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) * @vma: The VMA the page is to be allocated for * @vaddr: The virtual address the page will be inserted into * + * Returns: The allocated and zeroed HIGHMEM page + * * This function will allocate a page for a VMA that the caller knows will * be able to migrate in the future using move_pages() or reclaimed * -- cgit v1.2.3 From 174270c2d664ca8ff60d2d95211b4790cedf881c Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Fri, 13 May 2022 16:48:55 -0700 Subject: Documentation/vm: include kdocs from highmem*.h into highmem.rst kernel-docs that are in include/linux/highmem.h and in include/linux/highmem-internal.h should be included in highmem.rst. Use kdocs directives to include the above-mentioned comments into highmem.rst. Link: https://lkml.kernel.org/r/20220428212455.892-3-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Acked-by: Mike Rapoport Reviewed-by: Ira Weiny Suggested-by: Ira Weiny Reviewed-by: Sebastian Andrzej Siewior Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Catalin Marinas Cc: Peter Collingbourne Cc: Vlastimil Babka Cc: Will Deacon Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/vm/highmem.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/vm/highmem.rst b/Documentation/vm/highmem.rst index 0f69a9fec34d..ccff08a8211d 100644 --- a/Documentation/vm/highmem.rst +++ b/Documentation/vm/highmem.rst @@ -145,3 +145,10 @@ The general recommendation is that you don't use more than 8GiB on a 32-bit machine - although more might work for you and your workload, you're pretty much on your own - don't expect kernel developers to really care much if things come apart. + + +Functions +========= + +.. kernel-doc:: include/linux/highmem.h +.. kernel-doc:: include/linux/highmem-internal.h -- cgit v1.2.3 From 85a85e7601263f2b4edc86136dd0172c2cfbfaa1 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Fri, 13 May 2022 16:48:55 -0700 Subject: Documentation/vm: move "Using kmap-atomic" to highmem.h The use of kmap_atomic() is new code is being deprecated in favor of kmap_local_page(). For this reason the "Using kmap_atomic" section in highmem.rst is obsolete and unnecessary, but it can still help developers if it were moved to kdocs in highmem.h. Therefore, move the relevant parts of this section from highmem.rst and merge them with the kdocs in highmem.h. Link: https://lkml.kernel.org/r/20220428212455.892-4-fmdefrancesco@gmail.com Signed-off-by: Fabio M. 
De Francesco Suggested-by: Ira Weiny Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Ira Weiny Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Catalin Marinas Cc: Mike Rapoport Cc: Peter Collingbourne Cc: Vlastimil Babka Cc: Will Deacon Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/vm/highmem.rst | 35 ----------------------------------- include/linux/highmem.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/Documentation/vm/highmem.rst b/Documentation/vm/highmem.rst index ccff08a8211d..e05bf5524174 100644 --- a/Documentation/vm/highmem.rst +++ b/Documentation/vm/highmem.rst @@ -72,41 +72,6 @@ The kernel contains several ways of creating temporary mappings: It may be assumed that k[un]map_atomic() won't fail. -Using kmap_atomic -================= - -When and where to use kmap_atomic() is straightforward. It is used when code -wants to access the contents of a page that might be allocated from high memory -(see __GFP_HIGHMEM), for example a page in the pagecache. The API has two -functions, and they can be used in a manner similar to the following:: - - /* Find the page of interest. */ - struct page *page = find_get_page(mapping, offset); - - /* Gain access to the contents of that page. */ - void *vaddr = kmap_atomic(page); - - /* Do something to the contents of that page. */ - memset(vaddr, 0, PAGE_SIZE); - - /* Unmap that page. */ - kunmap_atomic(vaddr); - -Note that the kunmap_atomic() call takes the result of the kmap_atomic() call -not the argument. - -If you need to map two pages because you want to copy from one page to -another you need to keep the kmap_atomic calls strictly nested, like:: - - vaddr1 = kmap_atomic(page1); - vaddr2 = kmap_atomic(page2); - - memcpy(vaddr1, vaddr2, PAGE_SIZE); - - kunmap_atomic(vaddr2); - kunmap_atomic(vaddr1); - - Cost of Temporary Mappings ========================== diff --git a/include/linux/highmem.h b/include/linux/highmem.h index c05ae395898a..3af34de54330 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -145,6 +145,37 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. + * + * It is used in atomic context when code wants to access the contents of a + * page that might be allocated from high memory (see __GFP_HIGHMEM), for + * example a page in the pagecache. The API has two functions, and they + * can be used in a manner similar to the following: + * + * -- Find the page of interest. -- + * struct page *page = find_get_page(mapping, offset); + * + * -- Gain access to the contents of that page. -- + * void *vaddr = kmap_atomic(page); + * + * -- Do something to the contents of that page. -- + * memset(vaddr, 0, PAGE_SIZE); + * + * -- Unmap that page. -- + * kunmap_atomic(vaddr); + * + * Note that the kunmap_atomic() call takes the result of the kmap_atomic() + * call, not the argument. + * + * If you need to map two pages because you want to copy from one page to + * another you need to keep the kmap_atomic calls strictly nested, like: + * + * vaddr1 = kmap_atomic(page1); + * vaddr2 = kmap_atomic(page2); + * + * memcpy(vaddr1, vaddr2, PAGE_SIZE); + * + * kunmap_atomic(vaddr2); + * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(struct page *page); -- cgit v1.2.3 From 110bf7a523075e42703fbe9c136ad00b867bec3a Mon Sep 17 00:00:00 2001 From: "Fabio M. 
De Francesco" Date: Fri, 13 May 2022 16:48:55 -0700 Subject: Documentation/vm: rework "Temporary Virtual Mappings" section Extend and rework the "Temporary Virtual Mappings" section of the highmem.rst documentation. Despite the local kmaps were introduced by Thomas Gleixner in October 2020, documentation was still missing information about them. These additions rely largely on Gleixner's patches, Jonathan Corbet's LWN articles, comments by Ira Weiny and Matthew Wilcox, and in-code comments from ./include/linux/highmem.h. 1) Add a paragraph to document kmap_local_page(). 2) Reorder the list of functions by decreasing order of preference of use. 3) Rework part of the kmap() entry in list. Link: https://lkml.kernel.org/r/20220428212455.892-5-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Ira Weiny Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Catalin Marinas Cc: Mike Rapoport Cc: Peter Collingbourne Cc: Vlastimil Babka Cc: Will Deacon Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/vm/highmem.rst | 70 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/Documentation/vm/highmem.rst b/Documentation/vm/highmem.rst index e05bf5524174..c9887f241c6c 100644 --- a/Documentation/vm/highmem.rst +++ b/Documentation/vm/highmem.rst @@ -50,26 +50,74 @@ space when they use mm context tags. Temporary Virtual Mappings ========================== -The kernel contains several ways of creating temporary mappings: +The kernel contains several ways of creating temporary mappings. The following +list shows them in order of preference of use. -* vmap(). This can be used to make a long duration mapping of multiple - physical pages into a contiguous virtual space. It needs global - synchronization to unmap. +* kmap_local_page(). This function is used to require short term mappings. + It can be invoked from any context (including interrupts) but the mappings + can only be used in the context which acquired them. + + This function should be preferred, where feasible, over all the others. -* kmap(). This permits a short duration mapping of a single page. It needs - global synchronization, but is amortized somewhat. It is also prone to - deadlocks when using in a nested fashion, and so it is not recommended for - new code. + These mappings are thread-local and CPU-local, meaning that the mapping + can only be accessed from within this thread and the thread is bound the + CPU while the mapping is active. Even if the thread is preempted (since + preemption is never disabled by the function) the CPU can not be + unplugged from the system via CPU-hotplug until the mapping is disposed. + + It's valid to take pagefaults in a local kmap region, unless the context + in which the local mapping is acquired does not allow it for other reasons. + + kmap_local_page() always returns a valid virtual address and it is assumed + that kunmap_local() will never fail. + + Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain + extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered + because the map implementation is stack based. See kmap_local_page() kdocs + (included in the "Functions" section) for details on how to manage nested + mappings. * kmap_atomic(). This permits a very short duration mapping of a single page. 
Since the mapping is restricted to the CPU that issued it, it performs well, but the issuing task is therefore required to stay on that CPU until it has finished, lest some other task displace its mappings. - kmap_atomic() may also be used by interrupt contexts, since it is does not - sleep and the caller may not sleep until after kunmap_atomic() is called. + kmap_atomic() may also be used by interrupt contexts, since it does not + sleep and the callers too may not sleep until after kunmap_atomic() is + called. + + Each call of kmap_atomic() in the kernel creates a non-preemptible section + and disable pagefaults. This could be a source of unwanted latency. Therefore + users should prefer kmap_local_page() instead of kmap_atomic(). - It may be assumed that k[un]map_atomic() won't fail. + It is assumed that k[un]map_atomic() won't fail. + +* kmap(). This should be used to make short duration mapping of a single + page with no restrictions on preemption or migration. It comes with an + overhead as mapping space is restricted and protected by a global lock + for synchronization. When mapping is no longer needed, the address that + the page was mapped to must be released with kunmap(). + + Mapping changes must be propagated across all the CPUs. kmap() also + requires global TLB invalidation when the kmap's pool wraps and it might + block when the mapping space is fully utilized until a slot becomes + available. Therefore, kmap() is only callable from preemptible context. + + All the above work is necessary if a mapping must last for a relatively + long time but the bulk of high-memory mappings in the kernel are + short-lived and only used in one place. This means that the cost of + kmap() is mostly wasted in such cases. kmap() was not intended for long + term mappings but it has morphed in that direction and its use is + strongly discouraged in newer code and the set of the preceding functions + should be preferred. + + On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have + no real work to do because a 64-bit address space is more than sufficient to + address all the physical memory whose pages are permanently mapped. + +* vmap(). This can be used to make a long duration mapping of multiple + physical pages into a contiguous virtual space. It needs global + synchronization to unmap. Cost of Temporary Mappings -- cgit v1.2.3 From ae07562909f3dfcdff40f87e51965728dab50485 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 13 May 2022 16:48:55 -0700 Subject: mm: change huge_ptep_clear_flush() to return the original pte Patch series "Fix CONT-PTE/PMD size hugetlb issue when unmapping or migrating", v4. presently, migrating a hugetlb page or unmapping a poisoned hugetlb page, we'll use ptep_clear_flush() and set_pte_at() to nuke the page table entry and remap it, and this is incorrect for CONT-PTE or CONT-PMD size hugetlb page, which will cause potential data consistent issue. This patch set will change to use hugetlb related APIs to fix this issue. Note: Mike pointed out the huge_ptep_get() will only return the one specific value, and it would not take into account the dirty or young bits of CONT-PTE/PMDs like the huge_ptep_get_and_clear() [1]. This inconsistent issue is not introduced by this patch set, and this issue will be addressed in another thread [2]. Meanwhile the uffd for hugetlb case [3] pointed out by Gerald also needs another patch to address. 
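As a minimal sketch of how the new return value is meant to be consumed (hypothetical helper; the real callers are the rmap paths changed later in this series):

	#include <linux/hugetlb.h>
	#include <linux/mm.h>

	/* Hypothetical caller: clear a hugetlb mapping but keep what the old
	 * entry said (e.g. its dirty bit) before installing a replacement. */
	static void example_nuke_huge_mapping(struct vm_area_struct *vma,
					      unsigned long address, pte_t *ptep,
					      struct page *page)
	{
		pte_t pteval = huge_ptep_clear_flush(vma, address, ptep);

		if (huge_pte_dirty(pteval))
			set_page_dirty(page);
		/* ... a real caller would now build and install e.g. a
		 *     migration entry with set_huge_pte_at() ... */
	}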
[1] https://lore.kernel.org/linux-mm/85bd80b4-b4fd-0d3f-a2e5-149559f2f387@oracle.com/ [2] https://lore.kernel.org/all/cover.1651998586.git.baolin.wang@linux.alibaba.com/ [3] https://lore.kernel.org/linux-mm/20220503120343.6264e126@thinkpad/ This patch (of 3): It is incorrect to use ptep_clear_flush() to nuke a hugetlb page table when unmapping or migrating a hugetlb page, and will change to use huge_ptep_clear_flush() instead in the following patches. So this is a preparation patch, which changes the huge_ptep_clear_flush() to return the original pte to help to nuke a hugetlb page table. [baolin.wang@linux.alibaba.com: fix build in several more architectures] Link: https://lkml.kernel.org/r/0009a4cd-2826-e8be-e671-f050d4f18d5d@linux.alibaba.com [sfr@canb.auug.org.au: fixup] Link: https://lkml.kernel.org/r/20220511181531.7f27a5c1@canb.auug.org.au Link: https://lkml.kernel.org/r/cover.1652270205.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/20f77ddab90baa249bd24504c413189b82acde69.1652270205.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/cover.1652147571.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/dcf065868cce35bceaf138613ad27f17bb7c0c19.1652147571.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Signed-off-by: Stephen Rothwell Acked-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: James Bottomley Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Yoshinori Sato Cc: Rich Felker Cc: David S. Miller Cc: Arnd Bergmann Cc: Gerald Schaefer Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 4 ++-- arch/arm64/mm/hugetlbpage.c | 12 +++++------- arch/ia64/include/asm/hugetlb.h | 5 +++-- arch/mips/include/asm/hugetlb.h | 9 ++++++--- arch/parisc/include/asm/hugetlb.h | 5 +++-- arch/powerpc/include/asm/hugetlb.h | 9 ++++++--- arch/s390/include/asm/hugetlb.h | 6 +++--- arch/sh/include/asm/hugetlb.h | 5 +++-- arch/sparc/include/asm/hugetlb.h | 5 +++-- include/asm-generic/hugetlb.h | 4 ++-- 10 files changed, 36 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 1242f71937f8..616b2ca8b23e 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -39,8 +39,8 @@ extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -extern void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); +extern pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_HUGE_PTE_CLEAR extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index cbace1c9e137..ca8e65c94359 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -486,19 +486,17 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); } -void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { size_t pgsize; int ncontig; - if (!pte_cont(READ_ONCE(*ptep))) { - 
ptep_clear_flush(vma, addr, ptep); - return; - } + if (!pte_cont(READ_ONCE(*ptep))) + return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); - clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); + return get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); } static int __init hugetlbpage_init(void) diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 7e46ebde8c0c..026ead47cd53 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -23,9 +23,10 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, #define is_hugepage_only_range is_hugepage_only_range #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { + return *ptep; } #include diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index c2144409c0c4..fd69c8808554 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -43,16 +43,19 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, } #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { + pte_t pte; + /* * clear the huge pte entry firstly, so that the other smp threads will * not get old pte entry after finishing flush_tlb_page and before * setting new huge pte entry */ - huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); flush_tlb_page(vma, addr); + return pte; } #define __HAVE_ARCH_HUGE_PTE_NONE diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index a69cf9efb0c1..f7f078c2872c 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -28,9 +28,10 @@ static inline int prepare_hugepage_range(struct file *file, } #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { + return *ptep; } #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 6a1a1ac5743b..8a5674fd120d 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -43,11 +43,14 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, } #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { - huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte_t pte; + + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); flush_hugetlb_page(vma, addr); + return pte; } #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 32c3fd6e878e..f22beda9e6d5 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -50,10 +50,10 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, set_pte(ptep, 
__pte(_SEGMENT_ENTRY_EMPTY)); } -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { - huge_ptep_get_and_clear(vma->vm_mm, address, ptep); + return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index ae4de7b89210..4d3ba39e681c 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -21,9 +21,10 @@ static inline int prepare_hugepage_range(struct file *file, } #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { + return *ptep; } static inline void arch_clear_hugepage_flags(struct page *page) diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 53838a173f62..0a26cca24232 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -21,9 +21,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { + return *ptep; } #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 896f341f614d..a57d667addd2 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -84,10 +84,10 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, #endif #ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - ptep_clear_flush(vma, addr, ptep); + return ptep_clear_flush(vma, addr, ptep); } #endif -- cgit v1.2.3 From 5d4af6195c87c6b162b7963e0ad00a214b80d764 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 13 May 2022 16:48:55 -0700 Subject: mm: rmap: fix CONT-PTE/PMD size hugetlb issue when migration On some architectures (like ARM64), it can support CONT-PTE/PMD size hugetlb, which means it can support not only PMD/PUD size hugetlb: 2M and 1G, but also CONT-PTE/PMD size: 64K and 32M if a 4K page size specified. When migrating a hugetlb page, we will get the relevant page table entry by huge_pte_offset() only once to nuke it and remap it with a migration pte entry. This is correct for PMD or PUD size hugetlb, since they always contain only one pmd entry or pud entry in the page table. However this is incorrect for CONT-PTE and CONT-PMD size hugetlb, since they can contain several continuous pte or pmd entry with same page table attributes. So we will nuke or remap only one pte or pmd entry for this CONT-PTE/PMD size hugetlb page, which is not expected for hugetlb migration. The problem is we can still continue to modify the subpages' data of a hugetlb page during migrating a hugetlb page, which can cause a serious data consistent issue, since we did not nuke the page table entry and set a migration pte for the subpages of a hugetlb page. 
To fix this issue, we should change to use huge_ptep_clear_flush() to nuke a hugetlb page table, and remap it with set_huge_pte_at() and set_huge_swap_pte_at() when migrating a hugetlb page, which already considered the CONT-PTE or CONT-PMD size hugetlb. [akpm@linux-foundation.org: fix nommu build] [baolin.wang@linux.alibaba.com: fix build errors for !CONFIG_MMU] Link: https://lkml.kernel.org/r/a4baca670aca637e7198d9ae4543b8873cb224dc.1652270205.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/ea5abf529f0997b5430961012bfda6166c1efc8c.1652147571.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: James Bottomley Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 11 +++++++++++ mm/rmap.c | 24 ++++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 04f0186b089b..0f2894d01333 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1093,6 +1093,17 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr pte_t *ptep, pte_t pte, unsigned long sz) { } + +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + return *ptep; +} + +static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/rmap.c b/mm/rmap.c index 94d6b24a1ac2..4e96daff9293 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1926,13 +1926,15 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, break; } } + + /* Nuke the hugetlb page table entry */ + pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + /* Nuke the page table entry. */ + pteval = ptep_clear_flush(vma, address, pvmw.pte); } - /* Nuke the page table entry. */ - pteval = ptep_clear_flush(vma, address, pvmw.pte); - /* Set the dirty flag on the folio now the pte is gone. 
*/ if (pte_dirty(pteval)) folio_mark_dirty(folio); @@ -2017,7 +2019,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, pte_t swp_pte; if (arch_unmap_one(mm, vma, address, pteval) < 0) { - set_pte_at(mm, address, pvmw.pte, pteval); + if (folio_test_hugetlb(folio)) + set_huge_pte_at(mm, address, pvmw.pte, pteval); + else + set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; @@ -2026,7 +2031,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, !anon_exclusive, subpage); if (anon_exclusive && page_try_share_anon_rmap(subpage)) { - set_pte_at(mm, address, pvmw.pte, pteval); + if (folio_test_hugetlb(folio)) + set_huge_pte_at(mm, address, pvmw.pte, pteval); + else + set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; @@ -2052,7 +2060,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); - set_pte_at(mm, address, pvmw.pte, swp_pte); + if (folio_test_hugetlb(folio)) + set_huge_swap_pte_at(mm, address, pvmw.pte, + swp_pte, vma_mmu_pagesize(vma)); + else + set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), compound_order(&folio->page)); /* -- cgit v1.2.3 From a00a875925a418b030783457595f70ddf855de4b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 13 May 2022 16:48:56 -0700 Subject: mm: rmap: fix CONT-PTE/PMD size hugetlb issue when unmapping On some architectures (like ARM64), it can support CONT-PTE/PMD size hugetlb, which means it can support not only PMD/PUD size hugetlb: 2M and 1G, but also CONT-PTE/PMD size: 64K and 32M if a 4K page size specified. When unmapping a hugetlb page, we will get the relevant page table entry by huge_pte_offset() only once to nuke it. This is correct for PMD or PUD size hugetlb, since they always contain only one pmd entry or pud entry in the page table. However this is incorrect for CONT-PTE and CONT-PMD size hugetlb, since they can contain several continuous pte or pmd entry with same page table attributes, so we will nuke only one pte or pmd entry for this CONT-PTE/PMD size hugetlb page. And now try_to_unmap() is only passed a hugetlb page in the case where the hugetlb page is poisoned. Which means now we will unmap only one pte entry for a CONT-PTE or CONT-PMD size poisoned hugetlb page, and we can still access other subpages of a CONT-PTE or CONT-PMD size poisoned hugetlb page, which will cause serious issues possibly. So we should change to use huge_ptep_clear_flush() to nuke the hugetlb page table to fix this issue, which already considered CONT-PTE and CONT-PMD size hugetlb. We've already used set_huge_swap_pte_at() to set a poisoned swap entry for a poisoned hugetlb page. Meanwhile adding a VM_BUG_ON() to make sure the passed hugetlb page is poisoned in try_to_unmap(). Link: https://lkml.kernel.org/r/0a2e547238cad5bc153a85c3e9658cb9d55f9cac.1652270205.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/730ea4b6d292f32fb10b7a4e87dad49b0eb30474.1652147571.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. 
Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: James Bottomley Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- mm/rmap.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 4e96daff9293..219e287a83d2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1527,6 +1527,11 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { + /* + * The try_to_unmap() is only passed a hugetlb page + * in the case where the hugetlb page is poisoned. + */ + VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may @@ -1562,28 +1567,28 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } } + pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); - } - - /* - * Nuke the page table entry. When having to clear - * PageAnonExclusive(), we always have to flush. - */ - if (should_defer_flush(mm, flags) && !anon_exclusive) { /* - * We clear the PTE but do not flush so potentially - * a remote CPU could still be writing to the folio. - * If the entry was previously clean then the - * architecture must guarantee that a clear->dirty - * transition on a cached TLB entry is written through - * and traps if the PTE is unmapped. + * Nuke the page table entry. When having to clear + * PageAnonExclusive(), we always have to flush. */ - pteval = ptep_get_and_clear(mm, address, pvmw.pte); + if (should_defer_flush(mm, flags) && !anon_exclusive) { + /* + * We clear the PTE but do not flush so potentially + * a remote CPU could still be writing to the folio. + * If the entry was previously clean then the + * architecture must guarantee that a clear->dirty + * transition on a cached TLB entry is written through + * and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pvmw.pte); - set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); - } else { - pteval = ptep_clear_flush(vma, address, pvmw.pte); + set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); + } else { + pteval = ptep_clear_flush(vma, address, pvmw.pte); + } } /* -- cgit v1.2.3 From 0effdf461c5789be02d40c1868c70cc02ea24627 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 May 2022 16:48:56 -0700 Subject: mm: hugetlb_vmemmap: disable hugetlb_optimize_vmemmap when struct page crosses page boundaries Patch series "add hugetlb_optimize_vmemmap sysctl", v11. This series aims to add hugetlb_optimize_vmemmap sysctl to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. This patch (of 4): If the size of "struct page" is not the power of two but with the feature of minimizing overhead of struct page associated with each HugeTLB is enabled, then the vmemmap pages of HugeTLB will be corrupted after remapping (panic is about to happen in theory). But this only exists when !CONFIG_MEMCG && !CONFIG_SLUB on x86_64. However, it is not a conventional configuration nowadays. So it is not a real word issue, just the result of a code review. But we cannot prevent anyone from configuring that combined configure. This hugetlb_optimize_vmemmap should be disable in this case to fix this issue. 
Link: https://lkml.kernel.org/r/20220512041142.39501-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220512041142.39501-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 29554c6ef2ae..6254bb2d4ae5 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -28,12 +28,6 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { - /* We cannot optimize if a "struct page" crosses page boundaries. */ - if (!is_power_of_2(sizeof(struct page))) { - pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); - return 0; - } - if (!buf) return -EINVAL; @@ -119,6 +113,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h) if (!hugetlb_optimize_vmemmap_enabled()) return; + if (!is_power_of_2(sizeof(struct page))) { + pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* * The head page is not to be freed to buddy allocator, the other tail -- cgit v1.2.3 From 6e02c46b4d970f24eb51197d451b00f08a8a4186 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 May 2022 16:48:56 -0700 Subject: mm: memory_hotplug: override memmap_on_memory when hugetlb_free_vmemmap=on Optimizing HugeTLB vmemmap pages is not compatible with allocating memmap on hot added memory. If "hugetlb_free_vmemmap=on" and memory_hotplug.memmap_on_memory" are both passed on the kernel command line, optimizing hugetlb pages takes precedence. However, the global variable memmap_on_memory will still be set to 1, even though we will not try to allocate memmap on hot added memory. Also introduce mhp_memmap_on_memory() helper to move the definition of "memmap_on_memory" to the scope of CONFIG_MHP_MEMMAP_ON_MEMORY. In the next patch, mhp_memmap_on_memory() will also be exported to be used in hugetlb_vmemmap.c. 
Link: https://lkml.kernel.org/r/20220512041142.39501-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 945191708ef6..6951bf0bc2e6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,14 +42,36 @@ #include "internal.h" #include "shuffle.h" +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +static int memmap_on_memory_set(const char *val, const struct kernel_param *kp) +{ + if (hugetlb_optimize_vmemmap_enabled()) + return 0; + return param_set_bool(val, kp); +} + +static const struct kernel_param_ops memmap_on_memory_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = memmap_on_memory_set, + .get = param_get_bool, +}; /* * memory_hotplug.memmap_on_memory parameter */ static bool memmap_on_memory __ro_after_init; -#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY -module_param(memmap_on_memory, bool, 0444); +module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444); MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); + +static inline bool mhp_memmap_on_memory(void) +{ + return memmap_on_memory; +} +#else +static inline bool mhp_memmap_on_memory(void) +{ + return false; +} #endif enum { @@ -1289,9 +1311,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size) * altmap as an alternative source of memory, and we do not exactly * populate a single PMD. */ - return memmap_on_memory && - !hugetlb_optimize_vmemmap_enabled() && - IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && + return mhp_memmap_on_memory() && size == memory_block_size_bytes() && IS_ALIGNED(vmemmap_size, PMD_SIZE) && IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); @@ -2076,7 +2096,7 @@ static int __ref try_remove_memory(u64 start, u64 size) * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in * the same granularity it was added - a single memory block. */ - if (memmap_on_memory) { + if (mhp_memmap_on_memory()) { nr_vmemmap_pages = walk_memory_blocks(start, size, NULL, get_nr_vmemmap_pages_cb); if (nr_vmemmap_pages) { -- cgit v1.2.3 From 9c54c522bb76cbef480722bd44059e2ba8304bd2 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 May 2022 16:48:56 -0700 Subject: mm: hugetlb_vmemmap: use kstrtobool for hugetlb_vmemmap param parsing Use kstrtobool rather than open coding "on" and "off" parsing in mm/hugetlb_vmemmap.c, which is more powerful to handle all kinds of parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off". Link: https://lkml.kernel.org/r/20220512041142.39501-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 6 +++--- mm/hugetlb_vmemmap.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d9b577db6e19..7e079b6994fe 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1664,10 +1664,10 @@ enabled. 
Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). - Format: { on | off (default) } + Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) } - on: enable the feature - off: disable the feature + [oO][Nn]/Y/y/1: enable the feature + [oO][Ff]/N/n/0: disable the feature Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 6254bb2d4ae5..cc4ec752ec16 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -28,15 +28,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { - if (!buf) + bool enable; + + if (kstrtobool(buf, &enable)) return -EINVAL; - if (!strcmp(buf, "on")) + if (enable) static_branch_enable(&hugetlb_optimize_vmemmap_key); - else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_optimize_vmemmap_key); else - return -EINVAL; + static_branch_disable(&hugetlb_optimize_vmemmap_key); return 0; } -- cgit v1.2.3 From 78f39084b41d287aedb2ea55f2c1895cfa11d61a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 May 2022 16:48:56 -0700 Subject: mm: hugetlb_vmemmap: add hugetlb_optimize_vmemmap sysctl We must add hugetlb_free_vmemmap=on (or "off") to the boot cmdline and reboot the server to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. However, rebooting usually takes a long time. So add a sysctl to enable or disable the feature at runtime without rebooting. Why do we need this? There are three use cases. 1) The feature of minimizing overhead of struct page associated with each HugeTLB page is disabled by default without passing "hugetlb_free_vmemmap=on" to the boot cmdline. When we (ByteDance) deliver the servers to the users who want to enable this feature, they have to configure grub (change the boot cmdline) and reboot the servers, whereas rebooting usually takes a long time (we have thousands of servers). It is a very bad experience for the users. So we need an approach to enable this feature after the servers have already booted. This is a use case in our practical environment. 2) In some use cases, HugeTLB pages are allocated 'on the fly' instead of being pulled from the HugeTLB pool; those workloads would be affected with this feature enabled. Such workloads can be identified by the characteristic that they never explicitly allocate huge pages with 'nr_hugepages' but only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. We can confirm it is a real use case from commit 099730d67417. For those workloads, the page fault time could be ~2x slower than before. We suspect those users want to disable this feature if the system has enabled it before and they don't think the memory savings benefit is enough to make up for the performance drop. 3) A workload which wants vmemmap pages to be optimized and a workload which sets 'nr_overcommit_hugepages' and does not want the extra overhead at fault time when the overcommitted pages are allocated from the buddy allocator may be deployed on the same server. The user could enable this feature and set 'nr_hugepages' and 'nr_overcommit_hugepages', then disable the feature. In this case, the overcommitted HugeTLB pages will not encounter the extra overhead at fault time.
Link: https://lkml.kernel.org/r/20220512041142.39501-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Jonathan Corbet Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Oscar Salvador Cc: David Hildenbrand Cc: Masahiro Yamada Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/vm.rst | 39 ++++++++++++++ include/linux/memory_hotplug.h | 9 ++++ mm/hugetlb_vmemmap.c | 93 +++++++++++++++++++++++++++++---- mm/memory_hotplug.c | 7 +-- 4 files changed, 133 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 747e325ebcd0..5c9aa171a0d3 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -562,6 +562,45 @@ Change the minimum size of the hugepage pool. See Documentation/admin-guide/mm/hugetlbpage.rst +hugetlb_optimize_vmemmap +======================== + +This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter) +is configured or the size of 'struct page' (a structure defined in +include/linux/mm_types.h) is not power of two (an unusual system config could +result in this). + +Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages +associated with each HugeTLB page. + +Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages +per 1GB HugeTLB page), whereas already allocated HugeTLB pages will not be +optimized. When those optimized HugeTLB pages are freed from the HugeTLB pool +to the buddy allocator, the vmemmap pages representing that range needs to be +remapped again and the vmemmap pages discarded earlier need to be rellocated +again. If your use case is that HugeTLB pages are allocated 'on the fly' (e.g. +never explicitly allocating HugeTLB pages with 'nr_hugepages' but only set +'nr_overcommit_hugepages', those overcommitted HugeTLB pages are allocated 'on +the fly') instead of being pulled from the HugeTLB pool, you should weigh the +benefits of memory savings against the more overhead (~2x slower than before) +of allocation or freeing HugeTLB pages between the HugeTLB pool and the buddy +allocator. Another behavior to note is that if the system is under heavy memory +pressure, it could prevent the user from freeing HugeTLB pages from the HugeTLB +pool to the buddy allocator since the allocation of vmemmap pages could be +failed, you have to retry later if your system encounter this situation. + +Once disabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will not be optimized meaning the extra overhead at allocation +time from buddy allocator disappears, whereas already optimized HugeTLB pages +will not be affected. If you want to make sure there are no optimized HugeTLB +pages, you can set "nr_hugepages" to 0 first and then disable this. Note that +writing 0 to nr_hugepages will make any "in use" HugeTLB pages become surplus +pages. So, those surplus pages are still optimized until they are no longer +in use. You would need to wait for those surplus pages to be released before +there are no optimized pages in the system. 
+ + nr_hugepages_mempolicy ====================== diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e0b2209ab71c..20d7edf62a6a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,4 +351,13 @@ void arch_remove_linear_mapping(u64 start, u64 size); extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +bool mhp_memmap_on_memory(void); +#else +static inline bool mhp_memmap_on_memory(void) +{ + return false; +} +#endif + #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index cc4ec752ec16..fcd9f7872064 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -10,6 +10,7 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt +#include #include "hugetlb_vmemmap.h" /* @@ -22,21 +23,40 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) +enum vmemmap_optimize_mode { + VMEMMAP_OPTIMIZE_OFF, + VMEMMAP_OPTIMIZE_ON, +}; + DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); +static enum vmemmap_optimize_mode vmemmap_optimize_mode = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + +static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) +{ + if (vmemmap_optimize_mode == to) + return; + + if (to == VMEMMAP_OPTIMIZE_OFF) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else + static_branch_inc(&hugetlb_optimize_vmemmap_key); + WRITE_ONCE(vmemmap_optimize_mode, to); +} + static int __init hugetlb_vmemmap_early_param(char *buf) { bool enable; + enum vmemmap_optimize_mode mode; if (kstrtobool(buf, &enable)) return -EINVAL; - if (enable) - static_branch_enable(&hugetlb_optimize_vmemmap_key); - else - static_branch_disable(&hugetlb_optimize_vmemmap_key); + mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; + vmemmap_optimize_mode_switch(mode); return 0; } @@ -69,8 +89,10 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) + if (!ret) { ClearHPageVmemmapOptimized(head); + static_branch_dec(&hugetlb_optimize_vmemmap_key); + } return ret; } @@ -84,6 +106,11 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) if (!vmemmap_pages) return; + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) + return; + + static_branch_inc(&hugetlb_optimize_vmemmap_key); + vmemmap_addr += RESERVE_VMEMMAP_SIZE; vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); vmemmap_reuse = vmemmap_addr - PAGE_SIZE; @@ -93,7 +120,9 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) * to the page which @vmemmap_reuse is mapped to, then free the pages * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. 
*/ - if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else SetHPageVmemmapOptimized(head); } @@ -110,9 +139,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_optimize_vmemmap_enabled()) - return; - if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); static_branch_disable(&hugetlb_optimize_vmemmap_key); @@ -134,3 +160,52 @@ void __init hugetlb_vmemmap_init(struct hstate *h) pr_info("can optimize %d vmemmap pages for %s\n", h->optimize_vmemmap_pages, h->name); } + +#ifdef CONFIG_PROC_SYSCTL +static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + enum vmemmap_optimize_mode mode; + static DEFINE_MUTEX(sysctl_mutex); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sysctl_mutex); + mode = vmemmap_optimize_mode; + table->data = &mode; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (write && !ret) + vmemmap_optimize_mode_switch(mode); + mutex_unlock(&sysctl_mutex); + + return ret; +} + +static struct ctl_table hugetlb_vmemmap_sysctls[] = { + { + .procname = "hugetlb_optimize_vmemmap", + .maxlen = sizeof(enum vmemmap_optimize_mode), + .mode = 0644, + .proc_handler = hugetlb_optimize_vmemmap_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int hugetlb_vmemmap_sysctls_init(void) +{ + /* + * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" + * crosses page boundaries, the vmemmap pages cannot be optimized. + */ + if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page))) + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + + return 0; +} +late_initcall(hugetlb_vmemmap_sysctls_init); +#endif /* CONFIG_PROC_SYSCTL */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6951bf0bc2e6..1213d0c67a53 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -63,15 +63,10 @@ static bool memmap_on_memory __ro_after_init; module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444); MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); -static inline bool mhp_memmap_on_memory(void) +bool mhp_memmap_on_memory(void) { return memmap_on_memory; } -#else -static inline bool mhp_memmap_on_memory(void) -{ - return false; -} #endif enum { -- cgit v1.2.3 From 8e20d4b332660a32e842e20c34cfc3b3456bc6dc Mon Sep 17 00:00:00 2001 From: Ganesan Rajagopal Date: Fri, 13 May 2022 16:48:57 -0700 Subject: mm/memcontrol: export memcg->watermark via sysfs for v2 memcg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We run a lot of automated tests when building our software and run into OOM scenarios when the tests run unbounded. v1 memcg exports memcg->watermark as "memory.max_usage_in_bytes" in sysfs. We use this metric to heuristically limit the number of tests that can run in parallel based on per test historical data. This metric is currently not exported for v2 memcg and there is no other easy way of getting this information. 
getrusage() syscall returns "ru_maxrss" which can be used as an approximation but that's the max RSS of a single child process across all children instead of the aggregated max for all child processes. The only work around is to periodically poll "memory.current" but that's not practical for short-lived one-off cgroups. Hence, expose memcg->watermark as "memory.peak" for v2 memcg. Link: https://lkml.kernel.org/r/20220507050916.GA13577@us192.sjc.aristanetworks.com Signed-off-by: Ganesan Rajagopal Acked-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Roman Gushchin Reviewed-by: Michal Koutný Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 7 +++++++ mm/memcontrol.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 19bcd73cad03..07ef3bf1448e 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1229,6 +1229,13 @@ PAGE_SIZE multiple when read back. the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. + memory.peak + A read-only single value file which exists on non-root + cgroups. + + The max memory usage recorded for the cgroup and its + descendants since the creation of the cgroup. + memory.oom.group A read-write single value file which exists on non-root cgroups. The default value is "0". diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e1b5823ac060..ef76df7c6d12 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6103,6 +6103,14 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } +static u64 memory_peak_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)memcg->memory.watermark * PAGE_SIZE; +} + static int memory_min_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -6406,6 +6414,11 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, + { + .name = "peak", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = memory_peak_read, + }, { .name = "min", .flags = CFTYPE_NOT_ON_ROOT, -- cgit v1.2.3 From d4a157f5a26fbf1f1fd5da47a4772a738306348f Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Fri, 13 May 2022 16:48:57 -0700 Subject: mm/damon: add documentation for Enum value Fix the warning - "Enum value 'NR_DAMON_OPS' not described in enum 'damon_ops_id'" generated by the command "make pdfdocs" Link: https://lkml.kernel.org/r/20220508073316.141401-1-gautammenghani201@gmail.com Signed-off-by: Gautam Menghani Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index d1e6ee28a2ff..7c62da31ce4b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -264,6 +264,7 @@ struct damos { * @DAMON_OPS_FVADDR: Monitoring operations for only fixed ranges of virtual * address spaces * @DAMON_OPS_PADDR: Monitoring operations for the physical address space + * @NR_DAMON_OPS: Number of monitoring operations implementations */ enum damon_ops_id { DAMON_OPS_VADDR, -- cgit v1.2.3 From bbe832b9db2e1ad21522f8f0bf02775fff8a0e0e Mon Sep 17 00:00:00 2001 From: Rei Yamamoto Date: Fri, 13 May 2022 16:48:57 -0700 Subject: mm, compaction: fast_find_migrateblock() should return pfn in the target 
zone At present, pages not in the target zone are added to the cc->migratepages list in isolate_migratepages_block(). As a result, pages may migrate between nodes unintentionally. This would be a serious problem for older kernels without commit a984226f457f849e ("mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec"), because it can corrupt the lru list by handling pages in the list without holding the proper lru_lock. Avoid returning a pfn outside the target zone in the case that it is not aligned with a pageblock boundary. Otherwise isolate_migratepages_block() will handle pages not in the target zone. Link: https://lkml.kernel.org/r/20220511044300.4069-1-yamamoto.rei@jp.fujitsu.com Fixes: 70b44595eafe ("mm, compaction: use free lists to quickly locate a migration source") Signed-off-by: Rei Yamamoto Reviewed-by: Miaohe Lin Acked-by: Mel Gorman Reviewed-by: Oscar Salvador Cc: Don Dutile Cc: Wonhyuk Yang Cc: Rei Yamamoto Cc: Signed-off-by: Andrew Morton --- mm/compaction.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 0e0ef9e44530..1f89b969c12b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1848,6 +1848,8 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) update_fast_start_pfn(cc, free_pfn); pfn = pageblock_start_pfn(free_pfn); + if (pfn < cc->zone->zone_start_pfn) + pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; set_pageblock_skip(freepage); -- cgit v1.2.3 From 2c8a81dc0cc533c666c64f4139a608b0ae55d96f Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Tue, 17 May 2022 07:45:47 +0000 Subject: riscv/mm: fix two page table check related issues Two page table check related issues have been fixed here. 1. Enabling CONFIG_PAGE_TABLE_CHECK on riscv32 produces a compile error[1]: error: implicit declaration of function 'pud_leaf' Add a pud_leaf() definition to include/asm-generic/pgtable-nopmd.h to fix this issue. 2. To keep consistent with other pud_xxx() helpers, move pud_user() to pgtable-64.h and add pud_user() to pgtable-nopmd.h.
[1]https://lore.kernel.org/linux-mm/202205161811.2nLxmN2O-lkp@intel.com/T/ Link: https://lkml.kernel.org/r/20220517074548.2227779-2-tongtiangen@huawei.com Fixes: 856eed79f8d3 ("riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK") Signed-off-by: Tong Tiangen Reported-by: kernel test robot Cc: Anshuman Khandual Cc: Pasha Tatashin Cc: Anshuman Khandual Cc: Albert Ou Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Guohanjun Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Cc: Xie XiuQi Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgtable-64.h | 5 +++++ arch/riscv/include/asm/pgtable.h | 5 ----- include/asm-generic/pgtable-nopmd.h | 2 ++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 7e246e9f8d70..ba2494cbc24f 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -86,6 +86,11 @@ static inline int pud_leaf(pud_t pud) return pud_present(pud) && (pud_val(pud) & _PAGE_LEAF); } +static inline int pud_user(pud_t pud) +{ + return pud_val(pud) & _PAGE_USER; +} + static inline void set_pud(pud_t *pudp, pud_t pud) { *pudp = pud; diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 62e733c85836..4200ddede880 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -634,11 +634,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); } -static inline int pud_user(pud_t pud) -{ - return pte_user(pud_pte(pud)); -} - static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h index 10789cf51d16..8ffd64e7a24c 100644 --- a/include/asm-generic/pgtable-nopmd.h +++ b/include/asm-generic/pgtable-nopmd.h @@ -30,6 +30,8 @@ typedef struct { pud_t pud; } pmd_t; static inline int pud_none(pud_t pud) { return 0; } static inline int pud_bad(pud_t pud) { return 0; } static inline int pud_present(pud_t pud) { return 1; } +static inline int pud_user(pud_t pud) { return 0; } +static inline int pud_leaf(pud_t pud) { return 0; } static inline void pud_clear(pud_t *pud) { } #define pmd_ERROR(pmd) (pud_ERROR((pmd).pud)) -- cgit v1.2.3 From ed928a3402d8a24a70c242c63109c069a7b1f3ab Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Tue, 17 May 2022 07:45:48 +0000 Subject: arm64/mm: fix page table check compile error for CONFIG_PGTABLE_LEVELS=2 If CONFIG_PGTABLE_LEVELS=2 and CONFIG_ARCH_SUPPORTS_PAGE_TABLE_CHECK=y, then we trigger a compile error: error: implicit declaration of function 'pte_user_accessible_page' Move the definition of page table check helper out of branch CONFIG_PGTABLE_LEVELS > 2 Link: https://lkml.kernel.org/r/20220517074548.2227779-3-tongtiangen@huawei.com Fixes: daf214c14dbe ("arm64/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK") Signed-off-by: Tong Tiangen Acked-by: Catalin Marinas Cc: Anshuman Khandual Cc: Pasha Tatashin Cc: Anshuman Khandual Cc: Will Deacon Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Arnd Bergmann Cc: Guohanjun Cc: Xie XiuQi Cc: kernel test robot Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4e61cde27f9f..979601528f1e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ 
b/arch/arm64/include/asm/pgtable.h @@ -667,22 +667,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define pud_valid(pud) pte_valid(pud_pte(pud)) #define pud_user(pud) pte_user(pud_pte(pud)) -#ifdef CONFIG_PAGE_TABLE_CHECK -static inline bool pte_user_accessible_page(pte_t pte) -{ - return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte)); -} - -static inline bool pmd_user_accessible_page(pmd_t pmd) -{ - return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); -} - -static inline bool pud_user_accessible_page(pud_t pud) -{ - return pud_present(pud) && pud_user(pud); -} -#endif static inline void set_pud(pud_t *pudp, pud_t pud) { @@ -855,6 +839,23 @@ static inline int pgd_devmap(pgd_t pgd) } #endif +#ifdef CONFIG_PAGE_TABLE_CHECK +static inline bool pte_user_accessible_page(pte_t pte) +{ + return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte)); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_present(pud) && pud_user(pud); +} +#endif + /* * Atomic pte/pmd modifications. */ -- cgit v1.2.3 From b265cdebdfefbee4ef1ae2972af36f7a7cac5148 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:48 -0700 Subject: sched: coredump.h: clarify the use of MMF_VM_HUGEPAGE Patch series "Make khugepaged collapse readonly FS THP more consistent", v4. The readonly FS THP relies on khugepaged to collapse THP for suitable vmas. But the behavior is inconsistent for "always" mode (https://lore.kernel.org/linux-mm/00f195d4-d039-3cf2-d3a1-a2c88de397a0@suse.cz/). The "always" mode means THP allocation should be tried all the time and khugepaged should try to collapse THP all the time. Of course the allocation and collapse may fail due to other factors and conditions. Currently file THP may not be collapsed by khugepaged even though all the conditions are met. That does break the semantics of "always" mode. So make sure readonly FS vmas are registered to khugepaged to fix the break. Register suitable vmas in common mmap path, that could cover both readonly FS vmas and shmem vmas, so remove the khugepaged calls in shmem.c. The patch 1-7 are minor bug fixes, clean up and preparation patches. Patch 8 is the real meat. Tested with khugepaged test in selftests and the testcase provided by Vlastimil Babka in https://lore.kernel.org/lkml/df3b5d1c-a36b-2c73-3e27-99e74983de3a@suse.cz/ by commenting out MADV_HUGEPAGE call. This patch (of 8): MMF_VM_HUGEPAGE is set as long as the mm is available for khugepaged by khugepaged_enter(), not only when VM_HUGEPAGE is set on vma. Correct the comment to avoid confusion. Link: https://lkml.kernel.org/r/20220510203222.24246-1-shy828301@gmail.com Link: https://lkml.kernel.org/r/20220510203222.24246-2-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Miaohe Lin Acked-by: Song Liu Acked-by: Vlastmil Babka Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/sched/coredump.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 4d9e3a656875..4d0a5be28b70 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -57,7 +57,8 @@ static inline int get_dumpable(struct mm_struct *mm) #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ -#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ +#define MMF_VM_HUGEPAGE 17 /* set when mm is available for + khugepaged */ /* * This one-shot flag is dropped due to necessity of changing exe once again * on NFS restore -- cgit v1.2.3 From cb648754a1d0dbac7cbcfe8d42cb27fa6063547d Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:49 -0700 Subject: mm: khugepaged: remove redundant check for VM_NO_KHUGEPAGED The hugepage_vma_check() called by khugepaged_enter_vma_merge() does check VM_NO_KHUGEPAGED. Remove the check from caller and move the check in hugepage_vma_check() up. More checks may be run for VM_NO_KHUGEPAGED vmas, but MADV_HUGEPAGE is definitely not a hot path, so cleaner code does outweigh. Link: https://lkml.kernel.org/r/20220510203222.24246-3-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Miaohe Lin Acked-by: Song Liu Acked-by: Vlastmil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2243ed095f02..e53b4826c7bd 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -365,8 +365,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (!(*vm_flags & VM_NO_KHUGEPAGED) && - khugepaged_enter_vma_merge(vma, *vm_flags)) + if (khugepaged_enter_vma_merge(vma, *vm_flags)) return -ENOMEM; break; case MADV_NOHUGEPAGE: @@ -445,6 +444,9 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, if (!transhuge_vma_enabled(vma, vm_flags)) return false; + if (vm_flags & VM_NO_KHUGEPAGED) + return false; + if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR)) return false; @@ -470,7 +472,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, return false; if (vma_is_temporary_stack(vma)) return false; - return !(vm_flags & VM_NO_KHUGEPAGED); + + return true; } int __khugepaged_enter(struct mm_struct *mm) -- cgit v1.2.3 From 52b52bf15b67213c631dc8bee2a1854dd8811110 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:49 -0700 Subject: mm: khugepaged: skip DAX vma The DAX vma may be seen by khugepaged when the mm has other khugepaged suitable vmas. So khugepaged may try to collapse THP for DAX vma, but it will fail due to page sanity check, for example, page is not on LRU. So it is not harmful, but it is definitely pointless to run khugepaged against DAX vma, so skip it in early check. Link: https://lkml.kernel.org/r/20220510203222.24246-4-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Miaohe Lin Acked-by: Song Liu Acked-by: Vlastmil Babka Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e53b4826c7bd..647efd46b369 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -447,6 +447,10 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, if (vm_flags & VM_NO_KHUGEPAGED) return false; + /* Don't run khugepaged against DAX vma */ + if (vma_is_dax(vma)) + return false; + if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR)) return false; -- cgit v1.2.3 From 78d12c19e02d8151b1c82228ae6bb10f174a68e2 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:49 -0700 Subject: mm: thp: only regular file could be THP eligible Since commit a4aeaa06d45e ("mm: khugepaged: skip huge page collapse for special files"), khugepaged just collapses THP for regular file which is the intended usecase for readonly fs THP. Only show regular file as THP eligible accordingly. And make file_thp_enabled() available for khugepaged too in order to remove duplicate code. Link: https://lkml.kernel.org/r/20220510203222.24246-5-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Song Liu Acked-by: Vlastmil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Rik van Riel Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 14 ++++++++++++++ mm/huge_memory.c | 11 ++--------- mm/khugepaged.c | 9 ++------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fbf36bb1be22..de29821231c9 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -173,6 +173,20 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } +static inline bool file_thp_enabled(struct vm_area_struct *vma) +{ + struct inode *inode; + + if (!vma->vm_file) + return false; + + inode = vma->vm_file->f_inode; + + return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) && + (vma->vm_flags & VM_EXEC) && + !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); +} + bool transparent_hugepage_active(struct vm_area_struct *vma); #define transparent_hugepage_use_zero_page() \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c37c870c526f..1a00bdf0f1e8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -69,13 +69,6 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -static inline bool file_thp_enabled(struct vm_area_struct *vma) -{ - return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file && - !inode_is_open_for_write(vma->vm_file->f_inode) && - (vma->vm_flags & VM_EXEC); -} - bool transparent_hugepage_active(struct vm_area_struct *vma) { /* The addr is used to check if the vma size fits */ @@ -87,8 +80,8 @@ bool transparent_hugepage_active(struct vm_area_struct *vma) return __transparent_hugepage_enabled(vma); if (vma_is_shmem(vma)) return shmem_huge_enabled(vma); - if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) - return file_thp_enabled(vma); + if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma)) + return true; return false; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 647efd46b369..1f4822dcc970 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -464,13 +464,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, return false; /* Only 
regular file is valid */ - if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && - (vm_flags & VM_EXEC)) { - struct inode *inode = vma->vm_file->f_inode; - - return !inode_is_open_for_write(inode) && - S_ISREG(inode->i_mode); - } + if (file_thp_enabled(vma)) + return true; if (!vma->anon_vma || !vma_is_anonymous(vma)) return false; -- cgit v1.2.3 From d2081b2bf8195b8239c67fdd61518e077da7cbec Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:49 -0700 Subject: mm: khugepaged: make khugepaged_enter() void function The most callers of khugepaged_enter() don't care about the return value. Only dup_mmap(), anonymous THP page fault and MADV_HUGEPAGE handle the error by returning -ENOMEM. Actually it is not harmful for them to ignore the error case either. It also sounds overkilling to fail fork() and page fault early due to khugepaged_enter() error, and MADV_HUGEPAGE does set VM_HUGEPAGE flag regardless of the error. Link: https://lkml.kernel.org/r/20220510203222.24246-6-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Song Liu Acked-by: Vlastmil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Rik van Riel Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 30 ++++++++++++------------------ kernel/fork.c | 4 +--- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 18 +++++++----------- 4 files changed, 22 insertions(+), 34 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 2fcc01891b47..0423d3619f26 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -12,10 +12,10 @@ extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); -extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); -extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags); +extern void khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); @@ -40,11 +40,10 @@ static inline void collapse_pte_mapped_thp(struct mm_struct *mm, (transparent_hugepage_flags & \ (1<flags)) - return __khugepaged_enter(mm); - return 0; + __khugepaged_enter(mm); } static inline void khugepaged_exit(struct mm_struct *mm) @@ -53,7 +52,7 @@ static inline void khugepaged_exit(struct mm_struct *mm) __khugepaged_exit(mm); } -static inline int khugepaged_enter(struct vm_area_struct *vma, +static inline void khugepaged_enter(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) @@ -62,27 +61,22 @@ static inline int khugepaged_enter(struct vm_area_struct *vma, (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && !(vm_flags & VM_NOHUGEPAGE) && !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - if (__khugepaged_enter(vma->vm_mm)) - return -ENOMEM; - return 0; + __khugepaged_enter(vma->vm_mm); } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - return 0; } static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline int khugepaged_enter(struct vm_area_struct *vma, - 
unsigned long vm_flags) +static inline void khugepaged_enter(struct vm_area_struct *vma, + unsigned long vm_flags) { - return 0; } -static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags) +static inline void khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { - return 0; } static inline void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) diff --git a/kernel/fork.c b/kernel/fork.c index 9796897560ab..0d13baf86650 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -612,9 +612,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = ksm_fork(mm, oldmm); if (retval) goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; + khugepaged_fork(mm, oldmm); prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1a00bdf0f1e8..efd06af61fca 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -726,8 +726,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma, vma->vm_flags))) - return VM_FAULT_OOM; + khugepaged_enter(vma, vma->vm_flags); + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1f4822dcc970..b43b294ca5f8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -365,8 +365,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (khugepaged_enter_vma_merge(vma, *vm_flags)) - return -ENOMEM; + khugepaged_enter_vma_merge(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; @@ -475,20 +474,20 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, return true; } -int __khugepaged_enter(struct mm_struct *mm) +void __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *mm_slot; int wakeup; mm_slot = alloc_mm_slot(); if (!mm_slot) - return -ENOMEM; + return; /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); - return 0; + return; } spin_lock(&khugepaged_mm_lock); @@ -504,11 +503,9 @@ int __khugepaged_enter(struct mm_struct *mm) mmgrab(mm); if (wakeup) wake_up_interruptible(&khugepaged_wait); - - return 0; } -int khugepaged_enter_vma_merge(struct vm_area_struct *vma, +void khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags) { unsigned long hstart, hend; @@ -519,13 +516,12 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, * file-private shmem THP is not supported. */ if (!hugepage_vma_check(vma, vm_flags)) - return 0; + return; hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) - return khugepaged_enter(vma, vm_flags); - return 0; + khugepaged_enter(vma, vm_flags); } void __khugepaged_exit(struct mm_struct *mm) -- cgit v1.2.3 From 2647d11b9e71d495b2d4985ebaf7a734373a4b80 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:49 -0700 Subject: mm: khugepaged: make hugepage_vma_check() non-static The hugepage_vma_check() could be reused by khugepaged_enter() and khugepaged_enter_vma_merge(), but it is static in khugepaged.c. Make it non-static and declare it in khugepaged.h. 
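For illustration, once the predicate is exported the inline fast path in khugepaged.h can delegate all policy decisions to it and keep only the cheap mm-flag test inline. A rough sketch of the resulting call pattern (equivalent to, not identical with, the hunk below), assuming the declarations added to khugepaged.h here:

	/* Sketch: cheap bit test stays inline, the policy check is shared. */
	static inline void khugepaged_enter(struct vm_area_struct *vma,
					    unsigned long vm_flags)
	{
		if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
		    khugepaged_enabled() &&
		    hugepage_vma_check(vma, vm_flags))
			__khugepaged_enter(vma->vm_mm);
	}
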
Link: https://lkml.kernel.org/r/20220510203222.24246-7-shy828301@gmail.com Signed-off-by: Yang Shi Suggested-by: Vlastimil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Rik van Riel Cc: Song Liu Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 14 ++++++-------- mm/khugepaged.c | 25 +++++++++---------------- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 0423d3619f26..c340f6bb39d6 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -3,8 +3,6 @@ #define _LINUX_KHUGEPAGED_H #include /* MMF_VM_HUGEPAGE */ -#include - #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; @@ -12,6 +10,8 @@ extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); +extern bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma_merge(struct vm_area_struct *vma, @@ -55,13 +55,11 @@ static inline void khugepaged_exit(struct mm_struct *mm) static inline void khugepaged_enter(struct vm_area_struct *vma, unsigned long vm_flags) { - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) - if ((khugepaged_always() || - (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) || - (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && - !(vm_flags & VM_NOHUGEPAGE) && - !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + khugepaged_enabled()) { + if (hugepage_vma_check(vma, vm_flags)) __khugepaged_enter(vma->vm_mm); + } } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b43b294ca5f8..9ef626e2c750 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -437,8 +437,8 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } -static bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags) +bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags) { if (!transhuge_vma_enabled(vma, vm_flags)) return false; @@ -508,20 +508,13 @@ void __khugepaged_enter(struct mm_struct *mm) void khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags) { - unsigned long hstart, hend; - - /* - * khugepaged only supports read-only files for non-shmem files. - * khugepaged does not yet work on special mappings. And - * file-private shmem THP is not supported. 
- */ - if (!hugepage_vma_check(vma, vm_flags)) - return; - - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart < hend) - khugepaged_enter(vma, vm_flags); + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + khugepaged_enabled() && + (((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < + (vma->vm_end & HPAGE_PMD_MASK))) { + if (hugepage_vma_check(vma, vm_flags)) + __khugepaged_enter(vma->vm_mm); + } } void __khugepaged_exit(struct mm_struct *mm) -- cgit v1.2.3 From c791576c60288c89b351ea2d2098f6a872d78fa7 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:50 -0700 Subject: mm: khugepaged: introduce khugepaged_enter_vma() helper The khugepaged_enter_vma_merge() actually does as the same thing as the khugepaged_enter() section called by shmem_mmap(), so consolidate them into one helper and rename it to khugepaged_enter_vma(). Link: https://lkml.kernel.org/r/20220510203222.24246-8-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastmil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Rik van Riel Cc: Song Liu Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 8 ++++---- mm/khugepaged.c | 6 +++--- mm/mmap.c | 8 ++++---- mm/shmem.c | 12 ++---------- 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index c340f6bb39d6..392d34c3c59a 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -14,8 +14,8 @@ extern bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); -extern void khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags); +extern void khugepaged_enter_vma(struct vm_area_struct *vma, + unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); @@ -72,8 +72,8 @@ static inline void khugepaged_enter(struct vm_area_struct *vma, unsigned long vm_flags) { } -static inline void khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags) +static inline void khugepaged_enter_vma(struct vm_area_struct *vma, + unsigned long vm_flags) { } static inline void collapse_pte_mapped_thp(struct mm_struct *mm, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9ef626e2c750..16be62d493cd 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -365,7 +365,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. 
*/ - khugepaged_enter_vma_merge(vma, *vm_flags); + khugepaged_enter_vma(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; @@ -505,8 +505,8 @@ void __khugepaged_enter(struct mm_struct *mm) wake_up_interruptible(&khugepaged_wait); } -void khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags) +void khugepaged_enter_vma(struct vm_area_struct *vma, + unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && khugepaged_enabled() && diff --git a/mm/mmap.c b/mm/mmap.c index 7f7d982721b9..4456bf81e11e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1223,7 +1223,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL, prev); if (err) return NULL; - khugepaged_enter_vma_merge(prev, vm_flags); + khugepaged_enter_vma(prev, vm_flags); return prev; } @@ -1250,7 +1250,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } if (err) return NULL; - khugepaged_enter_vma_merge(area, vm_flags); + khugepaged_enter_vma(area, vm_flags); return area; } @@ -2450,7 +2450,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma_merge(vma, vma->vm_flags); + khugepaged_enter_vma(vma, vma->vm_flags); validate_mm(mm); return error; } @@ -2528,7 +2528,7 @@ int expand_downwards(struct vm_area_struct *vma, } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma_merge(vma, vma->vm_flags); + khugepaged_enter_vma(vma, vma->vm_flags); validate_mm(mm); return error; } diff --git a/mm/shmem.c b/mm/shmem.c index 29701be579f8..89f6f4fec3f9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2232,11 +2232,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < - (vma->vm_end & HPAGE_PMD_MASK)) { - khugepaged_enter(vma, vma->vm_flags); - } + khugepaged_enter_vma(vma, vma->vm_flags); return 0; } @@ -4137,11 +4133,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < - (vma->vm_end & HPAGE_PMD_MASK)) { - khugepaged_enter(vma, vma->vm_flags); - } + khugepaged_enter_vma(vma, vma->vm_flags); return 0; } -- cgit v1.2.3 From 613bec092fe78307a8b130353ce1ef340915587f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:50 -0700 Subject: mm: mmap: register suitable readonly file vmas for khugepaged The readonly FS THP relies on khugepaged to collapse THP for suitable vmas. But the behavior is inconsistent for "always" mode (https://lore.kernel.org/linux-mm/00f195d4-d039-3cf2-d3a1-a2c88de397a0@suse.cz/). The "always" mode means THP allocation should be tried all the time and khugepaged should try to collapse THP all the time. Of course the allocation and collapse may fail due to other factors and conditions. Currently file THP may not be collapsed by khugepaged even though all the conditions are met. That does break the semantics of "always" mode. So make sure readonly FS vmas are registered to khugepaged to fix the break. Register suitable vmas in common mmap path, that could cover both readonly FS vmas and shmem vmas, so remove the khugepaged calls in shmem.c. Still need to keep the khugepaged call in vma_merge() since vma_merge() is called in a lot of places, for example, madvise, mprotect, etc. 
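For reference, the "suitable readonly file vma" test is the one hugepage_vma_check() already performs via file_thp_enabled(); a sketch based on the open-coded check removed earlier in this series (the helper name below is only for illustration):

	/* Sketch of the readonly-file condition khugepaged accepts. */
	static bool vma_is_suitable_readonly_file(struct vm_area_struct *vma,
						  unsigned long vm_flags)
	{
		struct inode *inode;

		if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) || !vma->vm_file ||
		    !(vm_flags & VM_EXEC))
			return false;

		inode = vma->vm_file->f_inode;
		return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
	}
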
Link: https://lkml.kernel.org/r/20220510203222.24246-9-shy828301@gmail.com Signed-off-by: Yang Shi Reported-by: Vlastimil Babka Acked-by: Vlastmil Babka Cc: Kirill A. Shutemov Cc: Miaohe Lin Cc: Song Liu Cc: Rik van Riel Cc: Matthew Wilcox (Oracle) Cc: Zi Yan Cc: Theodore Ts'o Cc: Song Liu Signed-off-by: Andrew Morton --- mm/mmap.c | 7 +++++++ mm/shmem.c | 4 ---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 4456bf81e11e..2b9305ed0dda 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1847,6 +1847,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } vma_link(mm, vma, prev, rb_link, rb_parent); + + /* + * vma_merge() calls khugepaged_enter_vma() either, the below + * call covers the non-merge case. + */ + khugepaged_enter_vma(vma, vma->vm_flags); + /* Once vma denies write, undo our temporary denial count */ unmap_writable: if (file && vm_flags & VM_SHARED) diff --git a/mm/shmem.c b/mm/shmem.c index 89f6f4fec3f9..67a3f3b05fb2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -2232,7 +2231,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &shmem_vm_ops; - khugepaged_enter_vma(vma, vma->vm_flags); return 0; } @@ -4133,8 +4131,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - khugepaged_enter_vma(vma, vma->vm_flags); - return 0; } -- cgit v1.2.3 From 92bafb20b2edd1ab5022a9bce6fb73ed9a749453 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:50 -0700 Subject: mm/swap: use helper is_swap_pte() in swap_vma_readahead Patch series "A few cleanup patches for swap". This series contains a few patches to fix the comment, remove unneeded return value, use some helpers and so on. More details can be found in the respective changelogs. This patch (of 14): Use helper is_swap_pte() to check whether pte is swap entry to make code more clear. Minor readability improvement. Link: https://lkml.kernel.org/r/20220509131416.17553-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220509131416.17553-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: David Howells Cc: NeilBrown Cc: Alistair Popple Cc: Suren Baghdasaryan Cc: Peter Xu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/swap_state.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 416aaaa8a7ed..4667fa406b14 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -818,9 +818,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte; i++, pte++) { pentry = *pte; - if (pte_none(pentry)) - continue; - if (pte_present(pentry)) + if (!is_swap_pte(pentry)) continue; entry = pte_to_swp_entry(pentry); if (unlikely(non_swap_entry(entry))) -- cgit v1.2.3 From 6106b93efad1ea14d428558c2e4efc5f76874c23 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:50 -0700 Subject: mm/swap: use helper macro __ATTR_RW Use helper macro __ATTR_RW to define vma_ra_enabled_attr to make code more clear. Minor readability improvement. 
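The shorthand works because the handlers follow the <name>_show()/<name>_store() naming convention that __ATTR_RW() from <linux/sysfs.h> relies on; the macro fills in the 0644 mode and both callbacks. Sketch of the equivalence:

	/* __ATTR_RW(vma_ra_enabled) expands to the long-hand form it replaces: */
	static struct kobj_attribute vma_ra_enabled_attr =
		__ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, vma_ra_enabled_store);
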
Link: https://lkml.kernel.org/r/20220509131416.17553-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/swap_state.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 4667fa406b14..b9e4ed2e90bf 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -884,9 +884,7 @@ static ssize_t vma_ra_enabled_store(struct kobject *kobj, return count; } -static struct kobj_attribute vma_ra_enabled_attr = - __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, - vma_ra_enabled_store); +static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, -- cgit v1.2.3 From afba72b17139057aedf7eb2dfe5f6ea6700ab998 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:50 -0700 Subject: mm/swap: fold __swap_info_get() into its sole caller Fold __swap_info_get() into its sole caller to make code more clear. Minor readability improvement. Link: https://lkml.kernel.org/r/20220509131416.17553-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/swapfile.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index bfa41bf5e190..570287b786cc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1122,7 +1122,7 @@ noswap: return n_ret; } -static struct swap_info_struct *__swap_info_get(swp_entry_t entry) +static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; unsigned long offset; @@ -1137,8 +1137,13 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry) offset = swp_offset(entry); if (offset >= p->max) goto bad_offset; + if (data_race(!p->swap_map[swp_offset(entry)])) + goto bad_free; return p; +bad_free: + pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); + goto out; bad_offset: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); goto out; @@ -1151,23 +1156,6 @@ out: return NULL; } -static struct swap_info_struct *_swap_info_get(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = __swap_info_get(entry); - if (!p) - goto out; - if (data_race(!p->swap_map[swp_offset(entry)])) - goto bad_free; - return p; - -bad_free: - pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); -out: - return NULL; -} - static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, struct swap_info_struct *q) { -- cgit v1.2.3 From bc4a68adb151f08b50822158c7767f5726346370 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:50 -0700 Subject: mm/swap: remove unneeded return value of free_swap_slot The return value of free_swap_slot is always 0 and also ignored now. Remove it to clean up the code. 
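A minimal sketch of the interface after the change; call sites need no adjustment because they already invoke the function as a plain statement:

	/* include/linux/swap_slots.h */
	void free_swap_slot(swp_entry_t entry);

	/* callers fire and forget; a cache miss still falls back to
	 * swapcache_free_entries() inside free_swap_slot() itself */
	free_swap_slot(entry);
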
Link: https://lkml.kernel.org/r/20220509131416.17553-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/swap_slots.h | 2 +- mm/swap_slots.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index 347f1a304190..15adfb8c813a 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -24,7 +24,7 @@ struct swap_slots_cache { void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); void enable_swap_slots_cache(void); -int free_swap_slot(swp_entry_t entry); +void free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0218ec1cd24c..2f877e6f87d7 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -269,7 +269,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) return cache->nr; } -int free_swap_slot(swp_entry_t entry) +void free_swap_slot(swp_entry_t entry) { struct swap_slots_cache *cache; @@ -297,8 +297,6 @@ int free_swap_slot(swp_entry_t entry) direct_free: swapcache_free_entries(&entry, 1); } - - return 0; } swp_entry_t folio_alloc_swap(struct folio *folio) -- cgit v1.2.3 From 23b230ba8ac30d334afd249c9763e0d0ca583e43 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:51 -0700 Subject: mm/swap: print bad swap offset entry in get_swap_device If offset exceeds the si->max, print bad swap offset entry to help debug the unexpected case. Link: https://lkml.kernel.org/r/20220509131416.17553-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/swapfile.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 570287b786cc..f59abca5cea9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1271,6 +1271,7 @@ bad_nofile: out: return NULL; put_out: + pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); percpu_ref_put(&si->users); return NULL; } -- cgit v1.2.3 From f19c25684c3e19b329e0498dd6d601df13c33223 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:51 -0700 Subject: mm/swap: remove buggy cache->nr check in refill_swap_slots_cache refill_swap_slots_cache is always called when cache->nr is 0. So remove such buggy and confusing check. 
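The guarantee comes from the sole caller: the allocation fast path only drops into the refill branch when the per-cpu cache is empty. A paraphrased, simplified sketch of that call site (folio_alloc_swap() in mm/swap_slots.c):

	/* under cache->alloc_lock */
	repeat:
		if (cache->nr) {
			/* hand out a cached slot */
			entry = cache->slots[cache->cur++];
			cache->nr--;
		} else if (refill_swap_slots_cache(cache)) {
			/* cache->nr is always 0 here, so the extra check was dead */
			goto repeat;
		}
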
Link: https://lkml.kernel.org/r/20220509131416.17553-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/swap_slots.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 2f877e6f87d7..2a65a89b5b4d 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -258,7 +258,7 @@ out_unlock: /* called with swap slot cache's alloc lock held */ static int refill_swap_slots_cache(struct swap_slots_cache *cache) { - if (!use_swap_slot_cache || cache->nr) + if (!use_swap_slot_cache) return 0; cache->cur = 0; -- cgit v1.2.3 From dab8dfff49a673a4b070ad3f44534b3c0e88a189 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:51 -0700 Subject: mm/swap: remove unneeded p != NULL check in __swap_duplicate If p is NULL, __swap_duplicate will already return -EINVAL. So if we reach here, p must be non-NULL. Link: https://lkml.kernel.org/r/20220509131416.17553-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/swapfile.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f59abca5cea9..e67addac61e2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3333,8 +3333,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); - if (p) - put_swap_device(p); + put_swap_device(p); return err; } -- cgit v1.2.3 From 3db3264d8a5f4816c022eb9052694ce51e657bfc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:51 -0700 Subject: mm/swap: make page_swapcount and __lru_add_drain_all static Make page_swapcount and __lru_add_drain_all static. They are only used within the file now. 
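With the extern and the !CONFIG_SWAP stub gone, code outside mm/swapfile.c keeps using the still-exported counters visible in the hunk below; a sketch of the remaining interface:

	/* still exported from mm/swapfile.c via <linux/swap.h>: */
	extern int __swap_count(swp_entry_t entry);	/* map count, SWAP_HAS_CACHE masked off */
	extern int __swp_swapcount(swp_entry_t entry);
	extern int swp_swapcount(swp_entry_t entry);	/* includes swap count continuations */
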
Link: https://lkml.kernel.org/r/20220509131416.17553-9-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 ------- mm/swap.c | 2 +- mm/swapfile.c | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 6bf1506e5080..0ff887cad383 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -485,7 +485,6 @@ int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); -extern int page_swapcount(struct page *); extern int __swap_count(swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); @@ -557,12 +556,6 @@ static inline void put_swap_page(struct page *page, swp_entry_t swp) { } - -static inline int page_swapcount(struct page *page) -{ - return 0; -} - static inline int __swap_count(swp_entry_t entry) { return 0; diff --git a/mm/swap.c b/mm/swap.c index 7e320ec08c6a..6d2c37f781f8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -748,7 +748,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ -inline void __lru_add_drain_all(bool force_all_cpus) +static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number diff --git a/mm/swapfile.c b/mm/swapfile.c index e67addac61e2..f64b298f1034 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1429,7 +1429,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -int page_swapcount(struct page *page) +static int page_swapcount(struct page *page) { int count = 0; struct swap_info_struct *p; -- cgit v1.2.3 From eacde32757c7566d3aa760609585c78909532e40 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:51 -0700 Subject: mm/swap: avoid calling swp_swap_info when try to check SWP_STABLE_WRITES Use flags of si directly to check SWP_STABLE_WRITES to avoid possible READ_ONCE and thus save some cpu cycles. [akpm@linux-foundation.org: use data_race() on si->flags, per Neil] Link: https://lkml.kernel.org/r/20220509131416.17553-10-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alistair Popple Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index f4b2c05707b9..2bf5bca39567 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3889,7 +3889,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ exclusive = true; } else if (exclusive && PageWriteback(page) && - (swp_swap_info(entry)->flags & SWP_STABLE_WRITES)) { + data_race(si->flags & SWP_STABLE_WRITES)) { /* * This is tricky: not all swap backends support * concurrent page modifications while under writeback. 
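The pattern relied on here: the flags word may change concurrently, but this test tolerates a stale value, so the plain read is wrapped in data_race() to document the benign race for KCSAN rather than going through swp_swap_info() and a READ_ONCE(). A sketch with a hypothetical helper name, assuming si was pinned earlier (e.g. via get_swap_device()):

	/* Illustrative only. */
	static bool swap_needs_stable_writes(struct swap_info_struct *si)
	{
		/* racy but benign read of si->flags; annotated for KCSAN */
		return data_race(si->flags & SWP_STABLE_WRITES);
	}
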
-- cgit v1.2.3 From 4b9ae8426cb4a9160ae41bc968391f499feece7e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:52 -0700 Subject: mm/swap: add helper swap_offset_available() Add helper swap_offset_available() to remove some duplicated codes. Minor readability improvement. [akpm@linux-foundation.org: s/swap_offset_available/swap_offset_available_and_locked/, per Neil] Link: https://lkml.kernel.org/r/20220509131416.17553-12-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alistair Popple Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/swapfile.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f64b298f1034..331aa0cc5b9e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -775,6 +775,22 @@ static void set_cluster_next(struct swap_info_struct *si, unsigned long next) this_cpu_write(*si->cluster_next_cpu, next); } +static bool swap_offset_available_and_locked(struct swap_info_struct *si, + unsigned long offset) +{ + if (data_race(!si->swap_map[offset])) { + spin_lock(&si->lock); + return true; + } + + if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { + spin_lock(&si->lock); + return true; + } + + return false; +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -952,15 +968,8 @@ done: scan: spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { - if (data_race(!si->swap_map[offset])) { - spin_lock(&si->lock); + if (swap_offset_available_and_locked(si, offset)) goto checks; - } - if (vm_swap_full() && - READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - spin_lock(&si->lock); - goto checks; - } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; @@ -969,15 +978,8 @@ scan: } offset = si->lowest_bit; while (offset < scan_base) { - if (data_race(!si->swap_map[offset])) { - spin_lock(&si->lock); + if (swap_offset_available_and_locked(si, offset)) goto checks; - } - if (vm_swap_full() && - READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - spin_lock(&si->lock); - goto checks; - } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; -- cgit v1.2.3 From a930c210c42dec32b031ad7645bd6904701b5297 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:52 -0700 Subject: mm/swap: fix the obsolete comment for SWP_TYPE_SHIFT Since commit 3159f943aafd ("xarray: Replace exceptional entries"), there is only one bit of 'type' can be shifted up. Update the corresponding comment. Link: https://lkml.kernel.org/r/20220509131416.17553-13-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/swapops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 09945342bf76..fe220df499f1 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -13,10 +13,10 @@ * get good packing density in that tree, so the index should be dense in * the low-order bits. 
* - * We arrange the `type' and `offset' fields so that `type' is at the seven + * We arrange the `type' and `offset' fields so that `type' is at the six * high-order bits of the swp_entry_t and `offset' is right-aligned in the * remaining bits. Although `type' itself needs only five bits, we allow for - * shmem/tmpfs to shift it all up a further two bits: see swp_to_radix_entry(). + * shmem/tmpfs to shift it all up a further one bit: see swp_to_radix_entry(). * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ -- cgit v1.2.3 From 3c3115ad6baddbf526521231ea43d9fff34dfa11 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:52 -0700 Subject: mm/swap: clean up the comment of find_next_to_unuse Since commit 10a9c496789f ("mm: simplify try_to_unuse"), frontswap parameter is removed. Update the corresponding comment. Link: https://lkml.kernel.org/r/20220509131416.17553-14-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/swapfile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 331aa0cc5b9e..0535afc897ab 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1990,9 +1990,9 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) } /* - * Scan swap_map (or frontswap_map if frontswap parameter is true) - * from current position to next entry still in use. Return 0 - * if there are no inuse entries after prev till end of the map. + * Scan swap_map from current position to next entry still in use. + * Return 0 if there are no inuse entries after prev till end of + * the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) -- cgit v1.2.3 From 133d2743ef93ea8a979832d2ac72fb9d331045f3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:52 -0700 Subject: mm/swap: fix the comment of get_kernel_pages If no pages were pinned, 0 is returned in fact. Fix the corresponding comment. [akpm@linux-foundation.org: s/nr_pages/nr_segs/ also, per David, reflow comment] Link: https://lkml.kernel.org/r/20220509131416.17553-15-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/swap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 6d2c37f781f8..f3922a96b2e9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -166,10 +166,10 @@ EXPORT_SYMBOL(put_pages_list); * @pages: array that receives pointers to the pages pinned. * Should be at least nr_segs long. * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. + * Returns number of pages pinned. This may be fewer than the number requested. + * If nr_segs is 0 or negative, returns 0. If no pages were pinned, returns 0. + * Each page returned must be released with a put_page() call when it is + * finished with. 
*/ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, struct page **pages) -- cgit v1.2.3 From ff351f4bb9606a6c9a1f3fcdd918635b51514779 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:52 -0700 Subject: mm/swap: fix comment about swap extent Since commit 4efaceb1c5f8 ("mm, swap: use rbtree for swap_extent"), rbtree is used for swap extent. Also curr_swap_extent is removed at that time. Update the corresponding comment. Link: https://lkml.kernel.org/r/20220509131416.17553-16-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alistair Popple Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/swapfile.c | 15 ++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ff887cad383..04c0713c1d6e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -168,8 +168,8 @@ struct zone; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of - * disk blocks. A list of swap extents maps the entire swapfile. (Where the - * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart + * disk blocks. A rbtree of swap extents maps the entire swapfile (Where the + * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart * from setup, they're handled identically. * * We always assume that blocks are of size PAGE_SIZE. diff --git a/mm/swapfile.c b/mm/swapfile.c index 0535afc897ab..a0eb690d9926 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2208,8 +2208,8 @@ EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages - * onto a contiguous range of disk blocks. An ordered list of swap extents - * is built at swapon time and is then used at swap_writepage/swap_readpage + * onto a contiguous range of disk blocks. A rbtree of swap extents is + * built at swapon time and is then used at swap_writepage/swap_readpage * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. @@ -2217,12 +2217,12 @@ EXPORT_SYMBOL_GPL(add_swap_extent); * swap files identically. * * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap - * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK + * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK * swapfiles are handled *identically* after swapon time. * * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks - * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If - * some stray blocks are found which do not fall within the PAGE_SIZE alignment + * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray + * blocks are found which do not fall within the PAGE_SIZE alignment * requirements, they are simply tossed out - we will never use those blocks * for swapping. * @@ -2231,10 +2231,7 @@ EXPORT_SYMBOL_GPL(add_swap_extent); * * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of - * extents in the list. To avoid much list walking, we cache the previous - * search location in `curr_swap_extent', and start new searches from there. - * This is extremely effective. 
The average number of iterations in - * map_swap_page() has been measured at about 0.3 per page. - akpm. + * extents in the rbtree. - akpm. */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { -- cgit v1.2.3 From 39799b6409febf628337bd7804c3861a9f7f7e8a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 19 May 2022 14:08:52 -0700 Subject: Documentation: filesystems: proc: update meminfo section Patch series "zswap: accounting & cgroup control", v2. Zswap can consume nearly a quarter of RAM in the default configuration, yet it's neither listed in /proc/meminfo, nor is it accounted and manageable on a per-cgroup basis. This makes reasoning about the memory situation on a host in general rather difficult. On shared/cgrouped hosts, the consequences are worse. First, workloads can escape memory containment and cause resource priority inversions: a lo-pri group can fill the global zswap pool and force a hi-pri group out to disk. Second, not all workloads benefit from zswap equally. Some even suffer when memory contents compress poorly, and are better off going to disk swap directly. On a host with mixed workloads, it's currently not possible to enable zswap for one workload but not for the other. This series implements the missing global accounting as well as cgroup tracking & control for zswap backing memory: - Patch 1 refreshes the very out-of-date meminfo documentation in Documentation/filesystems/proc.rst. - Patches 2-4 clean up related and adjacent options in Kconfig. Not actual dependencies, just things I noticed during development. - Patch 5 adds meminfo and vmstat coverage for zswap consumption and activity. - Patch 6 implements per-cgroup tracking & control of zswap memory. This patch (of 6): Add new entries. Minor corrections and cleanups. [hannes@cmpxchg.org: fix htmldocs warnings] Link: https://lkml.kernel.org/r/Ynve8dg4zJyhH2gW@cmpxchg.org [hannes@cmpxchg.org: change `Unevictable' wording, per David] Link: https://lkml.kernel.org/r/YnwFraZlVWQoCjz3@cmpxchg.org Link: https://lkml.kernel.org/r/20220510152847.230957-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20220510152847.230957-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: David Hildenbrand Cc: Dan Streetman Cc: Michal Hocko Cc: Minchan Kim Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 148 +++++++++++++++++++++++-------------- 1 file changed, 92 insertions(+), 56 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 061744c436d9..5e9791457876 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -942,56 +942,71 @@ can be substantial. In many cases there are other means to find out additional memory using subsystem specific interfaces, for instance /proc/net/sockstat for TCP memory allocations. -The following is from a 16GB PIII, which has highmem enabled. -You may not have all of these fields. +Example output. You may not have all of these fields. 
:: > cat /proc/meminfo - MemTotal: 16344972 kB - MemFree: 13634064 kB - MemAvailable: 14836172 kB - Buffers: 3656 kB - Cached: 1195708 kB - SwapCached: 0 kB - Active: 891636 kB - Inactive: 1077224 kB - HighTotal: 15597528 kB - HighFree: 13629632 kB - LowTotal: 747444 kB - LowFree: 4432 kB - SwapTotal: 0 kB - SwapFree: 0 kB - Dirty: 968 kB - Writeback: 0 kB - AnonPages: 861800 kB - Mapped: 280372 kB - Shmem: 644 kB - KReclaimable: 168048 kB - Slab: 284364 kB - SReclaimable: 159856 kB - SUnreclaim: 124508 kB - PageTables: 24448 kB - NFS_Unstable: 0 kB - Bounce: 0 kB - WritebackTmp: 0 kB - CommitLimit: 7669796 kB - Committed_AS: 100056 kB - VmallocTotal: 112216 kB - VmallocUsed: 428 kB - VmallocChunk: 111088 kB - Percpu: 62080 kB - HardwareCorrupted: 0 kB - AnonHugePages: 49152 kB - ShmemHugePages: 0 kB - ShmemPmdMapped: 0 kB + MemTotal: 32858820 kB + MemFree: 21001236 kB + MemAvailable: 27214312 kB + Buffers: 581092 kB + Cached: 5587612 kB + SwapCached: 0 kB + Active: 3237152 kB + Inactive: 7586256 kB + Active(anon): 94064 kB + Inactive(anon): 4570616 kB + Active(file): 3143088 kB + Inactive(file): 3015640 kB + Unevictable: 0 kB + Mlocked: 0 kB + SwapTotal: 0 kB + SwapFree: 0 kB + Dirty: 12 kB + Writeback: 0 kB + AnonPages: 4654780 kB + Mapped: 266244 kB + Shmem: 9976 kB + KReclaimable: 517708 kB + Slab: 660044 kB + SReclaimable: 517708 kB + SUnreclaim: 142336 kB + KernelStack: 11168 kB + PageTables: 20540 kB + NFS_Unstable: 0 kB + Bounce: 0 kB + WritebackTmp: 0 kB + CommitLimit: 16429408 kB + Committed_AS: 7715148 kB + VmallocTotal: 34359738367 kB + VmallocUsed: 40444 kB + VmallocChunk: 0 kB + Percpu: 29312 kB + HardwareCorrupted: 0 kB + AnonHugePages: 4149248 kB + ShmemHugePages: 0 kB + ShmemPmdMapped: 0 kB + FileHugePages: 0 kB + FilePmdMapped: 0 kB + CmaTotal: 0 kB + CmaFree: 0 kB + HugePages_Total: 0 + HugePages_Free: 0 + HugePages_Rsvd: 0 + HugePages_Surp: 0 + Hugepagesize: 2048 kB + Hugetlb: 0 kB + DirectMap4k: 401152 kB + DirectMap2M: 10008576 kB + DirectMap1G: 24117248 kB MemTotal Total usable RAM (i.e. physical RAM minus a few reserved bits and the kernel binary code) MemFree - The sum of LowFree+HighFree + Total free RAM. On highmem systems, the sum of LowFree+HighFree MemAvailable An estimate of how much memory is available for starting new applications, without swapping. Calculated from MemFree, @@ -1005,8 +1020,9 @@ Buffers Relatively temporary storage for raw disk blocks shouldn't get tremendously large (20MB or so) Cached - in-memory cache for files read from the disk (the - pagecache). Doesn't include SwapCached + In-memory cache for files read from the disk (the + pagecache) as well as tmpfs & shmem. + Doesn't include SwapCached. SwapCached Memory that once was swapped out, is swapped back in but still also is in the swapfile (if memory is needed it @@ -1018,6 +1034,11 @@ Active Inactive Memory which has been less recently used. It is more eligible to be reclaimed for other purposes +Unevictable + Memory allocated for userspace which cannot be reclaimed, such + as mlocked pages, ramfs backing pages, secret memfd pages etc. +Mlocked + Memory locked with mlock(). HighTotal, HighFree Highmem is all memory above ~860MB of physical memory. Highmem areas are for use by userspace programs, or @@ -1040,20 +1061,10 @@ Writeback Memory which is actively being written back to the disk AnonPages Non-file backed pages mapped into userspace page tables -HardwareCorrupted - The amount of RAM/memory in KB, the kernel identifies as - corrupted. 
-AnonHugePages - Non-file backed huge pages mapped into userspace page tables Mapped files which have been mmaped, such as libraries Shmem Total memory used by shared memory (shmem) and tmpfs -ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated - with huge pages -ShmemPmdMapped - Shared memory mapped into userspace with huge pages KReclaimable Kernel allocations that the kernel will attempt to reclaim under memory pressure. Includes SReclaimable (below), and other @@ -1064,9 +1075,10 @@ SReclaimable Part of Slab, that might be reclaimed, such as caches SUnreclaim Part of Slab, that cannot be reclaimed on memory pressure +KernelStack + Memory consumed by the kernel stacks of all tasks PageTables - amount of memory dedicated to the lowest level of page - tables. + Memory consumed by userspace page tables NFS_Unstable Always zero. Previous counted pages which had been written to the server, but has not been committed to stable storage. @@ -1098,7 +1110,7 @@ Committed_AS has been allocated by processes, even if it has not been "used" by them as of yet. A process which malloc()'s 1G of memory, but only touches 300M of it will show up as - using 1G. This 1G is memory which has been "committed" to + using 1G. This 1G is memory which has been "committed" to by the VM and can be used at any time by the allocating application. With strict overcommit enabled on the system (mode 2 in 'vm.overcommit_memory'), allocations which would @@ -1107,7 +1119,7 @@ Committed_AS not fail due to lack of memory once that memory has been successfully allocated. VmallocTotal - total size of vmalloc memory area + total size of vmalloc virtual address space VmallocUsed amount of vmalloc area which is used VmallocChunk @@ -1115,6 +1127,30 @@ VmallocChunk Percpu Memory allocated to the percpu allocator used to back percpu allocations. This stat excludes the cost of metadata. +HardwareCorrupted + The amount of RAM/memory in KB, the kernel identifies as + corrupted. +AnonHugePages + Non-file backed huge pages mapped into userspace page tables +ShmemHugePages + Memory used by shared memory (shmem) and tmpfs allocated + with huge pages +ShmemPmdMapped + Shared memory mapped into userspace with huge pages +FileHugePages + Memory used for filesystem data (page cache) allocated + with huge pages +FilePmdMapped + Page cache mapped into userspace with huge pages +CmaTotal + Memory reserved for the Contiguous Memory Allocator (CMA) +CmaFree + Free remaining memory in the CMA reserves +HugePages_Total, HugePages_Free, HugePages_Rsvd, HugePages_Surp, Hugepagesize, Hugetlb + See Documentation/admin-guide/mm/hugetlbpage.rst. +DirectMap4k, DirectMap2M, DirectMap1G + Breakdown of page table sizes used in the kernel's + identity mapping of RAM vmallocinfo ~~~~~~~~~~~ -- cgit v1.2.3 From 7b42f1041c98f5d7da74da8d1653a4b2c380e49d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 19 May 2022 14:08:53 -0700 Subject: mm: Kconfig: move swap and slab config options to the MM section These are currently under General Setup. MM seems like a better fit. 
Link: https://lkml.kernel.org/r/20220510152847.230957-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Dan Streetman Cc: Michal Hocko Cc: Minchan Kim Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Signed-off-by: Andrew Morton --- init/Kconfig | 123 ----------------------------------------------------------- mm/Kconfig | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+), 123 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index b062cd745de6..1d2ecd227f4b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -352,23 +352,6 @@ config DEFAULT_HOSTNAME but you may wish to use a different default here to make a minimal system more usable with less configuration. -# -# For some reason microblaze and nios2 hard code SWAP=n. Hopefully we can -# add proper SWAP support to them, in which case this can be remove. -# -config ARCH_NO_SWAP - bool - -config SWAP - bool "Support for paging of anonymous memory (swap)" - depends on MMU && BLOCK && !ARCH_NO_SWAP - default y - help - This option allows you to choose whether you want to have support - for so called swap devices or swap files in your kernel that are - used to provide more virtual memory than the actual RAM present - in your computer. If unsure say Y. - config SYSVIPC bool "System V IPC" help @@ -1876,112 +1859,6 @@ config COMPAT_BRK On non-ancient distros (post-2000 ones) N is usually a safe choice. -choice - prompt "Choose SLAB allocator" - default SLUB - help - This option allows to select a slab allocator. - -config SLAB - bool "SLAB" - depends on !PREEMPT_RT - select HAVE_HARDENED_USERCOPY_ALLOCATOR - help - The regular slab allocator that is established and known to work - well in all environments. It organizes cache hot objects in - per cpu and per node queues. - -config SLUB - bool "SLUB (Unqueued Allocator)" - select HAVE_HARDENED_USERCOPY_ALLOCATOR - help - SLUB is a slab allocator that minimizes cache line usage - instead of managing queues of cached objects (SLAB approach). - Per cpu caching is realized using slabs of objects instead - of queues of objects. SLUB can use memory efficiently - and has enhanced diagnostics. SLUB is the default choice for - a slab allocator. - -config SLOB - depends on EXPERT - bool "SLOB (Simple Allocator)" - depends on !PREEMPT_RT - help - SLOB replaces the stock allocator with a drastically simpler - allocator. SLOB is generally more space efficient but - does not perform as well on large systems. - -endchoice - -config SLAB_MERGE_DEFAULT - bool "Allow slab caches to be merged" - default y - depends on SLAB || SLUB - help - For reduced kernel memory fragmentation, slab caches can be - merged when they share the same size and other characteristics. - This carries a risk of kernel heap overflows being able to - overwrite objects from merged caches (and more easily control - cache layout), which makes such heap attacks easier to exploit - by attackers. By keeping caches unmerged, these kinds of exploits - can usually only damage objects in the same cache. To disable - merging at runtime, "slab_nomerge" can be passed on the kernel - command line. - -config SLAB_FREELIST_RANDOM - bool "Randomize slab freelist" - depends on SLAB || SLUB - help - Randomizes the freelist order used on creating new pages. This - security feature reduces the predictability of the kernel slab - allocator against heap overflows. 
- -config SLAB_FREELIST_HARDENED - bool "Harden slab freelist metadata" - depends on SLAB || SLUB - help - Many kernel heap attacks try to target slab cache metadata and - other infrastructure. This options makes minor performance - sacrifices to harden the kernel slab allocator against common - freelist exploit methods. Some slab implementations have more - sanity-checking than others. This option is most effective with - CONFIG_SLUB. - -config SHUFFLE_PAGE_ALLOCATOR - bool "Page allocator randomization" - default SLAB_FREELIST_RANDOM && ACPI_NUMA - help - Randomization of the page allocator improves the average - utilization of a direct-mapped memory-side-cache. See section - 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI - 6.2a specification for an example of how a platform advertises - the presence of a memory-side-cache. There are also incidental - security benefits as it reduces the predictability of page - allocations to compliment SLAB_FREELIST_RANDOM, but the - default granularity of shuffling on the "MAX_ORDER - 1" i.e, - 10th order of pages is selected based on cache utilization - benefits on x86. - - While the randomization improves cache utilization it may - negatively impact workloads on platforms without a cache. For - this reason, by default, the randomization is enabled only - after runtime detection of a direct-mapped memory-side-cache. - Otherwise, the randomization may be force enabled with the - 'page_alloc.shuffle' kernel command line parameter. - - Say Y if unsure. - -config SLUB_CPU_PARTIAL - default y - depends on SLUB && SMP - bool "SLUB per cpu partial cache" - help - Per cpu partial caches accelerate objects allocation and freeing - that is local to a processor at the price of more indeterminism - in the latency of the free. On overflow these caches will be cleared - which requires the taking of locks that may cause latency spikes. - Typically one would choose no for a realtime system. - config MMAP_ALLOW_UNINITIALIZED bool "Allow mmapped anonymous memory to be uninitialized" depends on EXPERT && !MMU diff --git a/mm/Kconfig b/mm/Kconfig index c2141dd639e3..675a6be43739 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -2,6 +2,129 @@ menu "Memory Management options" +# +# For some reason microblaze and nios2 hard code SWAP=n. Hopefully we can +# add proper SWAP support to them, in which case this can be remove. +# +config ARCH_NO_SWAP + bool + +config SWAP + bool "Support for paging of anonymous memory (swap)" + depends on MMU && BLOCK && !ARCH_NO_SWAP + default y + help + This option allows you to choose whether you want to have support + for so called swap devices or swap files in your kernel that are + used to provide more virtual memory than the actual RAM present + in your computer. If unsure say Y. + +choice + prompt "Choose SLAB allocator" + default SLUB + help + This option allows to select a slab allocator. + +config SLAB + bool "SLAB" + depends on !PREEMPT_RT + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + The regular slab allocator that is established and known to work + well in all environments. It organizes cache hot objects in + per cpu and per node queues. + +config SLUB + bool "SLUB (Unqueued Allocator)" + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + SLUB is a slab allocator that minimizes cache line usage + instead of managing queues of cached objects (SLAB approach). + Per cpu caching is realized using slabs of objects instead + of queues of objects. SLUB can use memory efficiently + and has enhanced diagnostics. 
SLUB is the default choice for + a slab allocator. + +config SLOB + depends on EXPERT + bool "SLOB (Simple Allocator)" + depends on !PREEMPT_RT + help + SLOB replaces the stock allocator with a drastically simpler + allocator. SLOB is generally more space efficient but + does not perform as well on large systems. + +endchoice + +config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" + default y + depends on SLAB || SLUB + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. + This carries a risk of kernel heap overflows being able to + overwrite objects from merged caches (and more easily control + cache layout), which makes such heap attacks easier to exploit + by attackers. By keeping caches unmerged, these kinds of exploits + can usually only damage objects in the same cache. To disable + merging at runtime, "slab_nomerge" can be passed on the kernel + command line. + +config SLAB_FREELIST_RANDOM + bool "Randomize slab freelist" + depends on SLAB || SLUB + help + Randomizes the freelist order used on creating new pages. This + security feature reduces the predictability of the kernel slab + allocator against heap overflows. + +config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLAB || SLUB + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This options makes minor performance + sacrifices to harden the kernel slab allocator against common + freelist exploit methods. Some slab implementations have more + sanity-checking than others. This option is most effective with + CONFIG_SLUB. + +config SHUFFLE_PAGE_ALLOCATOR + bool "Page allocator randomization" + default SLAB_FREELIST_RANDOM && ACPI_NUMA + help + Randomization of the page allocator improves the average + utilization of a direct-mapped memory-side-cache. See section + 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI + 6.2a specification for an example of how a platform advertises + the presence of a memory-side-cache. There are also incidental + security benefits as it reduces the predictability of page + allocations to compliment SLAB_FREELIST_RANDOM, but the + default granularity of shuffling on the "MAX_ORDER - 1" i.e, + 10th order of pages is selected based on cache utilization + benefits on x86. + + While the randomization improves cache utilization it may + negatively impact workloads on platforms without a cache. For + this reason, by default, the randomization is enabled only + after runtime detection of a direct-mapped memory-side-cache. + Otherwise, the randomization may be force enabled with the + 'page_alloc.shuffle' kernel command line parameter. + + Say Y if unsure. + +config SLUB_CPU_PARTIAL + default y + depends on SLUB && SMP + bool "SLUB per cpu partial cache" + help + Per cpu partial caches accelerate objects allocation and freeing + that is local to a processor at the price of more indeterminism + in the latency of the free. On overflow these caches will be cleared + which requires the taking of locks that may cause latency spikes. + Typically one would choose no for a realtime system. 
+ config SELECT_MEMORY_MODEL def_bool y depends on ARCH_SELECT_MEMORY_MODEL -- cgit v1.2.3 From 519bcb797907dd0d1db4e71adb6f610aee80941e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 19 May 2022 14:08:53 -0700 Subject: mm: Kconfig: group swap, slab, hotplug and thp options into submenus There are several clusters of related config options spread throughout the mostly flat MM submenu. Group them together and put specialization options into further subdirectories to make the MM submenu a bit more organized and easier to navigate. [hannes@cmpxchg.org: fix kbuild warnings] Link: https://lkml.kernel.org/r/YnvkSVivfnT57Vwh@cmpxchg.org [hannes@cmpxchg.org: fix more kbuild warnings] Link: https://lkml.kernel.org/r/Ynz8NusTdEGcCnJN@cmpxchg.org Link: https://lkml.kernel.org/r/20220510152847.230957-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Dan Streetman Cc: Michal Hocko Cc: Minchan Kim Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/Kconfig | 447 +++++++++++++++++++++++++++++++------------------------------ 1 file changed, 230 insertions(+), 217 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 675a6be43739..f63d8dc36511 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -9,7 +9,7 @@ menu "Memory Management options" config ARCH_NO_SWAP bool -config SWAP +menuconfig SWAP bool "Support for paging of anonymous memory (swap)" depends on MMU && BLOCK && !ARCH_NO_SWAP default y @@ -19,6 +19,191 @@ config SWAP used to provide more virtual memory than the actual RAM present in your computer. If unsure say Y. +config ZSWAP + bool "Compressed cache for swap pages (EXPERIMENTAL)" + depends on SWAP && CRYPTO=y + select FRONTSWAP + select ZPOOL + help + A lightweight compressed cache for swap pages. It takes + pages that are in the process of being swapped out and attempts to + compress them into a dynamically allocated RAM-based memory pool. + This can result in a significant I/O reduction on swap device and, + in the case where decompressing from RAM is faster that swap device + reads, can also improve workload performance. + + This is marked experimental because it is a new feature (as of + v3.11) that interacts heavily with memory reclaim. While these + interactions don't cause any known issues on simple memory setups, + they have not be fully explored on the large set of potential + configurations and workloads that exist. + +choice + prompt "Compressed cache for swap pages default compressor" + depends on ZSWAP + default ZSWAP_COMPRESSOR_DEFAULT_LZO + help + Selects the default compression algorithm for the compressed cache + for swap pages. + + For an overview what kind of performance can be expected from + a particular compression algorithm please refer to the benchmarks + available at the following LWN page: + https://lwn.net/Articles/751795/ + + If in doubt, select 'LZO'. + + The selection made here can be overridden by using the kernel + command line 'zswap.compressor=' option. + +config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE + bool "Deflate" + select CRYPTO_DEFLATE + help + Use the Deflate algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_LZO + bool "LZO" + select CRYPTO_LZO + help + Use the LZO algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_842 + bool "842" + select CRYPTO_842 + help + Use the 842 algorithm as the default compression algorithm. 
+ +config ZSWAP_COMPRESSOR_DEFAULT_LZ4 + bool "LZ4" + select CRYPTO_LZ4 + help + Use the LZ4 algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC + bool "LZ4HC" + select CRYPTO_LZ4HC + help + Use the LZ4HC algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_ZSTD + bool "zstd" + select CRYPTO_ZSTD + help + Use the zstd algorithm as the default compression algorithm. +endchoice + +config ZSWAP_COMPRESSOR_DEFAULT + string + depends on ZSWAP + default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE + default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO + default "842" if ZSWAP_COMPRESSOR_DEFAULT_842 + default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4 + default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC + default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD + default "" + +choice + prompt "Compressed cache for swap pages default allocator" + depends on ZSWAP + default ZSWAP_ZPOOL_DEFAULT_ZBUD + help + Selects the default allocator for the compressed cache for + swap pages. + The default is 'zbud' for compatibility, however please do + read the description of each of the allocators below before + making a right choice. + + The selection made here can be overridden by using the kernel + command line 'zswap.zpool=' option. + +config ZSWAP_ZPOOL_DEFAULT_ZBUD + bool "zbud" + select ZBUD + help + Use the zbud allocator as the default allocator. + +config ZSWAP_ZPOOL_DEFAULT_Z3FOLD + bool "z3fold" + select Z3FOLD + help + Use the z3fold allocator as the default allocator. + +config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC + bool "zsmalloc" + select ZSMALLOC + help + Use the zsmalloc allocator as the default allocator. +endchoice + +config ZSWAP_ZPOOL_DEFAULT + string + depends on ZSWAP + default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD + default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD + default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC + default "" + +config ZSWAP_DEFAULT_ON + bool "Enable the compressed cache for swap pages by default" + depends on ZSWAP + help + If selected, the compressed cache for swap pages will be enabled + at boot, otherwise it will be disabled. + + The selection made here can be overridden by using the kernel + command line 'zswap.enabled=' option. + +config ZPOOL + tristate "Common API for compressed memory storage" + depends on ZSWAP + help + Compressed memory storage API. This allows using either zbud or + zsmalloc. + +config ZBUD + tristate "Low (Up to 2x) density storage for compressed pages" + depends on ZPOOL + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. + +config Z3FOLD + tristate "Up to 3x density storage for compressed pages" + depends on ZPOOL + help + A special purpose allocator for storing compressed pages. + It is designed to store up to three compressed pages per physical + page. It is a ZBUD derivative so the simplicity and determinism are + still there. + +config ZSMALLOC + tristate "Memory allocator for compressed pages" + depends on MMU + help + zsmalloc is a slab-based memory allocator designed to store + compressed RAM pages. zsmalloc uses virtual memory mapping + in order to reduce fragmentation. However, this results in a + non-standard allocator interface where a handle, not a pointer, is + returned by an alloc(). 
This handle must be mapped in order to + access the allocated space. + +config ZSMALLOC_STAT + bool "Export zsmalloc statistics" + depends on ZSMALLOC + select DEBUG_FS + help + This option enables code in the zsmalloc to collect various + statistics about what's happening in zsmalloc and exports that + information to userspace via debugfs. + If unsure, say N. + +menu "SLAB allocator options" + choice prompt "Choose SLAB allocator" default SLUB @@ -90,6 +275,19 @@ config SLAB_FREELIST_HARDENED sanity-checking than others. This option is most effective with CONFIG_SLUB. +config SLUB_CPU_PARTIAL + default y + depends on SLUB && SMP + bool "SLUB per cpu partial cache" + help + Per cpu partial caches accelerate objects allocation and freeing + that is local to a processor at the price of more indeterminism + in the latency of the free. On overflow these caches will be cleared + which requires the taking of locks that may cause latency spikes. + Typically one would choose no for a realtime system. + +endmenu # SLAB allocator options + config SHUFFLE_PAGE_ALLOCATOR bool "Page allocator randomization" default SLAB_FREELIST_RANDOM && ACPI_NUMA @@ -114,17 +312,6 @@ config SHUFFLE_PAGE_ALLOCATOR Say Y if unsure. -config SLUB_CPU_PARTIAL - default y - depends on SLUB && SMP - bool "SLUB per cpu partial cache" - help - Per cpu partial caches accelerate objects allocation and freeing - that is local to a processor at the price of more indeterminism - in the latency of the free. On overflow these caches will be cleared - which requires the taking of locks that may cause latency spikes. - Typically one would choose no for a realtime system. - config SELECT_MEMORY_MODEL def_bool y depends on ARCH_SELECT_MEMORY_MODEL @@ -249,15 +436,20 @@ config HAVE_BOOTMEM_INFO_NODE config ARCH_ENABLE_MEMORY_HOTPLUG bool +config ARCH_ENABLE_MEMORY_HOTREMOVE + bool + # eventually, we can have this option just 'select SPARSEMEM' -config MEMORY_HOTPLUG - bool "Allow for memory hot-add" +menuconfig MEMORY_HOTPLUG + bool "Memory hotplug" select MEMORY_ISOLATION depends on SPARSEMEM depends on ARCH_ENABLE_MEMORY_HOTPLUG depends on 64BIT select NUMA_KEEP_MEMINFO if NUMA +if MEMORY_HOTPLUG + config MEMORY_HOTPLUG_DEFAULT_ONLINE bool "Online the newly added memory blocks by default" depends on MEMORY_HOTPLUG @@ -273,9 +465,6 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE Say N here if you want the default policy to keep all hot-plugged memory blocks in 'offline' state. -config ARCH_ENABLE_MEMORY_HOTREMOVE - bool - config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) @@ -287,6 +476,8 @@ config MHP_MEMMAP_ON_MEMORY depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE +endif # MEMORY_HOTPLUG + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. @@ -501,7 +692,13 @@ config NOMMU_INITIAL_TRIM_EXCESS See Documentation/admin-guide/mm/nommu-mmap.rst for more information. -config TRANSPARENT_HUGEPAGE +config ARCH_WANT_GENERAL_HUGETLB + bool + +config ARCH_WANTS_THP_SWAP + def_bool n + +menuconfig TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT select COMPACTION @@ -516,6 +713,8 @@ config TRANSPARENT_HUGEPAGE If memory constrained on embedded, you may want to say N. 
+if TRANSPARENT_HUGEPAGE + choice prompt "Transparent Hugepage Support sysfs defaults" depends on TRANSPARENT_HUGEPAGE @@ -540,12 +739,6 @@ choice benefit. endchoice -config ARCH_WANT_GENERAL_HUGETLB - bool - -config ARCH_WANTS_THP_SWAP - def_bool n - config THP_SWAP def_bool y depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP @@ -556,6 +749,19 @@ config THP_SWAP For selection by architectures with reasonable THP sizes. +config READ_ONLY_THP_FOR_FS + bool "Read-only THP for filesystems (EXPERIMENTAL)" + depends on TRANSPARENT_HUGEPAGE && SHMEM + + help + Allow khugepaged to put read-only file-backed pages in THP. + + This is marked experimental because it is a new feature. Write + support of file THPs will be developed in the next few release + cycles. + +endif # TRANSPARENT_HUGEPAGE + # # UP and nommu archs use km based percpu allocator # @@ -640,188 +846,6 @@ config MEM_SOFT_DIRTY See Documentation/admin-guide/mm/soft-dirty.rst for more details. -config ZSWAP - bool "Compressed cache for swap pages (EXPERIMENTAL)" - depends on SWAP && CRYPTO=y - select FRONTSWAP - select ZPOOL - help - A lightweight compressed cache for swap pages. It takes - pages that are in the process of being swapped out and attempts to - compress them into a dynamically allocated RAM-based memory pool. - This can result in a significant I/O reduction on swap device and, - in the case where decompressing from RAM is faster that swap device - reads, can also improve workload performance. - - This is marked experimental because it is a new feature (as of - v3.11) that interacts heavily with memory reclaim. While these - interactions don't cause any known issues on simple memory setups, - they have not be fully explored on the large set of potential - configurations and workloads that exist. - -choice - prompt "Compressed cache for swap pages default compressor" - depends on ZSWAP - default ZSWAP_COMPRESSOR_DEFAULT_LZO - help - Selects the default compression algorithm for the compressed cache - for swap pages. - - For an overview what kind of performance can be expected from - a particular compression algorithm please refer to the benchmarks - available at the following LWN page: - https://lwn.net/Articles/751795/ - - If in doubt, select 'LZO'. - - The selection made here can be overridden by using the kernel - command line 'zswap.compressor=' option. - -config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE - bool "Deflate" - select CRYPTO_DEFLATE - help - Use the Deflate algorithm as the default compression algorithm. - -config ZSWAP_COMPRESSOR_DEFAULT_LZO - bool "LZO" - select CRYPTO_LZO - help - Use the LZO algorithm as the default compression algorithm. - -config ZSWAP_COMPRESSOR_DEFAULT_842 - bool "842" - select CRYPTO_842 - help - Use the 842 algorithm as the default compression algorithm. - -config ZSWAP_COMPRESSOR_DEFAULT_LZ4 - bool "LZ4" - select CRYPTO_LZ4 - help - Use the LZ4 algorithm as the default compression algorithm. - -config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC - bool "LZ4HC" - select CRYPTO_LZ4HC - help - Use the LZ4HC algorithm as the default compression algorithm. - -config ZSWAP_COMPRESSOR_DEFAULT_ZSTD - bool "zstd" - select CRYPTO_ZSTD - help - Use the zstd algorithm as the default compression algorithm. 
-endchoice - -config ZSWAP_COMPRESSOR_DEFAULT - string - depends on ZSWAP - default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE - default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO - default "842" if ZSWAP_COMPRESSOR_DEFAULT_842 - default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4 - default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC - default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD - default "" - -choice - prompt "Compressed cache for swap pages default allocator" - depends on ZSWAP - default ZSWAP_ZPOOL_DEFAULT_ZBUD - help - Selects the default allocator for the compressed cache for - swap pages. - The default is 'zbud' for compatibility, however please do - read the description of each of the allocators below before - making a right choice. - - The selection made here can be overridden by using the kernel - command line 'zswap.zpool=' option. - -config ZSWAP_ZPOOL_DEFAULT_ZBUD - bool "zbud" - select ZBUD - help - Use the zbud allocator as the default allocator. - -config ZSWAP_ZPOOL_DEFAULT_Z3FOLD - bool "z3fold" - select Z3FOLD - help - Use the z3fold allocator as the default allocator. - -config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - bool "zsmalloc" - select ZSMALLOC - help - Use the zsmalloc allocator as the default allocator. -endchoice - -config ZSWAP_ZPOOL_DEFAULT - string - depends on ZSWAP - default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD - default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD - default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - default "" - -config ZSWAP_DEFAULT_ON - bool "Enable the compressed cache for swap pages by default" - depends on ZSWAP - help - If selected, the compressed cache for swap pages will be enabled - at boot, otherwise it will be disabled. - - The selection made here can be overridden by using the kernel - command line 'zswap.enabled=' option. - -config ZPOOL - tristate "Common API for compressed memory storage" - help - Compressed memory storage API. This allows using either zbud or - zsmalloc. - -config ZBUD - tristate "Low (Up to 2x) density storage for compressed pages" - depends on ZPOOL - help - A special purpose allocator for storing compressed pages. - It is designed to store up to two compressed pages per physical - page. While this design limits storage density, it has simple and - deterministic reclaim properties that make it preferable to a higher - density approach when reclaim will be used. - -config Z3FOLD - tristate "Up to 3x density storage for compressed pages" - depends on ZPOOL - help - A special purpose allocator for storing compressed pages. - It is designed to store up to three compressed pages per physical - page. It is a ZBUD derivative so the simplicity and determinism are - still there. - -config ZSMALLOC - tristate "Memory allocator for compressed pages" - depends on MMU - help - zsmalloc is a slab-based memory allocator designed to store - compressed RAM pages. zsmalloc uses virtual memory mapping - in order to reduce fragmentation. However, this results in a - non-standard allocator interface where a handle, not a pointer, is - returned by an alloc(). This handle must be mapped in order to - access the allocated space. - -config ZSMALLOC_STAT - bool "Export zsmalloc statistics" - depends on ZSMALLOC - select DEBUG_FS - help - This option enables code in the zsmalloc to collect various - statistics about what's happening in zsmalloc and exports that - information to userspace via debugfs. - If unsure, say N. 
- config GENERIC_EARLY_IOREMAP bool @@ -978,17 +1002,6 @@ comment "GUP_TEST needs to have DEBUG_FS enabled" config GUP_GET_PTE_LOW_HIGH bool -config READ_ONLY_THP_FOR_FS - bool "Read-only THP for filesystems (EXPERIMENTAL)" - depends on TRANSPARENT_HUGEPAGE && SHMEM - - help - Allow khugepaged to put read-only file-backed pages in THP. - - This is marked experimental because it is a new feature. Write - support of file THPs will be developed in the next few release - cycles. - config ARCH_HAS_PTE_SPECIAL bool -- cgit v1.2.3 From b3fbd58fcbb10725a1314688e03b1af6827c42f9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 19 May 2022 14:08:53 -0700 Subject: mm: Kconfig: simplify zswap configuration - CONFIG_ZRAM: Zram is a user-facing feature, whereas zsmalloc is not. Don't make the user chase down a technical dependency like that, just select it in automatically when zram is requested. The CONFIG_CRYPTO dependency is redundant due to more specific deps. - CONFIG_ZPOOL: This is not a user-facing feature. Hide the symbol and have it selected in as needed. - CONFIG_ZSWAP: Select CRYPTO instead of depend. Common pattern. - Make the ZSWAP suboptions and their descriptions (compression, allocation backend) a bit more straight-forward for the user. Link: https://lkml.kernel.org/r/20220510152847.230957-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Dan Streetman Cc: Michal Hocko Cc: Minchan Kim Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Signed-off-by: Andrew Morton --- drivers/block/zram/Kconfig | 3 ++- mm/Kconfig | 55 +++++++++++++++++++++------------------------- 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 668c6bf2554d..e4163d4b936b 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 config ZRAM tristate "Compressed RAM block device support" - depends on BLOCK && SYSFS && ZSMALLOC && CRYPTO + depends on BLOCK && SYSFS depends on CRYPTO_LZO || CRYPTO_ZSTD || CRYPTO_LZ4 || CRYPTO_LZ4HC || CRYPTO_842 + select ZSMALLOC help Creates virtual block devices called /dev/zramX (X = 0, 1, ...). Pages written to these disks are compressed and stored in memory diff --git a/mm/Kconfig b/mm/Kconfig index f63d8dc36511..905c205e14f3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -9,6 +9,9 @@ menu "Memory Management options" config ARCH_NO_SWAP bool +config ZPOOL + bool + menuconfig SWAP bool "Support for paging of anonymous memory (swap)" depends on MMU && BLOCK && !ARCH_NO_SWAP @@ -21,8 +24,9 @@ menuconfig SWAP config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" - depends on SWAP && CRYPTO=y + depends on SWAP select FRONTSWAP + select CRYPTO select ZPOOL help A lightweight compressed cache for swap pages. It takes @@ -38,8 +42,18 @@ config ZSWAP they have not be fully explored on the large set of potential configurations and workloads that exist. +config ZSWAP_DEFAULT_ON + bool "Enable the compressed cache for swap pages by default" + depends on ZSWAP + help + If selected, the compressed cache for swap pages will be enabled + at boot, otherwise it will be disabled. + + The selection made here can be overridden by using the kernel + command line 'zswap.enabled=' option. 
+ choice - prompt "Compressed cache for swap pages default compressor" + prompt "Default compressor" depends on ZSWAP default ZSWAP_COMPRESSOR_DEFAULT_LZO help @@ -105,7 +119,7 @@ config ZSWAP_COMPRESSOR_DEFAULT default "" choice - prompt "Compressed cache for swap pages default allocator" + prompt "Default allocator" depends on ZSWAP default ZSWAP_ZPOOL_DEFAULT_ZBUD help @@ -145,26 +159,9 @@ config ZSWAP_ZPOOL_DEFAULT default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC default "" -config ZSWAP_DEFAULT_ON - bool "Enable the compressed cache for swap pages by default" - depends on ZSWAP - help - If selected, the compressed cache for swap pages will be enabled - at boot, otherwise it will be disabled. - - The selection made here can be overridden by using the kernel - command line 'zswap.enabled=' option. - -config ZPOOL - tristate "Common API for compressed memory storage" - depends on ZSWAP - help - Compressed memory storage API. This allows using either zbud or - zsmalloc. - config ZBUD - tristate "Low (Up to 2x) density storage for compressed pages" - depends on ZPOOL + tristate "2:1 compression allocator (zbud)" + depends on ZSWAP help A special purpose allocator for storing compressed pages. It is designed to store up to two compressed pages per physical @@ -173,8 +170,8 @@ config ZBUD density approach when reclaim will be used. config Z3FOLD - tristate "Up to 3x density storage for compressed pages" - depends on ZPOOL + tristate "3:1 compression allocator (z3fold)" + depends on ZSWAP help A special purpose allocator for storing compressed pages. It is designed to store up to three compressed pages per physical @@ -182,15 +179,13 @@ config Z3FOLD still there. config ZSMALLOC - tristate "Memory allocator for compressed pages" + tristate + prompt "N:1 compression allocator (zsmalloc)" if ZSWAP depends on MMU help zsmalloc is a slab-based memory allocator designed to store - compressed RAM pages. zsmalloc uses virtual memory mapping - in order to reduce fragmentation. However, this results in a - non-standard allocator interface where a handle, not a pointer, is - returned by an alloc(). This handle must be mapped in order to - access the allocated space. + pages of various compression levels efficiently. It achieves + the highest storage density with the least amount of fragmentation. config ZSMALLOC_STAT bool "Export zsmalloc statistics" -- cgit v1.2.3 From f6498b776d280b30a4614d8261840961e993c2c8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 19 May 2022 14:08:53 -0700 Subject: mm: zswap: add basic meminfo and vmstat coverage Currently it requires poking at debugfs to figure out the size and population of the zswap cache on a host. There are no counters for reads and writes against the cache. As a result, it's difficult to understand zswap behavior on production systems. Print zswap memory consumption and how many pages are zswapped out in /proc/meminfo. Count zswapouts and zswapins in /proc/vmstat. 
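
As a rough illustration (not part of this patch), a minimal userspace reader for the new fields might look like the sketch below. Only the field names ("Zswap:", "Zswapped:", "zswpin", "zswpout") come from this patch; everything else is made up for the example.

	#include <stdio.h>
	#include <string.h>

	static void dump_matching(const char *path, const char *prefixes[], int n)
	{
		FILE *f = fopen(path, "r");
		char line[256];
		int i;

		if (!f)
			return;
		while (fgets(line, sizeof(line), f))
			for (i = 0; i < n; i++)
				if (!strncmp(line, prefixes[i], strlen(prefixes[i])))
					fputs(line, stdout);
		fclose(f);
	}

	int main(void)
	{
		const char *meminfo[] = { "Zswap:", "Zswapped:" };
		const char *vmstat[]  = { "zswpin", "zswpout" };

		dump_matching("/proc/meminfo", meminfo, 2);	/* compressed size / original size */
		dump_matching("/proc/vmstat", vmstat, 2);	/* event counters */
		return 0;
	}
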
Link: https://lkml.kernel.org/r/20220510152847.230957-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: David Hildenbrand Cc: Dan Streetman Cc: Michal Hocko Cc: Minchan Kim Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 6 ++++++ fs/proc/meminfo.c | 7 +++++++ include/linux/swap.h | 5 +++++ include/linux/vm_event_item.h | 4 ++++ mm/vmstat.c | 4 ++++ mm/zswap.c | 13 ++++++------- 6 files changed, 32 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 5e9791457876..cfec3cab0f9a 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -964,6 +964,8 @@ Example output. You may not have all of these fields. Mlocked: 0 kB SwapTotal: 0 kB SwapFree: 0 kB + Zswap: 1904 kB + Zswapped: 7792 kB Dirty: 12 kB Writeback: 0 kB AnonPages: 4654780 kB @@ -1055,6 +1057,10 @@ SwapTotal SwapFree Memory which has been evicted from RAM, and is temporarily on the disk +Zswap + Memory consumed by the zswap backend (compressed size) +Zswapped + Amount of anonymous memory stored in zswap (original size) Dirty Memory which is waiting to get written back to the disk Writeback diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6fa761c9cc78..6e89f0e2fd20 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -86,6 +86,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); +#ifdef CONFIG_ZSWAP + seq_printf(m, "Zswap: %8lu kB\n", + (unsigned long)(zswap_pool_total_size >> 10)); + seq_printf(m, "Zswapped: %8lu kB\n", + (unsigned long)atomic_read(&zswap_stored_pages) << + (PAGE_SHIFT - 10)); +#endif show_val_kb(m, "Dirty: ", global_node_page_state(NR_FILE_DIRTY)); show_val_kb(m, "Writeback: ", diff --git a/include/linux/swap.h b/include/linux/swap.h index 04c0713c1d6e..f3ae17b43f20 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -620,6 +620,11 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) } #endif +#ifdef CONFIG_ZSWAP +extern u64 zswap_pool_total_size; +extern atomic_t zswap_stored_pages; +#endif + #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index e83967e4c20e..404024486fa5 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -136,6 +136,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_KSM COW_KSM, #endif +#ifdef CONFIG_ZSWAP + ZSWPIN, + ZSWPOUT, +#endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, diff --git a/mm/vmstat.c b/mm/vmstat.c index b94a2e4723ff..da525bfb6f4a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1396,6 +1396,10 @@ const char * const vmstat_text[] = { #ifdef CONFIG_KSM "cow_ksm", #endif +#ifdef CONFIG_ZSWAP + "zswpin", + "zswpout", +#endif #ifdef CONFIG_X86 "direct_map_level2_splits", "direct_map_level3_splits", diff --git a/mm/zswap.c b/mm/zswap.c index 2c5db4cbedea..e3c16a70f533 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -42,9 +42,9 @@ * statistics **********************************/ /* Total bytes used by the compressed storage */ -static u64 zswap_pool_total_size; +u64 zswap_pool_total_size; /* The number of compressed pages currently 
stored in zswap */ -static atomic_t zswap_stored_pages = ATOMIC_INIT(0); +atomic_t zswap_stored_pages = ATOMIC_INIT(0); /* The number of same-value filled pages currently stored in zswap */ static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); @@ -1243,6 +1243,7 @@ insert_entry: /* update stats */ atomic_inc(&zswap_stored_pages); zswap_update_total_size(); + count_vm_event(ZSWPOUT); return 0; @@ -1285,11 +1286,10 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, zswap_fill_page(dst, entry->value); kunmap_atomic(dst); ret = 0; - goto freeentry; + goto stats; } if (!zpool_can_sleep_mapped(entry->pool->zpool)) { - tmp = kmalloc(entry->length, GFP_ATOMIC); if (!tmp) { ret = -ENOMEM; @@ -1304,10 +1304,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, src += sizeof(struct zswap_header); if (!zpool_can_sleep_mapped(entry->pool->zpool)) { - memcpy(tmp, src, entry->length); src = tmp; - zpool_unmap_handle(entry->pool->zpool, entry->handle); } @@ -1326,7 +1324,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, kfree(tmp); BUG_ON(ret); - +stats: + count_vm_event(ZSWPIN); freeentry: spin_lock(&tree->lock); zswap_entry_put(tree, entry); -- cgit v1.2.3 From f4840ccfca25db225b3371a8f7b5770febee87c5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 19 May 2022 14:08:53 -0700 Subject: zswap: memcg accounting Applications can currently escape their cgroup memory containment when zswap is enabled. This patch adds per-cgroup tracking and limiting of zswap backend memory to rectify this. The existing cgroup2 memory.stat file is extended to show zswap statistics analogous to what's in meminfo and vmstat. Furthermore, two new control files, memory.zswap.current and memory.zswap.max, are added to allow tuning zswap usage on a per-workload basis. This is important since not all workloads benefit from zswap equally; some even suffer compared to disk swap when memory contents don't compress well. The optimal size of the zswap pool, and the threshold for writeback, also depends on the size of the workload's warm set. The implementation doesn't use a traditional page_counter transaction. zswap is unconventional as a memory consumer in that we only know the amount of memory to charge once expensive compression has occurred. If zwap is disabled or the limit is already exceeded we obviously don't want to compress page upon page only to reject them all. Instead, the limit is checked against current usage, then we compress and charge. This allows some limit overrun, but not enough to matter in practice. [hannes@cmpxchg.org: fix for CONFIG_SLOB builds] Link: https://lkml.kernel.org/r/YnwD14zxYjUJPc2w@cmpxchg.org [hannes@cmpxchg.org: opt out of cgroups v1] Link: https://lkml.kernel.org/r/Yn6it9mBYFA+/lTb@cmpxchg.org Link: https://lkml.kernel.org/r/20220510152847.230957-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Seth Jennings Cc: Dan Streetman Cc: Minchan Kim Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 21 ++++ include/linux/memcontrol.h | 54 +++++++++ mm/memcontrol.c | 205 ++++++++++++++++++++++++++++++-- mm/zswap.c | 37 ++++-- 4 files changed, 302 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 07ef3bf1448e..08ae8189e6cb 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1354,6 +1354,12 @@ PAGE_SIZE multiple when read back. 
Amount of cached filesystem data that is swap-backed, such as tmpfs, shm segments, shared anonymous mmap()s + zswap + Amount of memory consumed by the zswap compression backend. + + zswapped + Amount of application memory swapped out to zswap. + file_mapped Amount of cached filesystem data mapped with mmap() @@ -1544,6 +1550,21 @@ PAGE_SIZE multiple when read back. higher than the limit for an extended period of time. This reduces the impact on the workload and memory management. + memory.zswap.current + A read-only single value file which exists on non-root + cgroups. + + The total amount of memory consumed by the zswap compression + backend. + + memory.zswap.max + A read-write single value file which exists on non-root + cgroups. The default is "max". + + Zswap usage hard limit. If a cgroup's zswap pool reaches this + limit, it will refuse to take any more stores before existing + entries fault back in or are written out to disk. + memory.pressure A read-only nested-keyed file. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8ea4b541c31e..9ecead1042b9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,6 +35,8 @@ enum memcg_stat_item { MEMCG_PERCPU_B, MEMCG_VMALLOC, MEMCG_KMEM, + MEMCG_ZSWAP_B, + MEMCG_ZSWAPPED, MEMCG_NR_STAT, }; @@ -252,6 +254,10 @@ struct mem_cgroup { /* Range enforcement for interrupt charges */ struct work_struct high_work; +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + unsigned long zswap_max; +#endif + unsigned long soft_limit; /* vmpressure notifications */ @@ -1273,6 +1279,10 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) return NULL; } +static inline void obj_cgroup_put(struct obj_cgroup *objcg) +{ +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } @@ -1694,6 +1704,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); struct obj_cgroup *get_obj_cgroup_from_current(void); +struct obj_cgroup *get_obj_cgroup_from_page(struct page *page); int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); @@ -1730,6 +1741,20 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_obj(void *p); +static inline void count_objcg_event(struct obj_cgroup *objcg, + enum vm_event_item idx) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_kmem_disabled()) + return; + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + count_memcg_events(memcg, idx, 1); + rcu_read_unlock(); +} + #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1756,6 +1781,11 @@ static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } +static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) +{ + return NULL; +} + static inline bool memcg_kmem_enabled(void) { return false; @@ -1771,6 +1801,30 @@ static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) return NULL; } +static inline void count_objcg_event(struct obj_cgroup *objcg, + enum vm_event_item idx) +{ +} + #endif /* CONFIG_MEMCG_KMEM */ +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); +void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); +void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); +#else +static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) +{ + return true; +} +static 
inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, + size_t size) +{ +} +static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, + size_t size) +{ +} +#endif + #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ef76df7c6d12..abec50f31fe6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1398,6 +1398,10 @@ static const struct memory_stat memory_stats[] = { { "sock", MEMCG_SOCK }, { "vmalloc", MEMCG_VMALLOC }, { "shmem", NR_SHMEM }, +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + { "zswap", MEMCG_ZSWAP_B }, + { "zswapped", MEMCG_ZSWAPPED }, +#endif { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, { "file_writeback", NR_WRITEBACK }, @@ -1432,6 +1436,7 @@ static int memcg_page_state_unit(int item) { switch (item) { case MEMCG_PERCPU_B: + case MEMCG_ZSWAP_B: case NR_SLAB_RECLAIMABLE_B: case NR_SLAB_UNRECLAIMABLE_B: case WORKINGSET_REFAULT_ANON: @@ -1512,6 +1517,13 @@ static char *memory_stat_format(struct mem_cgroup *memcg) seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED), memcg_events(memcg, PGLAZYFREED)); +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN), + memcg_events(memcg, ZSWPIN)); + seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPOUT), + memcg_events(memcg, ZSWPOUT)); +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), memcg_events(memcg, THP_FAULT_ALLOC)); @@ -2883,6 +2895,19 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) return page_memcg_check(folio_page(folio, 0)); } +static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) +{ + struct obj_cgroup *objcg = NULL; + + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + objcg = rcu_dereference(memcg->objcg); + if (objcg && obj_cgroup_tryget(objcg)) + break; + objcg = NULL; + } + return objcg; +} + __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) { struct obj_cgroup *objcg = NULL; @@ -2896,15 +2921,32 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) memcg = active_memcg(); else memcg = mem_cgroup_from_task(current); - - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { - objcg = rcu_dereference(memcg->objcg); - if (objcg && obj_cgroup_tryget(objcg)) - break; - objcg = NULL; - } + objcg = __get_obj_cgroup_from_memcg(memcg); rcu_read_unlock(); + return objcg; +} + +struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) +{ + struct obj_cgroup *objcg; + + if (!memcg_kmem_enabled() || memcg_kmem_bypass()) + return NULL; + + if (PageMemcgKmem(page)) { + objcg = __folio_objcg(page_folio(page)); + obj_cgroup_get(objcg); + } else { + struct mem_cgroup *memcg; + rcu_read_lock(); + memcg = __folio_memcg(page_folio(page)); + if (memcg) + objcg = __get_obj_cgroup_from_memcg(memcg); + else + objcg = NULL; + rcu_read_unlock(); + } return objcg; } @@ -5142,6 +5184,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + memcg->zswap_max = PAGE_COUNTER_MAX; +#endif page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); @@ -7421,6 +7466,148 @@ static struct cftype memsw_files[] = { { }, /* terminate */ }; +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +/** + * obj_cgroup_may_zswap - check 
if this cgroup can zswap + * @objcg: the object cgroup + * + * Check if the hierarchical zswap limit has been reached. + * + * This doesn't check for specific headroom, and it is not atomic + * either. But with zswap, the size of the allocation is only known + * once compression has occured, and this optimistic pre-check avoids + * spending cycles on compression when there is already no room left + * or zswap is disabled altogether somewhere in the hierarchy. + */ +bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) +{ + struct mem_cgroup *memcg, *original_memcg; + bool ret = true; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return true; + + original_memcg = get_mem_cgroup_from_objcg(objcg); + for (memcg = original_memcg; memcg != root_mem_cgroup; + memcg = parent_mem_cgroup(memcg)) { + unsigned long max = READ_ONCE(memcg->zswap_max); + unsigned long pages; + + if (max == PAGE_COUNTER_MAX) + continue; + if (max == 0) { + ret = false; + break; + } + + cgroup_rstat_flush(memcg->css.cgroup); + pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE; + if (pages < max) + continue; + ret = false; + break; + } + mem_cgroup_put(original_memcg); + return ret; +} + +/** + * obj_cgroup_charge_zswap - charge compression backend memory + * @objcg: the object cgroup + * @size: size of compressed object + * + * This forces the charge after obj_cgroup_may_swap() allowed + * compression and storage in zwap for this cgroup to go ahead. + */ +void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) +{ + struct mem_cgroup *memcg; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC)); + + /* PF_MEMALLOC context, charging must succeed */ + if (obj_cgroup_charge(objcg, GFP_KERNEL, size)) + VM_WARN_ON_ONCE(1); + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + mod_memcg_state(memcg, MEMCG_ZSWAP_B, size); + mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1); + rcu_read_unlock(); +} + +/** + * obj_cgroup_uncharge_zswap - uncharge compression backend memory + * @objcg: the object cgroup + * @size: size of compressed object + * + * Uncharges zswap memory on page in. 
+ */ +void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) +{ + struct mem_cgroup *memcg; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + obj_cgroup_uncharge(objcg, size); + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); + mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); + rcu_read_unlock(); +} + +static u64 zswap_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + cgroup_rstat_flush(css->cgroup); + return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B); +} + +static int zswap_max_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->zswap_max)); +} + +static ssize_t zswap_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + xchg(&memcg->zswap_max, max); + + return nbytes; +} + +static struct cftype zswap_files[] = { + { + .name = "zswap.current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = zswap_current_read, + }, + { + .name = "zswap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = zswap_max_show, + .write = zswap_max_write, + }, + { } /* terminate */ +}; +#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ + /* * If mem_cgroup_swap_init() is implemented as a subsys_initcall() * instead of a core_initcall(), this could mean cgroup_memory_noswap still @@ -7439,7 +7626,9 @@ static int __init mem_cgroup_swap_init(void) WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); - +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files)); +#endif return 0; } core_initcall(mem_cgroup_swap_init); diff --git a/mm/zswap.c b/mm/zswap.c index e3c16a70f533..104835b379ec 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -188,6 +188,7 @@ struct zswap_entry { unsigned long handle; unsigned long value; }; + struct obj_cgroup *objcg; }; struct zswap_header { @@ -359,6 +360,10 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) */ static void zswap_free_entry(struct zswap_entry *entry) { + if (entry->objcg) { + obj_cgroup_uncharge_zswap(entry->objcg, entry->length); + obj_cgroup_put(entry->objcg); + } if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { @@ -1096,6 +1101,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct zswap_entry *entry, *dupentry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; + struct obj_cgroup *objcg = NULL; + struct zswap_pool *pool; int ret; unsigned int hlen, dlen = PAGE_SIZE; unsigned long handle, value; @@ -1115,17 +1122,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, goto reject; } + objcg = get_obj_cgroup_from_page(page); + if (objcg && !obj_cgroup_may_zswap(objcg)) + goto shrink; + /* reclaim space if needed */ if (zswap_is_full()) { - struct zswap_pool *pool; - zswap_pool_limit_hit++; zswap_pool_reached_full = true; - pool = zswap_pool_last_get(); - if (pool) - queue_work(shrink_wq, &pool->shrink_work); - ret = -ENOMEM; - goto reject; + goto shrink; } if (zswap_pool_reached_full) { @@ -1227,6 +1232,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, entry->length = dlen; insert_entry: + entry->objcg = objcg; + if (objcg) { + 
obj_cgroup_charge_zswap(objcg, entry->length); + /* Account before objcg ref is moved to tree */ + count_objcg_event(objcg, ZSWPOUT); + } + /* map */ spin_lock(&tree->lock); do { @@ -1253,7 +1265,16 @@ put_dstmem: freepage: zswap_entry_cache_free(entry); reject: + if (objcg) + obj_cgroup_put(objcg); return ret; + +shrink: + pool = zswap_pool_last_get(); + if (pool) + queue_work(shrink_wq, &pool->shrink_work); + ret = -ENOMEM; + goto reject; } /* @@ -1326,6 +1347,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, BUG_ON(ret); stats: count_vm_event(ZSWPIN); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPIN); freeentry: spin_lock(&tree->lock); zswap_entry_put(tree, entry); -- cgit v1.2.3 From 6d4675e601357834dadd2ba1d803f6484596015c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 19 May 2022 14:08:54 -0700 Subject: mm: don't be stuck to rmap lock on reclaim path The rmap locks(i_mmap_rwsem and anon_vma->root->rwsem) could be contended under memory pressure if processes keep working on their vmas(e.g., fork, mmap, munmap). It makes reclaim path stuck. In our real workload traces, we see kswapd is waiting the lock for 300ms+(worst case, a sec) and it makes other processes entering direct reclaim, which were also stuck on the lock. This patch makes lru aging path try_lock mode like shink_page_list so the reclaim context will keep working with next lru pages without being stuck. if it found the rmap lock contended, it rotates the page back to head of lru in both active/inactive lrus to make them consistent behavior, which is basic starting point rather than adding more heristic. Since this patch introduces a new "contended" field as out-param along with try_lock in-param in rmap_walk_control, it's not immutable any longer if the try_lock is set so remove const keywords on rmap related functions. Since rmap walking is already expensive operation, I doubt the const would help sizable benefit( And we didn't have it until 5.17). In a heavy app workload in Android, trace shows following statistics. It almost removes rmap lock contention from reclaim path. 
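
To make the new control flow concrete before the numbers below, here is a condensed sketch of how a walker opts in; the callback name is hypothetical, while the try_lock/contended fields and the -1 return convention mirror the folio_referenced() change in the diff.

	/* sketch only: example_rmap_one() is a hypothetical callback */
	static int example_references(struct folio *folio, void *arg)
	{
		struct rmap_walk_control rwc = {
			.rmap_one  = example_rmap_one,
			.arg       = arg,
			.anon_lock = folio_lock_anon_vma_read,
			.try_lock  = true,	/* don't sleep on a contended rmap lock */
		};

		rmap_walk(folio, &rwc);

		/* lock was contended: let reclaim rotate the page and move on */
		return rwc.contended ? -1 : 0;
	}
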
Martin Liu reported: Before: max_dur(ms) min_dur(ms) max-min(dur)ms avg_dur(ms) sum_dur(ms) count blocked_function 1632 0 1631 151.542173 31672 209 page_lock_anon_vma_read 601 0 601 145.544681 28817 198 rmap_walk_file After: max_dur(ms) min_dur(ms) max-min(dur)ms avg_dur(ms) sum_dur(ms) count blocked_function NaN NaN NaN NaN NaN 0.0 NaN 0 0 0 0.127645 1 12 rmap_walk_file [minchan@kernel.org: add comment, per Matthew] Link: https://lkml.kernel.org/r/YnNqeB5tUf6LZ57b@google.com Link: https://lkml.kernel.org/r/20220510215423.164547-1-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: John Dias Cc: Tim Murray Cc: Matthew Wilcox Cc: Vladimir Davydov Cc: Martin Liu Cc: Minchan Kim Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/fs.h | 5 +++++ include/linux/ksm.h | 4 ++-- include/linux/rmap.h | 28 ++++++++++++++++++++-------- mm/ksm.c | 10 ++++++++-- mm/memory-failure.c | 2 +- mm/page_idle.c | 7 ++++--- mm/rmap.c | 52 +++++++++++++++++++++++++++++++++++++++++----------- mm/vmscan.c | 7 ++++++- 8 files changed, 87 insertions(+), 28 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index b81cacc51d2f..044b67f8d861 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -477,6 +477,11 @@ static inline void i_mmap_unlock_write(struct address_space *mapping) up_write(&mapping->i_mmap_rwsem); } +static inline int i_mmap_trylock_read(struct address_space *mapping) +{ + return down_read_trylock(&mapping->i_mmap_rwsem); +} + static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 0630e545f4cb..0b4f17418f64 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -51,7 +51,7 @@ static inline void ksm_exit(struct mm_struct *mm) struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); #else /* !CONFIG_KSM */ @@ -79,7 +79,7 @@ static inline struct page *ksm_might_need_to_copy(struct page *page, } static inline void rmap_walk_ksm(struct folio *folio, - const struct rmap_walk_control *rwc) + struct rmap_walk_control *rwc) { } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index cbe279a6f0de..9ec23138e410 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -128,6 +128,11 @@ static inline void anon_vma_lock_read(struct anon_vma *anon_vma) down_read(&anon_vma->root->rwsem); } +static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) +{ + return down_read_trylock(&anon_vma->root->rwsem); +} + static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) { up_read(&anon_vma->root->rwsem); @@ -366,17 +371,14 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); -/* - * Called by memory-failure.c to kill processes. 
- */ -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* * rmap_walk_control: To control rmap traversing for specific needs * * arg: passed to rmap_one() and invalid_vma() + * try_lock: bail out if the rmap lock is contended + * contended: indicate the rmap traversal bailed out due to lock contention * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition * anon_lock: for getting anon_lock by optimized way rather than default @@ -384,6 +386,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); */ struct rmap_walk_control { void *arg; + bool try_lock; + bool contended; /* * Return false if page table scanning in rmap_walk should be stopped. * Otherwise, return true. @@ -391,12 +395,20 @@ struct rmap_walk_control { bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct folio *folio); - struct anon_vma *(*anon_lock)(struct folio *folio); + struct anon_vma *(*anon_lock)(struct folio *folio, + struct rmap_walk_control *rwc); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc); -void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc); +void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); + +/* + * Called by memory-failure.c to kill processes. + */ +struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, + struct rmap_walk_control *rwc); +void page_unlock_anon_vma_read(struct anon_vma *anon_vma); #else /* !CONFIG_MMU */ diff --git a/mm/ksm.c b/mm/ksm.c index 38360285497a..9ee82c9bce94 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2610,7 +2610,7 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; @@ -2634,7 +2634,13 @@ again: struct vm_area_struct *vma; cond_resched(); - anon_vma_lock_read(anon_vma); + if (!anon_vma_trylock_read(anon_vma)) { + if (rwc->try_lock) { + rwc->contended = true; + return; + } + anon_vma_lock_read(anon_vma); + } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 01f8b63d3621..a934ee8124dd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -485,7 +485,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct anon_vma *av; pgoff_t pgoff; - av = folio_lock_anon_vma_read(folio); + av = folio_lock_anon_vma_read(folio, NULL); if (av == NULL) /* Not actually mapped anymore */ return; diff --git a/mm/page_idle.c b/mm/page_idle.c index fc0435abf909..bc08332a609c 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -86,11 +86,12 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio, static void page_idle_clear_pte_refs(struct page *page) { struct folio *folio = page_folio(page); + /* - * Since rwc.arg is unused, rwc is effectively immutable, so we - * can make it static const to save some cycles and stack. 
+ * Since rwc.try_lock is unused, rwc is effectively immutable, so we + * can make it static to save some cycles and stack. */ - static const struct rmap_walk_control rwc = { + static struct rmap_walk_control rwc = { .rmap_one = page_idle_clear_pte_refs_one, .anon_lock = folio_lock_anon_vma_read, }; diff --git a/mm/rmap.c b/mm/rmap.c index 219e287a83d2..5bcb334cd6f2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -527,9 +527,11 @@ out: * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a - * reference like with page_get_anon_vma() and then block on the mutex. + * reference like with page_get_anon_vma() and then block on the mutex + * on !rwc->try_lock case. */ -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio) +struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -557,6 +559,12 @@ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio) goto out; } + if (rwc && rwc->try_lock) { + anon_vma = NULL; + rwc->contended = true; + goto out; + } + /* trylock failed, we got to sleep */ if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; @@ -883,7 +891,8 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) * * Quick test_and_clear_referenced for all mappings of a folio, * - * Return: The number of mappings which referenced the folio. + * Return: The number of mappings which referenced the folio. Return -1 if + * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags) @@ -897,6 +906,7 @@ int folio_referenced(struct folio *folio, int is_locked, .rmap_one = folio_referenced_one, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, + .try_lock = true, }; *vm_flags = 0; @@ -927,7 +937,7 @@ int folio_referenced(struct folio *folio, int is_locked, if (we_locked) folio_unlock(folio); - return pra.referenced; + return rwc.contended ? -1 : pra.referenced; } static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) @@ -2336,12 +2346,12 @@ void __put_anon_vma(struct anon_vma *anon_vma) } static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, - const struct rmap_walk_control *rwc) + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; if (rwc->anon_lock) - return rwc->anon_lock(folio); + return rwc->anon_lock(folio, rwc); /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() @@ -2353,7 +2363,17 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, if (!anon_vma) return NULL; + if (anon_vma_trylock_read(anon_vma)) + goto out; + + if (rwc->try_lock) { + anon_vma = NULL; + rwc->contended = true; + goto out; + } + anon_vma_lock_read(anon_vma); +out: return anon_vma; } @@ -2367,7 +2387,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, * contained in the anon_vma struct it points to. */ static void rmap_walk_anon(struct folio *folio, - const struct rmap_walk_control *rwc, bool locked) + struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; @@ -2415,7 +2435,7 @@ static void rmap_walk_anon(struct folio *folio, * contained in the address_space struct it points to. 
*/ static void rmap_walk_file(struct folio *folio, - const struct rmap_walk_control *rwc, bool locked) + struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = folio_mapping(folio); pgoff_t pgoff_start, pgoff_end; @@ -2434,8 +2454,18 @@ static void rmap_walk_file(struct folio *folio, pgoff_start = folio_pgoff(folio); pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; - if (!locked) + if (!locked) { + if (i_mmap_trylock_read(mapping)) + goto lookup; + + if (rwc->try_lock) { + rwc->contended = true; + return; + } + i_mmap_lock_read(mapping); + } +lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { unsigned long address = vma_address(&folio->page, vma); @@ -2457,7 +2487,7 @@ done: i_mmap_unlock_read(mapping); } -void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc) +void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); @@ -2468,7 +2498,7 @@ void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc) } /* Like rmap_walk, but caller holds relevant rmap lock */ -void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc) +void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); diff --git a/mm/vmscan.c b/mm/vmscan.c index 24dbe04520cb..887edcd93a40 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1391,6 +1391,10 @@ static enum page_references folio_check_references(struct folio *folio, if (vm_flags & VM_LOCKED) return PAGEREF_ACTIVATE; + /* rmap lock contention: rotate */ + if (referenced_ptes == -1) + return PAGEREF_KEEP; + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -2499,8 +2503,9 @@ static void shrink_active_list(unsigned long nr_to_scan, } } + /* Referenced or rmap lock contention: rotate */ if (folio_referenced(folio, 0, sc->target_mem_cgroup, - &vm_flags)) { + &vm_flags) != 0) { /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So -- cgit v1.2.3 From 018160ad314d75b1409129b2247b614a9f35894c Mon Sep 17 00:00:00 2001 From: Wang Cheng Date: Thu, 19 May 2022 14:08:54 -0700 Subject: mm/mempolicy: fix uninit-value in mpol_rebind_policy() mpol_set_nodemask()(mm/mempolicy.c) does not set up nodemask when pol->mode is MPOL_LOCAL. Check pol->mode before access pol->w.cpuset_mems_allowed in mpol_rebind_policy()(mm/mempolicy.c). 
BUG: KMSAN: uninit-value in mpol_rebind_policy mm/mempolicy.c:352 [inline] BUG: KMSAN: uninit-value in mpol_rebind_task+0x2ac/0x2c0 mm/mempolicy.c:368 mpol_rebind_policy mm/mempolicy.c:352 [inline] mpol_rebind_task+0x2ac/0x2c0 mm/mempolicy.c:368 cpuset_change_task_nodemask kernel/cgroup/cpuset.c:1711 [inline] cpuset_attach+0x787/0x15e0 kernel/cgroup/cpuset.c:2278 cgroup_migrate_execute+0x1023/0x1d20 kernel/cgroup/cgroup.c:2515 cgroup_migrate kernel/cgroup/cgroup.c:2771 [inline] cgroup_attach_task+0x540/0x8b0 kernel/cgroup/cgroup.c:2804 __cgroup1_procs_write+0x5cc/0x7a0 kernel/cgroup/cgroup-v1.c:520 cgroup1_tasks_write+0x94/0xb0 kernel/cgroup/cgroup-v1.c:539 cgroup_file_write+0x4c2/0x9e0 kernel/cgroup/cgroup.c:3852 kernfs_fop_write_iter+0x66a/0x9f0 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:2162 [inline] new_sync_write fs/read_write.c:503 [inline] vfs_write+0x1318/0x2030 fs/read_write.c:590 ksys_write+0x28b/0x510 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __x64_sys_write+0xdb/0x120 fs/read_write.c:652 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: slab_post_alloc_hook mm/slab.h:524 [inline] slab_alloc_node mm/slub.c:3251 [inline] slab_alloc mm/slub.c:3259 [inline] kmem_cache_alloc+0x902/0x11c0 mm/slub.c:3264 mpol_new mm/mempolicy.c:293 [inline] do_set_mempolicy+0x421/0xb70 mm/mempolicy.c:853 kernel_set_mempolicy mm/mempolicy.c:1504 [inline] __do_sys_set_mempolicy mm/mempolicy.c:1510 [inline] __se_sys_set_mempolicy+0x44c/0xb60 mm/mempolicy.c:1507 __x64_sys_set_mempolicy+0xd8/0x110 mm/mempolicy.c:1507 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae KMSAN: uninit-value in mpol_rebind_task (2) https://syzkaller.appspot.com/bug?id=d6eb90f952c2a5de9ea718a1b873c55cb13b59dc This patch seems to fix below bug too. KMSAN: uninit-value in mpol_rebind_mm (2) https://syzkaller.appspot.com/bug?id=f2fecd0d7013f54ec4162f60743a2b28df40926b The uninit-value is pol->w.cpuset_mems_allowed in mpol_rebind_policy(). When syzkaller reproducer runs to the beginning of mpol_new(), mpol_new() mm/mempolicy.c do_mbind() mm/mempolicy.c kernel_mbind() mm/mempolicy.c `mode` is 1(MPOL_PREFERRED), nodes_empty(*nodes) is `true` and `flags` is 0. Then mode = MPOL_LOCAL; ... policy->mode = mode; policy->flags = flags; will be executed. So in mpol_set_nodemask(), mpol_set_nodemask() mm/mempolicy.c do_mbind() kernel_mbind() pol->mode is 4 (MPOL_LOCAL), that `nodemask` in `pol` is not initialized, which will be accessed in mpol_rebind_policy(). 
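As an illustration of the trigger path traced above, the following userspace sketch installs exactly such a policy. It is hedged: it is not the syzkaller reproducer, the raw syscall is used only to mirror the kernel_set_mempolicy() path in the stack trace, MPOL_PREFERRED is spelled out from the "mode is 1" value quoted in the analysis, and the final cpuset step that actually reaches mpol_rebind_policy() is only indicated in a comment.

  #define _GNU_SOURCE
  #include <sys/syscall.h>
  #include <unistd.h>

  #define MPOL_PREFERRED 1        /* "mode is 1 (MPOL_PREFERRED)" above */

  int main(void)
  {
          unsigned long empty_mask = 0;

          /* MPOL_PREFERRED with an empty nodemask is downgraded to MPOL_LOCAL,
           * leaving pol->w.cpuset_mems_allowed uninitialized. */
          syscall(SYS_set_mempolicy, MPOL_PREFERRED, &empty_mask,
                  8 * sizeof(empty_mask));

          /* Attaching this task to a cpuset whose mems_allowed later changes is
           * what would reach mpol_rebind_policy() and the uninitialized read. */
          pause();
          return 0;
  }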
Link: https://lkml.kernel.org/r/20220512123428.fq3wofedp6oiotd4@ppc.localdomain Signed-off-by: Wang Cheng Reported-by: Tested-by: Cc: David Rientjes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ed055e34d20c..d39b01fd52fe 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -351,7 +351,7 @@ static void mpol_rebind_preferred(struct mempolicy *pol, */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { - if (!pol) + if (!pol || pol->mode == MPOL_LOCAL) return; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) -- cgit v1.2.3 From d14f5efadd846dbde561bd734318de6a9e6b26e6 Mon Sep 17 00:00:00 2001 From: Luo Meng Date: Fri, 13 May 2022 10:52:25 +0800 Subject: tmpfs: fix undefined-behaviour in shmem_reconfigure() When shmem_reconfigure() calls __percpu_counter_compare(), the second parameter is unsigned long long. But in the definition of __percpu_counter_compare(), the second parameter is s64. So when __percpu_counter_compare() executes abs(count - rhs), UBSAN shows the following warning: ================================================================================ UBSAN: Undefined behaviour in lib/percpu_counter.c:209:6 signed integer overflow: 0 - -9223372036854775808 cannot be represented in type 'long long int' CPU: 1 PID: 9636 Comm: syz-executor.2 Tainted: G ---------r- - 4.18.0 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack home/install/linux-rh-3-10/lib/dump_stack.c:77 [inline] dump_stack+0x125/0x1ae home/install/linux-rh-3-10/lib/dump_stack.c:117 ubsan_epilogue+0xe/0x81 home/install/linux-rh-3-10/lib/ubsan.c:159 handle_overflow+0x19d/0x1ec home/install/linux-rh-3-10/lib/ubsan.c:190 __percpu_counter_compare+0x124/0x140 home/install/linux-rh-3-10/lib/percpu_counter.c:209 percpu_counter_compare home/install/linux-rh-3-10/./include/linux/percpu_counter.h:50 [inline] shmem_remount_fs+0x1ce/0x6b0 home/install/linux-rh-3-10/mm/shmem.c:3530 do_remount_sb+0x11b/0x530 home/install/linux-rh-3-10/fs/super.c:888 do_remount home/install/linux-rh-3-10/fs/namespace.c:2344 [inline] do_mount+0xf8d/0x26b0 home/install/linux-rh-3-10/fs/namespace.c:2844 ksys_mount+0xad/0x120 home/install/linux-rh-3-10/fs/namespace.c:3075 __do_sys_mount home/install/linux-rh-3-10/fs/namespace.c:3089 [inline] __se_sys_mount home/install/linux-rh-3-10/fs/namespace.c:3086 [inline] __x64_sys_mount+0xbf/0x160 home/install/linux-rh-3-10/fs/namespace.c:3086 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x6a/0xdf RIP: 0033:0x46b5e9 Code: 5d db fa ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 2b db fa ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f54d5f22c68 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 000000000077bf60 RCX: 000000000046b5e9 RDX: 0000000000000000 RSI: 0000000020000000 RDI: 0000000000000000 RBP: 000000000077bf60 R08: 0000000020000140 R09: 0000000000000000 R10: 00000000026740a4 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffd1fb1592f R14: 00007f54d5f239c0 R15: 000000000077bf6c ================================================================================ [akpm@linux-foundation.org: tweak error message text] Link: 
https://lkml.kernel.org/r/20220513025225.2678727-1-luomeng12@huawei.com Signed-off-by: Luo Meng Cc: Hugh Dickins Cc: Yu Kuai Signed-off-by: Andrew Morton --- mm/shmem.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 67a3f3b05fb2..7f41a24ed382 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3476,6 +3476,10 @@ static int shmem_reconfigure(struct fs_context *fc) raw_spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; + if (ctx->blocks > S64_MAX) { + err = "Number of blocks too large"; + goto out; + } if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; -- cgit v1.2.3 From 3645b5ec0ad644942e363c9a695a6f4c783c0a43 Mon Sep 17 00:00:00 2001 From: Fanjun Kong Date: Mon, 16 May 2022 11:00:42 +0800 Subject: mm/page_owner.c: add missing __initdata attribute This patch fixes two issues: 1. Add __initdata attribute according to include/linux/init.h: For initialized data: You should insert __initdata between the variable name and equal sign followed by value 2. Fix below error reported by checkpatch.pl: ERROR: do not initialise statics to false Special thanks to Muchun Song :) Link: https://lkml.kernel.org/r/20220516030039.1487005-1-bh1scw@gmail.com Signed-off-by: Fanjun Kong Suggested-by: Muchun Song Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 19bc559e4904..d808901e27b2 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -34,7 +34,7 @@ struct page_owner { pid_t tgid; }; -static bool page_owner_enabled = false; +static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); static depot_stack_handle_t dummy_handle; -- cgit v1.2.3 From 10e0f7530205799e7e971aba699a7cb3a47456de Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 19 May 2022 14:08:54 -0700 Subject: mm/page_alloc: fix tracepoint mm_page_alloc_zone_locked() Currently, trace point mm_page_alloc_zone_locked() doesn't show correct information. First, when alloc_flag has ALLOC_HARDER/ALLOC_CMA, page can be allocated from MIGRATE_HIGHATOMIC/MIGRATE_CMA. Nevertheless, tracepoint use requested migration type not MIGRATE_HIGHATOMIC and MIGRATE_CMA. Second, after commit 44042b4498728 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists") percpu-list can store high order pages. But trace point determine whether it is a refiil of percpu-list by comparing requested order and 0. To handle these problems, make mm_page_alloc_zone_locked() only be called by __rmqueue_smallest with correct migration type. With a new argument called percpu_refill, it can show roughly whether it is a refill of percpu-list. 
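For context on the second problem, a rough sketch of pcp_allowed_order() — paraphrased from that series rather than quoted, so treat the exact body as an assumption — shows why comparing the requested order with 0 no longer identifies a per-cpu list refill:

  /* Illustrative sketch only, not a verbatim copy of mm/page_alloc.c. */
  static inline bool pcp_allowed_order(unsigned int order)
  {
          if (order <= PAGE_ALLOC_COSTLY_ORDER)   /* small orders, up to 8 pages */
                  return true;
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
          if (order == pageblock_order)           /* THP-sized per-cpu batches */
                  return true;
  #endif
          return false;
  }

This is the helper that the new trace call in __rmqueue_smallest() combines with "migratetype < MIGRATE_PCPTYPES" to derive percpu_refill.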
Link: https://lkml.kernel.org/r/20220512025307.57924-1-vvghjk1234@gmail.com Signed-off-by: Wonhyuk Yang Acked-by: Mel Gorman Cc: Baik Song An Cc: Hong Yeon Kim Cc: Taeung Song Cc: Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 14 +++++++++----- mm/page_alloc.c | 13 +++++-------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 71c141804222..f76668305ac5 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -229,20 +229,23 @@ TRACE_EVENT(mm_page_alloc, DECLARE_EVENT_CLASS(mm_page, - TP_PROTO(struct page *page, unsigned int order, int migratetype), + TP_PROTO(struct page *page, unsigned int order, int migratetype, + int percpu_refill), - TP_ARGS(page, order, migratetype), + TP_ARGS(page, order, migratetype, percpu_refill), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( int, migratetype ) + __field( int, percpu_refill ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->migratetype = migratetype; + __entry->percpu_refill = percpu_refill; ), TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d", @@ -250,14 +253,15 @@ DECLARE_EVENT_CLASS(mm_page, __entry->pfn != -1UL ? __entry->pfn : 0, __entry->order, __entry->migratetype, - __entry->order == 0) + __entry->percpu_refill) ); DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked, - TP_PROTO(struct page *page, unsigned int order, int migratetype), + TP_PROTO(struct page *page, unsigned int order, int migratetype, + int percpu_refill), - TP_ARGS(page, order, migratetype) + TP_ARGS(page, order, migratetype, percpu_refill) ); TRACE_EVENT(mm_page_pcpu_drain, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a4a8fbbd6217..10305b10fe93 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2466,6 +2466,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, del_page_from_free_list(page, zone, current_order); expand(zone, page, order, current_order, migratetype); set_pcppage_migratetype(page, migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && + migratetype < MIGRATE_PCPTYPES); return page; } @@ -2989,7 +2992,7 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, zone_page_state(zone, NR_FREE_PAGES) / 2) { page = __rmqueue_cma_fallback(zone, order); if (page) - goto out; + return page; } } retry: @@ -3002,9 +3005,6 @@ retry: alloc_flags)) goto retry; } -out: - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } @@ -3723,11 +3723,8 @@ struct page *rmqueue(struct zone *preferred_zone, * reserved for high-order atomic allocation, so order-0 * request should skip it. */ - if (order > 0 && alloc_flags & ALLOC_HARDER) { + if (order > 0 && alloc_flags & ALLOC_HARDER) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } if (!page) { page = __rmqueue(zone, order, migratetype, alloc_flags); if (!page) -- cgit v1.2.3 From 3f913fc5f9745613088d3c569778c9813ab9c129 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 19 May 2022 14:08:55 -0700 Subject: mm: fix missing handler for __GFP_NOWARN We expect no warnings to be issued when we specify __GFP_NOWARN, but currently in paths like alloc_pages() and kmalloc(), there are still some warnings printed, fix it. 
But for some warnings that report usage problems, we don't deal with them. If such warnings are printed, then we should fix the usage problems. Such as the following case: WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); [zhengqi.arch@bytedance.com: v2] Link: https://lkml.kernel.org/r/20220511061951.1114-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20220510113809.80626-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Akinobu Mita Cc: Vlastimil Babka Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 2 ++ lib/fault-inject.c | 3 +++ mm/failslab.c | 3 +++ mm/internal.h | 15 +++++++++++++++ mm/page_alloc.c | 18 ++++++++++-------- 5 files changed, 33 insertions(+), 8 deletions(-) diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 2d04f6448cde..9f6e25467844 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -20,6 +20,7 @@ struct fault_attr { atomic_t space; unsigned long verbose; bool task_filter; + bool no_warn; unsigned long stacktrace_depth; unsigned long require_start; unsigned long require_end; @@ -39,6 +40,7 @@ struct fault_attr { .ratelimit_state = RATELIMIT_STATE_INIT_DISABLED, \ .verbose = 2, \ .dname = NULL, \ + .no_warn = false, \ } #define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER diff --git a/lib/fault-inject.c b/lib/fault-inject.c index ce12621b4275..423784d9c058 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -41,6 +41,9 @@ EXPORT_SYMBOL_GPL(setup_fault_attr); static void fail_dump(struct fault_attr *attr) { + if (attr->no_warn) + return; + if (attr->verbose > 0 && __ratelimit(&attr->ratelimit_state)) { printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure.\n" "name %pd, interval %lu, probability %lu, " diff --git a/mm/failslab.c b/mm/failslab.c index f92fed91ac23..58df9789f1d2 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -30,6 +30,9 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) return false; + if (gfpflags & __GFP_NOWARN) + failslab.attr.no_warn = true; + return should_fail(&failslab.attr, s->object_size); } diff --git a/mm/internal.h b/mm/internal.h index 6d188161b20e..64e61b032dac 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -35,6 +35,21 @@ struct folio_batch; /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) +/* + * Different from WARN_ON_ONCE(), no warning will be issued + * when we specify __GFP_NOWARN. 
+ */ +#define WARN_ON_ONCE_GFP(cond, gfp) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) + void page_writeback_init(void); static inline void *folio_raw_mapping(struct folio *folio) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 10305b10fe93..267599dd9706 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3786,6 +3786,9 @@ static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; + if (gfp_mask & __GFP_NOWARN) + fail_page_alloc.attr.no_warn = true; + return should_fail(&fail_page_alloc.attr, 1 << order); } @@ -4334,7 +4337,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, */ /* Exhausted what can be done so it's blame time */ - if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { + if (out_of_memory(&oc) || + WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) { *did_some_progress = 1; /* @@ -5108,7 +5112,7 @@ nopage: * All existing users of the __GFP_NOFAIL are blockable, so warn * of any new users that actually require GFP_NOWAIT */ - if (WARN_ON_ONCE(!can_direct_reclaim)) + if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask)) goto fail; /* @@ -5116,7 +5120,7 @@ nopage: * because we cannot reclaim anything and only can loop waiting * for somebody to do a work for us */ - WARN_ON_ONCE(current->flags & PF_MEMALLOC); + WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask); /* * non failing costly orders are a hard requirement which we @@ -5124,7 +5128,7 @@ nopage: * so that we can identify them and convert them to something * else. */ - WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); + WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask); /* * Help non-failing allocations by giving them access to memory @@ -5370,10 +5374,8 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, * There are several places where we assume that the order value is sane * so bail out early if the request is out of bound. 
*/ - if (unlikely(order >= MAX_ORDER)) { - WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); + if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp)) return NULL; - } gfp &= gfp_allowed_mask; /* @@ -9025,7 +9027,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, lru_cache_enable(); if (ret < 0) { - if (ret == -EBUSY) + if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); return ret; -- cgit v1.2.3 From 37462a920392cb86541650a6f4121155f11f1199 Mon Sep 17 00:00:00 2001 From: Christophe de Dinechin Date: Thu, 14 Apr 2022 17:08:54 +0200 Subject: nodemask.h: fix compilation error with GCC12 With gcc version 12.0.1 20220401 (Red Hat 12.0.1-0), building with defconfig results in the following compilation error: | CC mm/swapfile.o | mm/swapfile.c: In function `setup_swap_info': | mm/swapfile.c:2291:47: error: array subscript -1 is below array bounds | of `struct plist_node[]' [-Werror=array-bounds] | 2291 | p->avail_lists[i].prio = 1; | | ~~~~~~~~~~~~~~^~~ | In file included from mm/swapfile.c:16: | ./include/linux/swap.h:292:27: note: while referencing `avail_lists' | 292 | struct plist_node avail_lists[]; /* | | ^~~~~~~~~~~ This is due to the compiler detecting that the mask in node_states[__state] could theoretically be zero, which would lead to first_node() returning -1 through find_first_bit. I believe that the warning/error is legitimate. I first tried adding a test to check that the node mask is not emtpy, since a similar test exists in the case where MAX_NUMNODES == 1. However, adding the if statement causes other warnings to appear in for_each_cpu_node_but, because it introduces a dangling else ambiguity. And unfortunately, GCC is not smart enough to detect that the added test makes the case where (node) == -1 impossible, so it still complains with the same message. This is why I settled on replacing that with a harmless, but relatively useless (node) >= 0 test. Based on the warning for the dangling else, I also decided to fix the case where MAX_NUMNODES == 1 by moving the condition inside the for loop. It will still only be tested once. This ensures that the meaning of an else following for_each_node_mask or derivatives would not silently have a different meaning depending on the configuration. Link: https://lkml.kernel.org/r/20220414150855.2407137-3-dinechin@redhat.com Signed-off-by: Christophe de Dinechin Signed-off-by: Christophe de Dinechin Reviewed-by: Andrew Morton Cc: Ben Segall Cc: "Michael S. 
Tsirkin" Cc: Steven Rostedt Cc: Ingo Molnar Cc: Mel Gorman Cc: Dietmar Eggemann Cc: Vincent Guittot Cc: Paolo Bonzini Cc: Daniel Bristot de Oliveira Cc: Jason Wang Cc: Zhen Lei Cc: Juri Lelli Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 567c3ddba2c4..c6199dbe2591 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -375,14 +375,13 @@ static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, } #if MAX_NUMNODES > 1 -#define for_each_node_mask(node, mask) \ - for ((node) = first_node(mask); \ - (node) < MAX_NUMNODES; \ - (node) = next_node((node), (mask))) +#define for_each_node_mask(node, mask) \ + for ((node) = first_node(mask); \ + (node >= 0) && (node) < MAX_NUMNODES; \ + (node) = next_node((node), (mask))) #else /* MAX_NUMNODES == 1 */ -#define for_each_node_mask(node, mask) \ - if (!nodes_empty(mask)) \ - for ((node) = 0; (node) < 1; (node)++) +#define for_each_node_mask(node, mask) \ + for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++) #endif /* MAX_NUMNODES */ /* -- cgit v1.2.3 From 2b132903de7124dd9a758be0c27562e91a510848 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 11 May 2022 12:46:53 +0300 Subject: tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate Fixes following sparse warnings: CHECK mm/vmscan.c mm/vmscan.c: note: in included file (through include/trace/trace_events.h, include/trace/define_trace.h, include/trace/events/vmscan.h): ./include/trace/events/vmscan.h:281:1: sparse: warning: cast to restricted isolate_mode_t ./include/trace/events/vmscan.h:281:1: sparse: warning: restricted isolate_mode_t degrades to integer Link: https://lkml.kernel.org/r/e85d7ff2-fd10-53f8-c24e-ba0458439c1b@openvz.org Signed-off-by: Vasily Averin Acked-by: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 408c86244d64..d2123dd960d5 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -297,7 +297,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __field(unsigned long, nr_scanned) __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) - __field(isolate_mode_t, isolate_mode) + __field(unsigned int, isolate_mode) __field(int, lru) ), @@ -308,7 +308,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->nr_scanned = nr_scanned; __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; - __entry->isolate_mode = isolate_mode; + __entry->isolate_mode = (__force unsigned int)isolate_mode; __entry->lru = lru; ), -- cgit v1.2.3 From 02e34fff195d3a5f67cbb553795dc109a37d1dcf Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 17 May 2022 22:51:20 +0800 Subject: mm: damon: use HPAGE_PMD_SIZE Use HPAGE_PMD_SIZE instead of open coding. 
Link: https://lkml.kernel.org/r/20220517145120.118523-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 3 +-- mm/damon/paddr.c | 2 +- mm/damon/vaddr.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index e346cc10d143..10ef20b2003f 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -73,8 +73,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) } #ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, - addr + ((1UL) << HPAGE_PMD_SHIFT))) + if (mmu_notifier_clear_young(mm, addr, addr + HPAGE_PMD_SIZE)) referenced = true; #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 21474ae63bc7..b40ff5811bb2 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -106,7 +106,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, result->accessed = pmd_young(*pvmw.pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); - result->page_sz = ((1UL) << HPAGE_PMD_SHIFT); + result->page_sz = HPAGE_PMD_SIZE; #else WARN_ON_ONCE(1); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 77f326323e74..59e1653799f8 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -442,7 +442,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, if (pmd_young(*pmd) || !page_is_idle(page) || mmu_notifier_test_young(walk->mm, addr)) { - *priv->page_sz = ((1UL) << HPAGE_PMD_SHIFT); + *priv->page_sz = HPAGE_PMD_SIZE; priv->young = true; } put_page(page); -- cgit v1.2.3 From 7fb6378701dc0d8f19c1ac4623b55f5125f0e286 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 22 May 2022 16:18:51 +0200 Subject: cgroup: fix an error handling path in alloc_pagecache_max_30M() If the first goto is taken, 'fd' is not opened yet (and is un-initialized). So a direct return is safer. Link: https://lkml.kernel.org/r/628312312eb40e0e39463a2c06415fde5295c716.1653229120.git.christophe.jaillet@wanadoo.fr Fixes: c1a31a2f7a9c ("cgroup: fix racy check in alloc_pagecache_max_30M() helper function") Signed-off-by: Christophe JAILLET Reviewed-by: Andrew Morton Cc: Dan Carpenter Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Muchun Song Cc: Tejun Heo Cc: Zefan Li Cc: Shuah Khan Cc: David Vernet Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 6ab94317c87b..44a974ec472c 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -574,7 +574,7 @@ static int alloc_pagecache_max_30M(const char *cgroup, void *arg) high = cg_read_long(cgroup, "memory.high"); max = cg_read_long(cgroup, "memory.max"); if (high != MB(30) && max != MB(30)) - goto cleanup; + return -1; fd = get_temp_fd(); if (fd < 0) -- cgit v1.2.3 From e384200e70664cd690cdadb72b2a1bc9dfcdec1a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 21 May 2022 19:53:04 -0700 Subject: mm/shmem: fix shmem folio swapoff hang Shmem swapoff makes no progress: the index to indices is not incremented. But "ret" is no longer a return value, so use folio_batch_count() instead. 
Link: https://lkml.kernel.org/r/c32bee8a-f0aa-245-f94e-24dd271924fa@google.com Fixes: da08e9b79323 ("mm/shmem: convert shmem_swapin_page() to shmem_swapin_folio()") Signed-off-by: Hugh Dickins Reviewed-by: Miaohe Lin Tested-by: Miaohe Lin Cc: Matthew Wilcox Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7f41a24ed382..b8169beff226 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1164,7 +1164,6 @@ static int shmem_find_swap_entries(struct address_space *mapping, XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; swp_entry_t entry; - unsigned int ret = 0; rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { @@ -1178,7 +1177,7 @@ static int shmem_find_swap_entries(struct address_space *mapping, if (swp_type(entry) != type) continue; - indices[ret] = xas.xa_index; + indices[folio_batch_count(fbatch)] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; -- cgit v1.2.3 From 6140ae41effe0fff80ff3d6f1bfdff64aa06e9b4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 22 May 2022 13:40:27 -0700 Subject: zram: fix Kconfig dependency warning ZSMALLOC depends on MMU so ZRAM should also depend on MMU since 'select' does not follow any dependency chains. Fixes this Kconfig warning: WARNING: unmet direct dependencies detected for ZSMALLOC Depends on [n]: MMU [=n] Selected by [y]: - ZRAM [=y] && BLK_DEV [=y] && BLOCK [=y] && SYSFS [=y] && (CRYPTO_LZO [=y] || CRYPTO_ZSTD [=m] || CRYPTO_LZ4 [=m] || CRYPTO_LZ4HC [=n] || CRYPTO_842 [=n]) Link: https://lkml.kernel.org/r/20220522204027.22964-1-rdunlap@infradead.org Fixes: b3fbd58fcbb10 ("mm: Kconfig: simplify zswap configuration") Signed-off-by: Randy Dunlap Acked-by: Johannes Weiner Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jens Axboe Signed-off-by: Andrew Morton --- drivers/block/zram/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index e4163d4b936b..d4100b0c083e 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 config ZRAM tristate "Compressed RAM block device support" - depends on BLOCK && SYSFS + depends on BLOCK && SYSFS && MMU depends on CRYPTO_LZO || CRYPTO_ZSTD || CRYPTO_LZ4 || CRYPTO_LZ4HC || CRYPTO_842 select ZSMALLOC help -- cgit v1.2.3 From bb5ced41a658e51bb72d3dc651b28e344e001ac9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sat, 21 May 2022 15:41:03 +0800 Subject: MAINTAINERS: add Muchun as co-maintainer for HugeTLB I have been focusing on mm for the past two years. e.g. developing, fixing bugs, reviewing related to HugeTLB system. I would like to help Mike and other people working on HugeTLB by reviewing their work. When I first introduced the vmemmmap reduction, I forgot to update MAINTAINERS file. Let's update it as well. And rename "HUGETLB FILESYSTEM" to "HUGETLB SUBSYSTEM" since some files are not only related to filesystem but also memory management (the name of FILESYSTEM cannot cover this area). 
Link: https://lkml.kernel.org/r/20220521074103.79468-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Signed-off-by: Andrew Morton --- MAINTAINERS | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 78c57046fa93..d8c18f80bcf3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9019,16 +9019,20 @@ S: Orphan F: Documentation/networking/device_drivers/ethernet/huawei/hinic.rst F: drivers/net/ethernet/huawei/hinic/ -HUGETLB FILESYSTEM +HUGETLB SUBSYSTEM M: Mike Kravetz +M: Muchun Song L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages F: Documentation/admin-guide/mm/hugetlbpage.rst F: Documentation/vm/hugetlbfs_reserv.rst +F: Documentation/vm/vmemmap_dedup.rst F: fs/hugetlbfs/ F: include/linux/hugetlb.h F: mm/hugetlb.c +F: mm/hugetlb_vmemmap.c +F: mm/hugetlb_vmemmap.h HVA ST MEDIA DRIVER M: Jean-Christophe Trotin -- cgit v1.2.3 From 88ee134320b8311ca7a00630e5ba013cd0239350 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 24 May 2022 15:47:56 -0400 Subject: mm: fix a potential infinite loop in start_isolate_page_range() In isolate_single_pageblock() called by start_isolate_page_range(), there are some pageblock isolation issues causing a potential infinite loop when isolating a page range. This is reported by Qian Cai. 1. the pageblock was isolated by just changing pageblock migratetype without checking unmovable pages. Calling set_migratetype_isolate() to isolate pageblock properly. 2. an off-by-one error caused migrating pages unnecessarily, since the page is not crossing pageblock boundary. 3. migrating a compound page across pageblock boundary then splitting the free page later has a small race window that the free page might be allocated again, so that the code will try again, causing an potential infinite loop. Temporarily set the to-be-migrated page's pageblock to MIGRATE_ISOLATE to prevent that and bail out early if no free page is found after page migration. An additional fix to split_free_page() aims to avoid crashing in __free_one_page(). When the free page is split at the specified split_pfn_offset, free_page_order should check both the first bit of free_page_pfn and the last bit of split_pfn_offset and use the smaller one. For example, if free_page_pfn=0x10000, split_pfn_offset=0xc000, free_page_order should first be 0x8000 then 0x4000, instead of 0x4000 then 0x8000, which the original algorithm did. 
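The arithmetic in that example can be checked with a small standalone program. This is a hedged userspace sketch, not kernel code: GCC builtins stand in for __ffs()/__fls(), the pfn is taken to be non-zero, and shrinking split_pfn_offset by the chunk just freed on each iteration is an assumption made purely for illustration.

  #include <stdio.h>

  int main(void)
  {
          unsigned long pfn = 0x10000, offset = 0xc000;

          while (offset) {
                  int low   = __builtin_ctzl(pfn);           /* lowest set bit, like __ffs() */
                  int high  = 63 - __builtin_clzl(offset);   /* highest set bit, like __fls() */
                  int order = low < high ? low : high;

                  printf("free order %2d (0x%lx pages) at pfn 0x%lx\n",
                         order, 1UL << order, pfn);
                  pfn    += 1UL << order;
                  offset -= 1UL << order;
          }
          return 0;   /* prints order 15 (0x8000 pages), then order 14 (0x4000 pages) */
  }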
[akpm@linux-foundation.org: suppress min() warning] Link: https://lkml.kernel.org/r/20220524194756.1698351-1-zi.yan@sent.com Fixes: b2c9e2fbba3253 ("mm: make alloc_contig_range work at pageblock granularity") Signed-off-by: Zi Yan Reported-by: Qian Cai Cc: Christophe Leroy Cc: David Hildenbrand Cc: Eric Ren Cc: Mel Gorman Cc: Mike Rapoport Cc: Minchan Kim Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 7 ++++++- mm/page_isolation.c | 52 ++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 267599dd9706..bc93a82e51e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1114,13 +1114,18 @@ void split_free_page(struct page *free_page, unsigned long flags; int free_page_order; + if (split_pfn_offset == 0) + return; + spin_lock_irqsave(&zone->lock, flags); del_page_from_free_list(free_page, zone, order); for (pfn = free_page_pfn; pfn < free_page_pfn + (1UL << order);) { int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); - free_page_order = ffs(split_pfn_offset) - 1; + free_page_order = min_t(int, + pfn ? __ffs(pfn) : order, + __fls(split_pfn_offset)); __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, mt, FPI_NONE); pfn += 1UL << free_page_order; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index b3f074d1682e..c643c8420809 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -283,6 +283,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * isolate_single_pageblock() -- tries to isolate a pageblock that might be * within a free or in-use page. * @boundary_pfn: pageblock-aligned pfn that a page might cross + * @flags: isolation flags * @gfp_flags: GFP flags used for migrating pages * @isolate_before: isolate the pageblock before the boundary_pfn * @@ -298,14 +299,15 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * either. The function handles this by splitting the free page or migrating * the in-use page then splitting the free page. 
*/ -static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, - bool isolate_before) +static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + gfp_t gfp_flags, bool isolate_before) { unsigned char saved_mt; unsigned long start_pfn; unsigned long isolate_pageblock; unsigned long pfn; struct zone *zone; + int ret; VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages)); @@ -325,7 +327,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, zone->zone_start_pfn); saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); - set_pageblock_migratetype(pfn_to_page(isolate_pageblock), MIGRATE_ISOLATE); + ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, + isolate_pageblock, isolate_pageblock + pageblock_nr_pages); + + if (ret) + return ret; /* * Bail out early when the to-be-isolated pageblock does not form @@ -374,7 +380,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, struct page *head = compound_head(page); unsigned long head_pfn = page_to_pfn(head); - if (head_pfn + nr_pages < boundary_pfn) { + if (head_pfn + nr_pages <= boundary_pfn) { pfn = head_pfn + nr_pages; continue; } @@ -386,7 +392,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) { int order; unsigned long outer_pfn; - int ret; + int page_mt = get_pageblock_migratetype(page); + bool isolate_page = !is_migrate_isolate_page(page); struct compact_control cc = { .nr_migratepages = 0, .order = -1, @@ -399,9 +406,31 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, }; INIT_LIST_HEAD(&cc.migratepages); + /* + * XXX: mark the page as MIGRATE_ISOLATE so that + * no one else can grab the freed page after migration. + * Ideally, the page should be freed as two separate + * pages to be added into separate migratetype free + * lists. + */ + if (isolate_page) { + ret = set_migratetype_isolate(page, page_mt, + flags, head_pfn, head_pfn + nr_pages); + if (ret) + goto failed; + } + ret = __alloc_contig_migrate_range(&cc, head_pfn, head_pfn + nr_pages); + /* + * restore the page's migratetype so that it can + * be split into separate migratetype free lists + * later. 
+ */ + if (isolate_page) + unset_migratetype_isolate(page, page_mt); + if (ret) goto failed; /* @@ -417,10 +446,9 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, order = 0; outer_pfn = pfn; while (!PageBuddy(pfn_to_page(outer_pfn))) { - if (++order >= MAX_ORDER) { - outer_pfn = pfn; - break; - } + /* stop if we cannot find the free page */ + if (++order >= MAX_ORDER) + goto failed; outer_pfn &= ~0UL << order; } pfn = outer_pfn; @@ -435,7 +463,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, return 0; failed: /* restore the original migratetype */ - set_pageblock_migratetype(pfn_to_page(isolate_pageblock), saved_mt); + unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt); return -EBUSY; } @@ -496,12 +524,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int ret; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, gfp_flags, false); + ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false); if (ret) return ret; /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, gfp_flags, true); + ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); return ret; -- cgit v1.2.3 From 185194f19134716f32b2ab04ac76cf93ea85706a Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 17 May 2022 09:58:14 +0300 Subject: include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion" Redefines __def_gfpflag_names array according to akpm@, willy@ and Joe Perches recommendations. Link: https://lkml.kernel.org/r/6f811e19-41c6-f3e8-fca6-23a19a62e313@openvz.org Fixes: fe573327ffb1 ("tracing: incorrect gfp_t conversion") Signed-off-by: Vasily Averin Cc: Matthew Wilcox Cc: Joe Perches Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/trace/events/mmflags.h | 84 +++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index fcb1ffb2cc8d..e87cb2b80ed3 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -13,49 +13,51 @@ * Thus most bits set go first. 
*/ -#define __def_gfpflag_names \ - {(__force unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ - {(__force unsigned long)GFP_TRANSHUGE_LIGHT, "GFP_TRANSHUGE_LIGHT"}, \ - {(__force unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ - {(__force unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ - {(__force unsigned long)GFP_USER, "GFP_USER"}, \ - {(__force unsigned long)GFP_KERNEL_ACCOUNT, "GFP_KERNEL_ACCOUNT"}, \ - {(__force unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \ - {(__force unsigned long)GFP_NOFS, "GFP_NOFS"}, \ - {(__force unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ - {(__force unsigned long)GFP_NOIO, "GFP_NOIO"}, \ - {(__force unsigned long)GFP_NOWAIT, "GFP_NOWAIT"}, \ - {(__force unsigned long)GFP_DMA, "GFP_DMA"}, \ - {(__force unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \ - {(__force unsigned long)GFP_DMA32, "GFP_DMA32"}, \ - {(__force unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \ - {(__force unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ - {(__force unsigned long)__GFP_IO, "__GFP_IO"}, \ - {(__force unsigned long)__GFP_FS, "__GFP_FS"}, \ - {(__force unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ - {(__force unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \ - {(__force unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ - {(__force unsigned long)__GFP_NORETRY, "__GFP_NORETRY"}, \ - {(__force unsigned long)__GFP_COMP, "__GFP_COMP"}, \ - {(__force unsigned long)__GFP_ZERO, "__GFP_ZERO"}, \ - {(__force unsigned long)__GFP_NOMEMALLOC, "__GFP_NOMEMALLOC"}, \ - {(__force unsigned long)__GFP_MEMALLOC, "__GFP_MEMALLOC"}, \ - {(__force unsigned long)__GFP_HARDWALL, "__GFP_HARDWALL"}, \ - {(__force unsigned long)__GFP_THISNODE, "__GFP_THISNODE"}, \ - {(__force unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ - {(__force unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ - {(__force unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ - {(__force unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ - {(__force unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ - {(__force unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ - {(__force unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(__force unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ +#define gfpflag_string(flag) {(__force unsigned long)flag, #flag} + +#define __def_gfpflag_names \ + gfpflag_string(GFP_TRANSHUGE), \ + gfpflag_string(GFP_TRANSHUGE_LIGHT), \ + gfpflag_string(GFP_HIGHUSER_MOVABLE), \ + gfpflag_string(GFP_HIGHUSER), \ + gfpflag_string(GFP_USER), \ + gfpflag_string(GFP_KERNEL_ACCOUNT), \ + gfpflag_string(GFP_KERNEL), \ + gfpflag_string(GFP_NOFS), \ + gfpflag_string(GFP_ATOMIC), \ + gfpflag_string(GFP_NOIO), \ + gfpflag_string(GFP_NOWAIT), \ + gfpflag_string(GFP_DMA), \ + gfpflag_string(__GFP_HIGHMEM), \ + gfpflag_string(GFP_DMA32), \ + gfpflag_string(__GFP_HIGH), \ + gfpflag_string(__GFP_ATOMIC), \ + gfpflag_string(__GFP_IO), \ + gfpflag_string(__GFP_FS), \ + gfpflag_string(__GFP_NOWARN), \ + gfpflag_string(__GFP_RETRY_MAYFAIL), \ + gfpflag_string(__GFP_NOFAIL), \ + gfpflag_string(__GFP_NORETRY), \ + gfpflag_string(__GFP_COMP), \ + gfpflag_string(__GFP_ZERO), \ + gfpflag_string(__GFP_NOMEMALLOC), \ + gfpflag_string(__GFP_MEMALLOC), \ + gfpflag_string(__GFP_HARDWALL), \ + gfpflag_string(__GFP_THISNODE), \ + gfpflag_string(__GFP_RECLAIMABLE), \ + gfpflag_string(__GFP_MOVABLE), \ + gfpflag_string(__GFP_ACCOUNT), \ + gfpflag_string(__GFP_WRITE), \ + gfpflag_string(__GFP_RECLAIM), \ + gfpflag_string(__GFP_DIRECT_RECLAIM), \ + gfpflag_string(__GFP_KSWAPD_RECLAIM), \ + gfpflag_string(__GFP_ZEROTAGS) 
#ifdef CONFIG_KASAN_HW_TAGS -#define __def_gfpflag_names_kasan , \ - {(__force unsigned long)__GFP_SKIP_ZERO, "__GFP_SKIP_ZERO"}, \ - {(__force unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ - {(__force unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} +#define __def_gfpflag_names_kasan , \ + gfpflag_string(__GFP_SKIP_ZERO), \ + gfpflag_string(__GFP_SKIP_KASAN_POISON), \ + gfpflag_string(__GFP_SKIP_KASAN_UNPOISON) #else #define __def_gfpflag_names_kasan #endif -- cgit v1.2.3 From e5c3f619a04d9e191de66d5f8e069367f73a6487 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 17 May 2022 09:55:51 +0300 Subject: include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace" Fix sparse warning about incorrect gfp_t cast. Link: https://lkml.kernel.org/r/001979f3-e978-0998-cbed-61a4a2ac87b8@openvz.org Fixes: f67bed134a05 ("percpu: improve percpu_alloc_percpu event trace") Signed-off-by: Vasily Averin Signed-off-by: Andrew Morton --- include/trace/events/percpu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h index e989cefc0def..5b8211ca8950 100644 --- a/include/trace/events/percpu.h +++ b/include/trace/events/percpu.h @@ -28,7 +28,7 @@ TRACE_EVENT(percpu_alloc_percpu, __field( int, off ) __field( void __percpu *, ptr ) __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) + __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->call_site = call_site; @@ -40,7 +40,7 @@ TRACE_EVENT(percpu_alloc_percpu, __entry->off = off; __entry->ptr = ptr; __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; + __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s", -- cgit v1.2.3 From 83d7d04f9d2ef354858b2a8444aee38e41ec1699 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 18 May 2022 15:31:05 +0800 Subject: mm/kfence: print disabling or re-enabling message By printing information, we can friendly prompt the status change information of kfence by dmesg and record by syslog. Also, set kfence_enabled to false only when needed. Link: https://lkml.kernel.org/r/20220518073105.3160335-1-liu.yun@linux.dev Signed-off-by: Jackie Liu Co-developed-by: Marco Elver Signed-off-by: Marco Elver Reviewed-by: Marco Elver Signed-off-by: Andrew Morton --- mm/kfence/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 6e69986c3f0d..2b9f89749154 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -69,8 +69,11 @@ static int param_set_sample_interval(const char *val, const struct kernel_param if (ret < 0) return ret; - if (!num) /* Using 0 to indicate KFENCE is disabled. */ + /* Using 0 to indicate KFENCE is disabled. */ + if (!num && READ_ONCE(kfence_enabled)) { + pr_info("disabled\n"); WRITE_ONCE(kfence_enabled, false); + } *((unsigned long *)kp->arg) = num; @@ -898,6 +901,7 @@ static int kfence_enable_late(void) WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); + pr_info("re-enabled\n"); return 0; } -- cgit v1.2.3 From 3f1509c57b1ba5646de0fb8d81bd7107aec22257 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 18 May 2022 15:09:11 -0400 Subject: Revert "mm/vmscan: never demote for memcg reclaim" This reverts commit 3a235693d3930e1276c8d9cc0ca5807ef292cf0a. 
Its premise was that cgroup reclaim cares about freeing memory inside the cgroup, and demotion just moves them around within the cgroup limit. Hence, pages from toptier nodes should be reclaimed directly. However, with NUMA balancing now doing tier promotions, demotion is part of the page aging process. Global reclaim demotes the coldest toptier pages to secondary memory, where their life continues and from which they have a chance to get promoted back. Essentially, tiered memory systems have an LRU order that spans multiple nodes. When cgroup reclaims pages coming off the toptier directly, there can be colder pages on lower tier nodes that were demoted by global reclaim. This is an aging inversion, not unlike if cgroups were to reclaim directly from the active lists while there are inactive pages. Proactive reclaim is another factor. The goal of that it is to offload colder pages from expensive RAM to cheaper storage. When lower tier memory is available as an intermediate layer, we want offloading to take advantage of it instead of bypassing to storage. Revert the patch so that cgroups respect the LRU order spanning the memory hierarchy. Of note is a specific undercommit scenario, where all cgroup limits in the system add up to <= available toptier memory. In that case, shuffling pages out to lower tiers first to reclaim them from there is inefficient. This is something could be optimized/short-circuited later on (although care must be taken not to accidentally recreate the aging inversion). Let's ensure correctness first. Link: https://lkml.kernel.org/r/20220518190911.82400-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Dave Hansen Reviewed-by: Yang Shi Acked-by: Roman Gushchin Reviewed-by: "Huang, Ying" Reviewed-by: Muchun Song Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Tim Chen Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/vmscan.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 887edcd93a40..76bca20679d9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -528,13 +528,8 @@ static bool can_demote(int nid, struct scan_control *sc) { if (!numa_demotion_enabled) return false; - if (sc) { - if (sc->no_demotion) - return false; - /* It is pointless to do demotion in memcg reclaim */ - if (cgroup_reclaim(sc)) - return false; - } + if (sc && sc->no_demotion) + return false; if (next_demotion_node(nid) == NUMA_NO_NODE) return false; -- cgit v1.2.3 From 33776141b81296e604a39a8065b28b89c61c7f74 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 18 May 2022 13:43:16 -0700 Subject: selftests: vm: add process_mrelease tests Introduce process_mrelease syscall sanity tests which include tests which expect to fail: - process_mrelease with invalid pidfd and flags inputs - process_mrelease on a live process with no pending signals and valid process_mrelease usage which is expected to succeed. Because process_mrelease has to be used against a process with a pending SIGKILL, it's possible that the process exits before process_mrelease gets called. In such cases we retry the test with a victim that allocates twice more memory up to 1GB. This would require the victim process to spend more time during exit and process_mrelease has a better chance of catching the process before it exits and succeeding. On success the test reports the amount of memory the child had to allocate for reaping to succeed. 
Sample output: $ mrelease_test Success reaping a child with 1MB of memory allocations On failure the test reports the failure. Sample outputs: $ mrelease_test All process_mrelease attempts failed! $ mrelease_test process_mrelease: Invalid argument Link: https://lkml.kernel.org/r/20220518204316.13131-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shuah Khan Acked-by: Christian Brauner (Microsoft) Reviewed-by: Muhammad Usama Anjum Cc: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Cc: Roman Gushchin Cc: Minchan Kim Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: David Hildenbrand Cc: Jann Horn Cc: Shakeel Butt Cc: Peter Xu Cc: John Hubbard Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/mrelease_test.c | 200 +++++++++++++++++++++++++++++ tools/testing/selftests/vm/run_vmtests.sh | 2 + 4 files changed, 204 insertions(+) create mode 100644 tools/testing/selftests/vm/mrelease_test.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 3cb4fa771ec2..6c2ac4208c27 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -10,6 +10,7 @@ map_populate thuge-gen compaction_test mlock2-tests +mrelease_test mremap_dontunmap mremap_test on-fault-limit diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index f1228370e99b..8111a33e4824 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -44,6 +44,7 @@ TEST_GEN_FILES += memfd_secret TEST_GEN_FILES += migration TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests +TEST_GEN_FILES += mrelease_test TEST_GEN_FILES += mremap_dontunmap TEST_GEN_FILES += mremap_test TEST_GEN_FILES += on-fault-limit diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c new file mode 100644 index 000000000000..96671c2f7d48 --- /dev/null +++ b/tools/testing/selftests/vm/mrelease_test.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2022 Google LLC + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "util.h" + +#include "../kselftest.h" + +#ifndef __NR_pidfd_open +#define __NR_pidfd_open -1 +#endif + +#ifndef __NR_process_mrelease +#define __NR_process_mrelease -1 +#endif + +#define MB(x) (x << 20) +#define MAX_SIZE_MB 1024 + +static int alloc_noexit(unsigned long nr_pages, int pipefd) +{ + int ppid = getppid(); + int timeout = 10; /* 10sec timeout to get killed */ + unsigned long i; + char *buf; + + buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, 0, 0); + if (buf == MAP_FAILED) { + perror("mmap failed, halting the test"); + return KSFT_FAIL; + } + + for (i = 0; i < nr_pages; i++) + *((unsigned long *)(buf + (i * PAGE_SIZE))) = i; + + /* Signal the parent that the child is ready */ + if (write(pipefd, "", 1) < 0) { + perror("write"); + return KSFT_FAIL; + } + + /* Wait to be killed (when reparenting happens) */ + while (getppid() == ppid && timeout > 0) { + sleep(1); + timeout--; + } + + munmap(buf, nr_pages * PAGE_SIZE); + + return (timeout > 0) ? KSFT_PASS : KSFT_FAIL; +} + +/* The process_mrelease calls in this test are expected to fail */ +static void run_negative_tests(int pidfd) +{ + /* Test invalid flags. Expect to fail with EINVAL error code. 
*/ + if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || + errno != EINVAL) { + perror("process_mrelease with wrong flags"); + exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + } + /* + * Test reaping while process is alive with no pending SIGKILL. + * Expect to fail with EINVAL error code. + */ + if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { + perror("process_mrelease on a live process"); + exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + } +} + +static int child_main(int pipefd[], size_t size) +{ + int res; + + /* Allocate and fault-in memory and wait to be killed */ + close(pipefd[0]); + res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]); + close(pipefd[1]); + return res; +} + +int main(void) +{ + int pipefd[2], pidfd; + bool success, retry; + size_t size; + pid_t pid; + char byte; + int res; + + /* Test a wrong pidfd */ + if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { + perror("process_mrelease with wrong pidfd"); + exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + } + + /* Start the test with 1MB child memory allocation */ + size = 1; +retry: + /* + * Pipe for the child to signal when it's done allocating + * memory + */ + if (pipe(pipefd)) { + perror("pipe"); + exit(KSFT_FAIL); + } + pid = fork(); + if (pid < 0) { + perror("fork"); + close(pipefd[0]); + close(pipefd[1]); + exit(KSFT_FAIL); + } + + if (pid == 0) { + /* Child main routine */ + res = child_main(pipefd, size); + exit(res); + } + + /* + * Parent main routine: + * Wait for the child to finish allocations, then kill and reap + */ + close(pipefd[1]); + /* Block until the child is ready */ + res = read(pipefd[0], &byte, 1); + close(pipefd[0]); + if (res < 0) { + perror("read"); + if (!kill(pid, SIGKILL)) + waitpid(pid, NULL, 0); + exit(KSFT_FAIL); + } + + pidfd = syscall(__NR_pidfd_open, pid, 0); + if (pidfd < 0) { + perror("pidfd_open"); + if (!kill(pid, SIGKILL)) + waitpid(pid, NULL, 0); + exit(KSFT_FAIL); + } + + /* Run negative tests which require a live child */ + run_negative_tests(pidfd); + + if (kill(pid, SIGKILL)) { + perror("kill"); + exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + } + + success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); + if (!success) { + /* + * If we failed to reap because the child exited too soon, + * before we could call process_mrelease. Double child's memory + * which causes it to spend more time on cleanup and increases + * our chances of reaping its memory before it exits. + * Retry until we succeed or reach MAX_SIZE_MB. + */ + if (errno == ESRCH) { + retry = (size <= MAX_SIZE_MB); + } else { + perror("process_mrelease"); + waitpid(pid, NULL, 0); + exit(errno == ENOSYS ? 
KSFT_SKIP : KSFT_FAIL); + } + } + + /* Cleanup to prevent zombies */ + if (waitpid(pid, NULL, 0) < 0) { + perror("waitpid"); + exit(KSFT_FAIL); + } + close(pidfd); + + if (!success) { + if (retry) { + size *= 2; + goto retry; + } + printf("All process_mrelease attempts failed!\n"); + exit(KSFT_FAIL); + } + + printf("Success reaping a child with %zuMB of memory allocations\n", + size); + return KSFT_PASS; +} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index a2302b5faaf2..41fce8bea929 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -141,6 +141,8 @@ run_test ./mlock-random-test run_test ./mlock2-tests +run_test ./mrelease_test + run_test ./mremap_test run_test ./thuge-gen -- cgit v1.2.3 From 3413b2c872c3fe5cf3b11d4e73de55098c81c7a3 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:11:44 +0200 Subject: ksm: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Link: https://lkml.kernel.org/r/20220521111145.81697-94-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/ksm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 9ee82c9bce94..54f78c9eecae 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1590,7 +1590,7 @@ again: * the rbtree instead as a regular stable_node (in * order to collapse the stable_node chain if a single * stable_node dup was found in it). In such case the - * stable_node is overwritten by the calleee to point + * stable_node is overwritten by the callee to point * to the stable_node_dup that was collapsed in the * stable rbtree and stable_node will be equal to * stable_node_dup like if the chain never existed. -- cgit v1.2.3 From 75c96ccea2e1de1342996722ee505d2cadedc0dd Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:11:30 +0200 Subject: selftests/vm/pkeys: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Link: https://lkml.kernel.org/r/20220521111145.81697-80-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/protection_keys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 2d0ae88665db..291bc1e07842 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1523,7 +1523,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) /* * Reset the shadow, assuming that the above mprotect() * correctly changed PKRU, but to an unknown value since - * the actual alllocated pkey is unknown. + * the actual allocated pkey is unknown. */ shadow_pkey_reg = __read_pkey_reg(); -- cgit v1.2.3 From 3d3921ed271b0e23d60c91fcad089f2f5e71af98 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Sat, 21 May 2022 14:43:13 +0500 Subject: selftests: vm: add migration to the .gitignore Add newly added migration test object to .gitignore file. 
Link: https://lkml.kernel.org/r/20220521094313.166505-1-usama.anjum@collabora.com Fixes: 0c2d08728470 ("mm: add selftests for migration entries") Signed-off-by: Muhammad Usama Anjum Reviewed-by: Alistair Popple Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 6c2ac4208c27..31e5eea2a9b9 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -9,6 +9,7 @@ map_hugetlb map_populate thuge-gen compaction_test +migration mlock2-tests mrelease_test mremap_dontunmap -- cgit v1.2.3 From 9aa1af954db02a3228763015356684a169503c68 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 21 May 2022 16:38:23 +0800 Subject: selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests Patch series "selftests: vm: a few fixup patches". This series contains three fixup patches for vm selftests. They are independent. Please see the patches. This patch (of 3): Currently, ksm_tests operates "merge_across_nodes" with NUMA either enabled or disabled. In a system with NUMA disabled, these operations will fail and output a misleading report given "merge_across_nodes" does not exist in sysfs: ---------------------------- running ./ksm_tests -M -p 10 ---------------------------- f /sys/kernel/mm/ksm/merge_across_nodes fopen: No such file or directory Cannot save default tunables [FAIL] ---------------------- So check numa_available() before those operations to skip them if NUMA is disabled. Link: https://lkml.kernel.org/r/20220521083825.319654-1-patrick.wang.shcn@gmail.com Link: https://lkml.kernel.org/r/20220521083825.319654-2-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/ksm_tests.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index fd85f15869d1..2fcf24312da8 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -221,7 +221,8 @@ static bool assert_ksm_pages_count(long dupl_page_count) static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) { if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) || - ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || + numa_available() ? 0 : + ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) || ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) || ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) || @@ -236,7 +237,8 @@ static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) static int ksm_restore(struct ksm_sysfs *ksm_sysfs) { if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) || - ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || + numa_available() ? 
0 : + ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) || ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) || ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) || @@ -720,7 +722,8 @@ int main(int argc, char *argv[]) if (ksm_write_sysfs(KSM_FP("run"), 2) || ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || - ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || + numa_available() ? 0 : + ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) return KSFT_FAIL; -- cgit v1.2.3 From ccd2a1201d267bf6f1950bf31cfd55fb4e17a231 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 21 May 2022 16:38:24 +0800 Subject: selftests: vm: add "test_hmm.sh" to TEST_FILES The "test_hmm.sh" file used by run_vmtests.sh is not installed into INSTALL_PATH. Thus run_vmtests.sh cannot call it in INSTALL_PATH: --------------------------- running ./test_hmm.sh smoke --------------------------- ./run_vmtests.sh: line 74: ./test_hmm.sh: No such file or directory [FAIL] ----------------------- Add "test_hmm.sh" to TEST_FILES so that it will be installed. Link: https://lkml.kernel.org/r/20220521083825.319654-3-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 8111a33e4824..064bfae6dd0d 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -92,6 +92,7 @@ endif TEST_PROGS := run_vmtests.sh TEST_FILES := test_vmalloc.sh +TEST_FILES += test_hmm.sh KSFT_KHDR_INSTALL := 1 include ../lib.mk -- cgit v1.2.3 From 0598739900071feff82b89f1515f963a6889b330 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 21 May 2022 16:38:25 +0800 Subject: selftests: vm: add the "settings" file with timeout variable The default "timeout" for one kselftest is 45 seconds, while some cases in run_vmtests.sh require more time. This will cause testing timeout like: not ok 4 selftests: vm: run_vmtests.sh # TIMEOUT 45 seconds Therefore, add the "settings" file with timeout variable so users can set the "timeout" value. Link: https://lkml.kernel.org/r/20220521083825.319654-4-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/settings | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/vm/settings diff --git a/tools/testing/selftests/vm/settings b/tools/testing/selftests/vm/settings new file mode 100644 index 000000000000..9abfc60e9e6f --- /dev/null +++ b/tools/testing/selftests/vm/settings @@ -0,0 +1 @@ +timeout=45 -- cgit v1.2.3 From f403f22f8ccb12860b2b62fec3173c6ccd45938b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 20 May 2022 10:18:33 +0800 Subject: mm: kfence: use PAGE_ALIGNED helper Use PAGE_ALIGNED macro instead of IS_ALIGNED and passing PAGE_SIZE. 
Link: https://lkml.kernel.org/r/20220520021833.121405-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Muchun Song Cc: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kfence/kfence_test.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 1b50f70a4c0f..21aafc95f4ab 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -296,10 +296,9 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat if (policy == ALLOCATE_ANY) return alloc; - if (policy == ALLOCATE_LEFT && IS_ALIGNED((unsigned long)alloc, PAGE_SIZE)) + if (policy == ALLOCATE_LEFT && PAGE_ALIGNED(alloc)) return alloc; - if (policy == ALLOCATE_RIGHT && - !IS_ALIGNED((unsigned long)alloc, PAGE_SIZE)) + if (policy == ALLOCATE_RIGHT && !PAGE_ALIGNED(alloc)) return alloc; } else if (policy == ALLOCATE_NONE) return alloc; -- cgit v1.2.3
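For readers unfamiliar with the helper named in the subject above: in the kernel, PAGE_ALIGNED(addr) is effectively IS_ALIGNED() applied to the address with PAGE_SIZE, so the hunk changes spelling, not behaviour. Below is only a rough userspace sketch of the same check; the helper names, the sysconf()-based page size and the posix_memalign() test pointer are illustrative assumptions, not the kernel's definitions.

/* Minimal userspace sketch, assuming a power-of-two page size. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* roughly what the kernel's IS_ALIGNED() does for power-of-two alignments */
static int is_aligned(uintptr_t x, uintptr_t a)
{
	return (x & (a - 1)) == 0;
}

/* roughly what PAGE_ALIGNED(addr) expands to: IS_ALIGNED(addr, PAGE_SIZE) */
static int page_aligned(const void *addr)
{
	return is_aligned((uintptr_t)addr, (uintptr_t)sysconf(_SC_PAGESIZE));
}

int main(void)
{
	void *p;

	/* posix_memalign() hands back a page-aligned pointer to test with */
	if (posix_memalign(&p, sysconf(_SC_PAGESIZE), 16))
		return 1;

	printf("p aligned: %d, p+1 aligned: %d\n",
	       page_aligned(p), page_aligned((char *)p + 1));
	free(p);
	return 0;
}

On a system with 4KB pages this should print "p aligned: 1, p+1 aligned: 0", which mirrors the ALLOCATE_LEFT/ALLOCATE_RIGHT distinction the kfence test makes above.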