/* mm/pagewalk.c */

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * Check and consume the current skip status of the page table walker.
 *
 * "Skip" here means skipping the lower level walk, and it is decided for
 * each entry independently. For example, when walk_pmd_range() handles a
 * pmd_trans_huge entry we do not have to walk over the ptes under that pmd,
 * but that decision must not affect the walk over ptes under other pmds.
 * That is why @walk->skip is reset once it has been tested.
 */
static bool skip_lower_level_walking(struct mm_walk *walk)
{
	if (walk->skip) {
		walk->skip = 0;
		return true;
	}
	return false;
}
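
/*
 * Illustrative sketch (not part of this file): a pmd_entry() callback that
 * consumes a trans-huge pmd itself and then sets @walk->skip so that
 * walk_pte_range() is not called for the ptes under it. The
 * handle_huge_pmd() helper is hypothetical and locking is omitted for
 * brevity.
 *
 *	static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
 *				     unsigned long end, struct mm_walk *walk)
 *	{
 *		if (pmd_trans_huge(*pmd)) {
 *			handle_huge_pmd(pmd, addr, end, walk);
 *			walk->skip = 1;
 *		}
 *		return 0;
 *	}
 */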

static int walk_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mm_struct *mm = walk->mm;
	pte_t *pte;
	pte_t *orig_pte;
	spinlock_t *ptl;
	int err = 0;

	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	do {
		if (pte_none(*pte)) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, addr + PAGE_SIZE,
							walk);
			if (err)
				break;
			continue;
		}
		/*
		 * Only pte_none() entries are filtered out above, so swap
		 * entries are still passed down; callers must handle them
		 * themselves in walk->pte_entry().
		 */
		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
	} while (pte++, addr += PAGE_SIZE, addr < end);
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
	return addr == end ? 0 : err;
}
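
/*
 * Illustrative sketch (not part of this file): a pte_entry() callback that
 * distinguishes present ptes from swap entries, since only pte_none()
 * entries are filtered out above. "struct example_stats" is a hypothetical
 * per-walk structure passed in via walk->private; is_swap_pte() comes from
 * <linux/swapops.h>.
 *
 *	static int example_pte_entry(pte_t *pte, unsigned long addr,
 *				     unsigned long end, struct mm_walk *walk)
 *	{
 *		struct example_stats *stats = walk->private;
 *
 *		if (pte_present(*pte))
 *			stats->present++;
 *		else if (is_swap_pte(*pte))
 *			stats->swapped++;
 *		return 0;
 *	}
 */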

static int walk_pmd_range(pud_t *pud, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	int err = 0;

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);

		if (pmd_none(*pmd)) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		if (walk->pmd_entry) {
			err = walk->pmd_entry(pmd, addr, next, walk);
			if (skip_lower_level_walking(walk))
				continue;
			if (err)
				break;
		}

		if (walk->pte_entry) {
			if (walk->vma) {
				split_huge_page_pmd(walk->vma, addr, pmd);
				if (pmd_trans_unstable(pmd))
					goto again;
			}
			err = walk_pte_range(pmd, addr, next, walk);
			if (err)
				break;
		}
	} while (pmd++, addr = next, addr < end);

	return err;
}

static int walk_pud_range(pgd_t *pgd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	int err = 0;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);

		if (pud_none_or_clear_bad(pud)) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		if (walk->pud_entry) {
			err = walk->pud_entry(pud, addr, next, walk);
			if (skip_lower_level_walking(walk))
				continue;
			if (err)
				break;
		}

		if (walk->pmd_entry || walk->pte_entry) {
			err = walk_pmd_range(pud, addr, next, walk);
			if (err)
				break;
		}
	} while (pud++, addr = next, addr < end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	int err = 0;

	pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);

		if (pgd_none_or_clear_bad(pgd)) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		if (walk->pgd_entry) {
			err = walk->pgd_entry(pgd, addr, next, walk);
			if (skip_lower_level_walking(walk))
				continue;
			if (err)
				break;
		}

		if (walk->pud_entry || walk->pmd_entry || walk->pte_entry) {
			err = walk_pud_range(pgd, addr, next, walk);
			if (err)
				break;
		}
	} while (pgd++, addr = next, addr < end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct mm_struct *mm = walk->mm;
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask);
		ptl = huge_pte_lock(h, mm, pte);
		/*
		 * Non-present entries (e.g. hugetlb migration entries) are
		 * passed down as well; callers must handle them themselves
		 * in walk->hugetlb_entry().
		 */
		if (pte && walk->hugetlb_entry)
			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
		spin_unlock(ptl);
		if (err)
			break;
	} while (addr = next, addr != end);
	cond_resched();
	return err;
}
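
/*
 * Illustrative sketch (not part of this file): a hugetlb_entry() callback
 * that counts mapped base pages. As with pte_entry(), non-present entries
 * (e.g. hugetlb migration entries) are passed down, so the callback checks
 * the entry itself. The counter in walk->private is hypothetical.
 *
 *	static int example_hugetlb_entry(pte_t *pte, unsigned long hmask,
 *					 unsigned long addr, unsigned long end,
 *					 struct mm_walk *walk)
 *	{
 *		unsigned long *mapped = walk->private;
 *
 *		if (pte_present(*pte))
 *			*mapped += (end - addr) >> PAGE_SHIFT;
 *		return 0;
 *	}
 */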

#else /* CONFIG_HUGETLB_PAGE */
static inline int walk_hugetlb_range(unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it. When we decide to skip it, we set @walk->skip to 1.
 * The return value controls whether the page table walk continues
 * (zero) or is aborted (non-zero).
 *
 * A default check (only the VM_PFNMAP check for now) is used when the
 * caller does not define a test_walk() callback.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	if (walk->test_walk)
		return walk->test_walk(start, end, walk);

	/*
	 * Do not walk over vma(VM_PFNMAP), because we have no valid struct
	 * page backing a VM_PFNMAP range. See also commit a9ff785e4437.
	 */
	if (vma->vm_flags & VM_PFNMAP)
		walk->skip = 1;
	return 0;
}
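
/*
 * Illustrative sketch (not part of this file): a test_walk() callback that
 * restricts the walk to vmas that are not file-backed by setting
 * @walk->skip for everything else. Returning non-zero here would abort
 * the whole walk instead of just skipping this vma.
 *
 *	static int example_test_walk(unsigned long start, unsigned long end,
 *				     struct mm_walk *walk)
 *	{
 *		if (walk->vma->vm_file)
 *			walk->skip = 1;
 *		return 0;
 *	}
 */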

static int __walk_page_range(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;

	if (vma && is_vm_hugetlb_page(vma)) {
		if (walk->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 *
 * Recursively walk the page table tree of the process represented by
 * @walk->mm within the virtual address range [@start, @end). While
 * walking, caller-specific callback functions are called for each entry.
 *
 * Before starting to walk a page table, some callers want to check whether
 * they really want to walk over the vma at all (for example by checking
 * vm_flags). walk_page_test() and @walk->test_walk() perform that check.
 *
 * If any callback returns a non-zero value, the page table walk is aborted
 * immediately and that value is propagated back to the caller. Note that
 * the meaning of a positive return value can be defined by the caller for
 * its own purposes.
 *
 * If the caller defines callbacks for multiple levels, they are called in
 * a depth-first manner, so several callbacks may run for the same address.
 * For example, if a caller defines test_walk(), pmd_entry(), and
 * pte_entry(), the callbacks are called in the order test_walk(),
 * pmd_entry(), then pte_entry().
 * If you do not want to descend to the lower level at some point and would
 * rather move on to the next entry at the same level, set @walk->skip to 1.
 * For example, if you have handled a pmd entry as a trans_huge entry, there
 * is no need to call walk_pte_range() for it, so set @walk->skip to avoid
 * that. We cannot use the callback's return value to decide whether to
 * descend, because the whole range of return values (0, >0, and <0) is
 * already used for other meanings.
 *
 * Each callback can access the vma over which it is currently walking via
 * @walk->vma; @walk->vma is set to NULL while walking outside any vma.
 * If a callback needs access to caller-specific data, @walk->private is
 * the place for it.
 *
 * Callers should hold @walk->mm->mmap_sem. Note that the lower level
 * iterators can take the page table lock during the lowest level
 * iteration and/or in split_huge_page_pmd().
 */
int walk_page_range(unsigned long start, unsigned long end,
		    struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma;
	unsigned long next;

	if (start >= end)
		return -EINVAL;

	if (!walk->mm)
		return -EINVAL;

	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));

	do {
		vma = find_vma(walk->mm, start);
		if (!vma) { /* after the last vma */
			walk->vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside the found vma */
			walk->vma = NULL;
			next = vma->vm_start;
		} else { /* inside the found vma */
			walk->vma = vma;
			next = vma->vm_end;
			err = walk_page_test(start, end, walk);
			if (skip_lower_level_walking(walk))
				continue;
			if (err)
				break;
		}
		err = __walk_page_range(start, next, walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}
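
/*
 * Illustrative usage sketch (not part of this file): counting the present
 * ptes mapped in [start, end) of an mm. The callback and variable names
 * are hypothetical.
 *
 *	static int count_present_pte(pte_t *pte, unsigned long addr,
 *				     unsigned long end, struct mm_walk *walk)
 *	{
 *		unsigned long *count = walk->private;
 *
 *		if (pte_present(*pte))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned long count = 0;
 *	struct mm_walk count_walk = {
 *		.pte_entry	= count_present_pte,
 *		.mm		= mm,
 *		.private	= &count,
 *	};
 *
 *	down_read(&mm->mmap_sem);
 *	walk_page_range(start, end, &count_walk);
 *	up_read(&mm->mmap_sem);
 */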

int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
{
	int err;

	if (!walk->mm)
		return -EINVAL;

	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
	VM_BUG_ON(!vma);
	walk->vma = vma;
	err = walk_page_test(vma->vm_start, vma->vm_end, walk);
	if (skip_lower_level_walking(walk))
		return 0;
	if (err)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, walk);
}