diff options
Diffstat (limited to 'fs/proc/task_mmu.c')
-rw-r--r-- | fs/proc/task_mmu.c | 267 |
1 files changed, 227 insertions, 40 deletions
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4540b8f76f16..dbf61f6174f0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -11,6 +11,7 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -54,7 +55,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPTE:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), - (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), @@ -90,10 +91,55 @@ static void pad_len_spaces(struct seq_file *m, int len) seq_printf(m, "%*c", len, ' '); } +#ifdef CONFIG_NUMA +/* + * These functions are for numa_maps but called in generic **maps seq_file + * ->start(), ->stop() ops. + * + * numa_maps scans all vmas under mmap_sem and checks their mempolicy. + * Each mempolicy object is controlled by reference counting. The problem here + * is how to avoid accessing dead mempolicy object. + * + * Because we're holding mmap_sem while reading seq_file, it's safe to access + * each vma's mempolicy, no vma objects will never drop refs to mempolicy. + * + * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy + * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). + * So, without task_lock(), we cannot trust get_vma_policy() because we cannot + * gurantee the task never exits under us. But taking task_lock() around + * get_vma_plicy() causes lock order problem. + * + * To access task->mempolicy without lock, we hold a reference count of an + * object pointed by task->mempolicy and remember it. This will guarantee + * that task->mempolicy points to an alive object or NULL in numa_maps accesses. + */ +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ + struct task_struct *task = priv->task; + + task_lock(task); + priv->task_mempolicy = task->mempolicy; + mpol_get(priv->task_mempolicy); + task_unlock(task); +} +static void release_task_mempolicy(struct proc_maps_private *priv) +{ + mpol_put(priv->task_mempolicy); +} +#else +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ +} +static void release_task_mempolicy(struct proc_maps_private *priv) +{ +} +#endif + static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { if (vma && vma != priv->tail_vma) { struct mm_struct *mm = vma->vm_mm; + release_task_mempolicy(priv); up_read(&mm->mmap_sem); mmput(mm); } @@ -132,7 +178,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) tail_vma = get_gate_vma(priv->task->mm); priv->tail_vma = tail_vma; - + hold_task_mempolicy(priv); /* Start with last addr hint */ vma = find_vma(mm, last_addr); if (last_addr && vma) { @@ -159,6 +205,7 @@ out: if (vma) return vma; + release_task_mempolicy(priv); /* End of vmas has been reached */ m->version = (tail_vma != NULL)? 0: -1UL; up_read(&mm->mmap_sem); @@ -225,7 +272,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) const char *name = NULL; if (file) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; @@ -480,6 +527,57 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } +static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) +{ + /* + * Don't forget to update Documentation/ on changes. + */ + static const char mnemonics[BITS_PER_LONG][2] = { + /* + * In case if we meet a flag we don't know about. + */ + [0 ... (BITS_PER_LONG-1)] = "??", + + [ilog2(VM_READ)] = "rd", + [ilog2(VM_WRITE)] = "wr", + [ilog2(VM_EXEC)] = "ex", + [ilog2(VM_SHARED)] = "sh", + [ilog2(VM_MAYREAD)] = "mr", + [ilog2(VM_MAYWRITE)] = "mw", + [ilog2(VM_MAYEXEC)] = "me", + [ilog2(VM_MAYSHARE)] = "ms", + [ilog2(VM_GROWSDOWN)] = "gd", + [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_DENYWRITE)] = "dw", + [ilog2(VM_LOCKED)] = "lo", + [ilog2(VM_IO)] = "io", + [ilog2(VM_SEQ_READ)] = "sr", + [ilog2(VM_RAND_READ)] = "rr", + [ilog2(VM_DONTCOPY)] = "dc", + [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_ACCOUNT)] = "ac", + [ilog2(VM_NORESERVE)] = "nr", + [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_NONLINEAR)] = "nl", + [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_DONTDUMP)] = "dd", + [ilog2(VM_MIXEDMAP)] = "mm", + [ilog2(VM_HUGEPAGE)] = "hg", + [ilog2(VM_NOHUGEPAGE)] = "nh", + [ilog2(VM_MERGEABLE)] = "mg", + }; + size_t i; + + seq_puts(m, "VmFlags: "); + for (i = 0; i < BITS_PER_LONG; i++) { + if (vma->vm_flags & (1UL << i)) { + seq_printf(m, "%c%c ", + mnemonics[i][0], mnemonics[i][1]); + } + } + seq_putc(m, '\n'); +} + static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; @@ -535,6 +633,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) seq_printf(m, "Nonlinear: %8lu kB\n", mss.nonlinear >> 10); + show_smap_vma_flags(m, vma); + if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task->mm)) ? vma->vm_start : 0; @@ -589,15 +689,63 @@ const struct file_operations proc_tid_smaps_operations = { .release = seq_release_private, }; +/* + * We do not want to have constant page-shift bits sitting in + * pagemap entries and are about to reuse them some time soon. + * + * Here's the "migration strategy": + * 1. when the system boots these bits remain what they are, + * but a warning about future change is printed in log; + * 2. once anyone clears soft-dirty bits via clear_refs file, + * these flag is set to denote, that user is aware of the + * new API and those page-shift bits change their meaning. + * The respective warning is printed in dmesg; + * 3. In a couple of releases we will remove all the mentions + * of page-shift in pagemap entries. + */ + +static bool soft_dirty_cleared __read_mostly; + +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_LAST, +}; + +struct clear_refs_private { + struct vm_area_struct *vma; + enum clear_refs_types type; +}; + +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/vm/soft-dirty.txt for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + set_pte_at(vma->vm_mm, addr, pte, ptent); +#endif +} + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = cp->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; - split_huge_page_pmd(walk->mm, pmd); + split_huge_page_pmd(vma, addr, pmd); if (pmd_trans_unstable(pmd)) return 0; @@ -607,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(ptent)) continue; + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -620,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -#define CLEAR_REFS_ALL 1 -#define CLEAR_REFS_ANON 2 -#define CLEAR_REFS_MAPPED 3 - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -631,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - int type; + enum clear_refs_types type; + int itype; int rv; memset(buffer, 0, sizeof(buffer)); @@ -639,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &type); + rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; - if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; - task = get_proc_task(file->f_path.dentry->d_inode); + + if (type == CLEAR_REFS_SOFT_DIRTY) { + soft_dirty_cleared = true; + pr_warn_once("The pagemap bits 55-60 has changed their meaning! " + "See the linux/Documentation/vm/pagemap.txt for details.\n"); + } + + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); if (mm) { + struct clear_refs_private cp = { + .type = type, + }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, .mm = mm, + .private = &cp, }; down_read(&mm->mmap_sem); + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_start(mm, 0, -1); for (vma = mm->mmap; vma; vma = vma->vm_next) { - clear_refs_walk.private = vma; + cp.vma = vma; if (is_vm_hugetlb_page(vma)) continue; /* @@ -674,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -695,6 +861,7 @@ typedef struct { struct pagemapread { int pos, len; pagemap_entry_t *buffer; + bool v2; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -708,14 +875,17 @@ struct pagemapread { #define PM_PSHIFT_BITS 6 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +/* in "new" pagemap pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) +#define __PM_SOFT_DIRTY (1LL) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 val) @@ -738,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); for (addr = start; addr < end; addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); @@ -748,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame, flags; struct page *page = NULL; + int flags2 = 0; if (pte_present(pte)) { frame = pte_pfn(pte); @@ -767,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, - pmd_t pmd, int offset) +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { /* * Currently pmd for thp is always present because thp can not be @@ -788,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, - pmd_t pmd, int offset) +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { } #endif @@ -806,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + int pmd_flags2; + + pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -833,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT); + pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* check that 'vma' actually covers this address, @@ -841,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pte_to_pagemap_entry(&pme, vma, addr, *pte); + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); /* unmap before userspace copy */ pte_unmap(pte); } @@ -856,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pte_t pte, int offset) { if (pte_present(pte)) *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* This function walks within one hugetlb entry in the single call */ @@ -877,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); err = add_to_pagemap(addr, &pme, pm); if (err) return err; @@ -916,7 +1092,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; struct pagemapread pm; int ret = -ESRCH; @@ -939,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; + pm.v2 = soft_dirty_cleared; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; @@ -1011,9 +1188,18 @@ out: return ret; } +static int pagemap_open(struct inode *inode, struct file *file) +{ + pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " + "to stop being page-shift some time soon. See the " + "linux/Documentation/vm/pagemap.txt for details.\n"); + return 0; +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, + .open = pagemap_open, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ @@ -1080,7 +1266,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return NULL; nid = page_to_nid(page); - if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; @@ -1158,6 +1344,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; + struct task_struct *task = proc_priv->task; struct mm_struct *mm = vma->vm_mm; struct mm_walk walk = {}; struct mempolicy *pol; @@ -1177,8 +1364,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.private = md; walk.mm = mm; - pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol, 0); + pol = get_vma_policy(task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1189,7 +1376,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_printf(m, " heap"); } else { - pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid); + pid_t tid = vm_is_stack(task, vma, is_pid); if (tid != 0) { /* * Thread stack in /proc/PID/task/TID/maps or @@ -1232,7 +1419,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (md->writeback) seq_printf(m, " writeback=%lu", md->writeback); - for_each_node_state(n, N_HIGH_MEMORY) + for_each_node_state(n, N_MEMORY) if (md->node[n]) seq_printf(m, " N%d=%lu", n, md->node[n]); out: |