From 80119ef5c8153e0a6cc5edf00c083dc98a9bd348 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 23 May 2008 13:04:31 -0700 Subject: mm: fix atomic_t overflow in vm The atomic_t type is 32bit but a 64bit system can have more than 2^32 pages of virtual address space available. Without this we overflow on ludicrously large mappings Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 74a323d2b850..32dc14cd8900 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -139,7 +139,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, #define K(x) ((x) << (PAGE_SHIFT - 10)) si_meminfo(&i); si_swapinfo(&i); - committed = atomic_read(&vm_committed_space); + committed = atomic_long_read(&vm_committed_space); allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; -- cgit v1.2.3 From c4185a0e019387f5ad6e99009804965531fa1fab Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Fri, 23 May 2008 13:04:47 -0700 Subject: proc: proc_get_inode() should get module only once Any file under /proc/net opened more than once leaked the refcounter on the module it belongs to. The problem is that module_get is called for each file opening while module_put is called only when /proc inode is destroyed. So, lets put module counter if we are dealing with already initialised inode. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=10737 Signed-off-by: Denis V. Lunev Cc: David Miller Cc: Patrick McHardy Acked-by: Pavel Emelyanov Acked-by: Robert Olsson Acked-by: Eric W. Biederman Reported-by: Roland Kletzing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6f4e8dc97da1..b08d10017911 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -425,7 +425,8 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, } } unlock_new_inode(inode); - } + } else + module_put(de->owner); return inode; out_ino: -- cgit v1.2.3 From ca05a99a54db1db5bca72eccb5866d2a86f8517f Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Tue, 27 May 2008 22:05:17 -0700 Subject: capabilities: remain source compatible with 32-bit raw legacy capability support. Source code out there hard-codes a notion of what the _LINUX_CAPABILITY_VERSION #define means in terms of the semantics of the raw capability system calls capget() and capset(). Its unfortunate, but true. Since the confusing header file has been in a released kernel, there is software that is erroneously using 64-bit capabilities with the semantics of 32-bit compatibilities. These recently compiled programs may suffer corruption of their memory when sys_getcap() overwrites more memory than they are coded to expect, and the raising of added capabilities when using sys_capset(). As such, this patch does a number of things to clean up the situation for all. It 1. forces the _LINUX_CAPABILITY_VERSION define to always retain its legacy value. 2. adopts a new #define strategy for the kernel's internal implementation of the preferred magic. 3. deprecates v2 capability magic in favor of a new (v3) magic number. The functionality of v3 is entirely equivalent to v2, the only difference being that the v2 magic causes the kernel to log a "deprecated" warning so the admin can find applications that may be using v2 inappropriately. [User space code continues to be encouraged to use the libcap API which protects the application from details like this. libcap-2.10 is the first to support v3 capabilities.] Fixes issue reported in https://bugzilla.redhat.com/show_bug.cgi?id=447518. Thanks to Bojan Smojver for the report. [akpm@linux-foundation.org: s/depreciate/deprecate/g] [akpm@linux-foundation.org: be robust about put_user size] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrew G. Morgan Cc: Serge E. Hallyn Cc: Bojan Smojver Cc: stable@kernel.org Signed-off-by: Andrew Morton Signed-off-by: Chris Wright --- fs/proc/array.c | 2 +- include/linux/capability.h | 29 ++++++++---- kernel/capability.c | 111 +++++++++++++++++++++++++++++---------------- 3 files changed, 95 insertions(+), 47 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 9e3b8c33c24b..797d775e0354 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -288,7 +288,7 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_printf(m, "%s", header); CAP_FOR_EACH_U32(__capi) { seq_printf(m, "%08x", - a->cap[(_LINUX_CAPABILITY_U32S-1) - __capi]); + a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); } seq_printf(m, "\n"); } diff --git a/include/linux/capability.h b/include/linux/capability.h index f4ea0dd9a618..fa830f8de032 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -31,11 +31,11 @@ struct task_struct; #define _LINUX_CAPABILITY_VERSION_1 0x19980330 #define _LINUX_CAPABILITY_U32S_1 1 -#define _LINUX_CAPABILITY_VERSION_2 0x20071026 +#define _LINUX_CAPABILITY_VERSION_2 0x20071026 /* deprecated - use v3 */ #define _LINUX_CAPABILITY_U32S_2 2 -#define _LINUX_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_2 -#define _LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_2 +#define _LINUX_CAPABILITY_VERSION_3 0x20080522 +#define _LINUX_CAPABILITY_U32S_3 2 typedef struct __user_cap_header_struct { __u32 version; @@ -77,10 +77,23 @@ struct vfs_cap_data { } data[VFS_CAP_U32]; }; -#ifdef __KERNEL__ +#ifndef __KERNEL__ + +/* + * Backwardly compatible definition for source code - trapped in a + * 32-bit world. If you find you need this, please consider using + * libcap to untrap yourself... + */ +#define _LINUX_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_1 +#define _LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_1 + +#else + +#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 +#define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 typedef struct kernel_cap_struct { - __u32 cap[_LINUX_CAPABILITY_U32S]; + __u32 cap[_KERNEL_CAPABILITY_U32S]; } kernel_cap_t; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) @@ -351,7 +364,7 @@ typedef struct kernel_cap_struct { */ #define CAP_FOR_EACH_U32(__capi) \ - for (__capi = 0; __capi < _LINUX_CAPABILITY_U32S; ++__capi) + for (__capi = 0; __capi < _KERNEL_CAPABILITY_U32S; ++__capi) # define CAP_FS_MASK_B0 (CAP_TO_MASK(CAP_CHOWN) \ | CAP_TO_MASK(CAP_DAC_OVERRIDE) \ @@ -361,7 +374,7 @@ typedef struct kernel_cap_struct { # define CAP_FS_MASK_B1 (CAP_TO_MASK(CAP_MAC_OVERRIDE)) -#if _LINUX_CAPABILITY_U32S != 2 +#if _KERNEL_CAPABILITY_U32S != 2 # error Fix up hand-coded capability macro initializers #else /* HAND-CODED capability initializers */ @@ -372,7 +385,7 @@ typedef struct kernel_cap_struct { # define CAP_NFSD_SET ((kernel_cap_t){{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), \ CAP_FS_MASK_B1 } }) -#endif /* _LINUX_CAPABILITY_U32S != 2 */ +#endif /* _KERNEL_CAPABILITY_U32S != 2 */ #define CAP_INIT_INH_SET CAP_EMPTY_SET diff --git a/kernel/capability.c b/kernel/capability.c index 39e8193b41ea..cfbe44299488 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -52,6 +52,69 @@ static void warn_legacy_capability_use(void) } } +/* + * Version 2 capabilities worked fine, but the linux/capability.h file + * that accompanied their introduction encouraged their use without + * the necessary user-space source code changes. As such, we have + * created a version 3 with equivalent functionality to version 2, but + * with a header change to protect legacy source code from using + * version 2 when it wanted to use version 1. If your system has code + * that trips the following warning, it is using version 2 specific + * capabilities and may be doing so insecurely. + * + * The remedy is to either upgrade your version of libcap (to 2.10+, + * if the application is linked against it), or recompile your + * application with modern kernel headers and this warning will go + * away. + */ + +static void warn_deprecated_v2(void) +{ + static int warned; + + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses deprecated v2" + " capabilities in a way that may be insecure.\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* + * Version check. Return the number of u32s in each capability flag + * array, or a negative value on error. + */ +static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) +{ + __u32 version; + + if (get_user(version, &header->version)) + return -EFAULT; + + switch (version) { + case _LINUX_CAPABILITY_VERSION_1: + warn_legacy_capability_use(); + *tocopy = _LINUX_CAPABILITY_U32S_1; + break; + case _LINUX_CAPABILITY_VERSION_2: + warn_deprecated_v2(); + /* + * fall through - v3 is otherwise equivalent to v2. + */ + case _LINUX_CAPABILITY_VERSION_3: + *tocopy = _LINUX_CAPABILITY_U32S_3; + break; + default: + if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) + return -EFAULT; + return -EINVAL; + } + + return 0; +} + /* * For sys_getproccap() and sys_setproccap(), any of the three * capability set pointers may be NULL -- indicating that that set is @@ -71,27 +134,13 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { int ret = 0; pid_t pid; - __u32 version; struct task_struct *target; unsigned tocopy; kernel_cap_t pE, pI, pP; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -118,7 +167,7 @@ out: spin_unlock(&task_capability_lock); if (!ret) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; for (i = 0; i < tocopy; i++) { @@ -128,7 +177,7 @@ out: } /* - * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S, + * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability @@ -240,30 +289,16 @@ static inline int cap_set_all(kernel_cap_t *effective, */ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; - __u32 version; struct task_struct *target; int ret; pid_t pid; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -281,7 +316,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) permitted.cap[i] = kdata[i].permitted; inheritable.cap[i] = kdata[i].inheritable; } - while (i < _LINUX_CAPABILITY_U32S) { + while (i < _KERNEL_CAPABILITY_U32S) { effective.cap[i] = 0; permitted.cap[i] = 0; inheritable.cap[i] = 0; -- cgit v1.2.3 From aae8679b0ebcaa92f99c1c3cb0cd651594a43915 Mon Sep 17 00:00:00 2001 From: Thomas Tuttle Date: Thu, 5 Jun 2008 22:46:31 -0700 Subject: pagemap: fix bug in add_to_pagemap, require aligned-length reads of /proc/pid/pagemap Fix a bug in add_to_pagemap. Previously, since pm->out was a char *, put_user was only copying 1 byte of every PFN, resulting in the top 7 bytes of each PFN not being copied. By requiring that reads be a multiple of 8 bytes, I can make pm->out and pm->end u64*s instead of char*s, which makes put_user work properly, and also simplifies the logic in add_to_pagemap a bit. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Thomas Tuttle Cc: Matt Mackall Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 88717c0f941b..17403629e330 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -496,7 +496,7 @@ const struct file_operations proc_clear_refs_operations = { }; struct pagemapread { - char __user *out, *end; + u64 __user *out, *end; }; #define PM_ENTRY_BYTES sizeof(u64) @@ -519,21 +519,11 @@ struct pagemapread { static int add_to_pagemap(unsigned long addr, u64 pfn, struct pagemapread *pm) { - /* - * Make sure there's room in the buffer for an - * entire entry. Otherwise, only copy part of - * the pfn. - */ - if (pm->out + PM_ENTRY_BYTES >= pm->end) { - if (copy_to_user(pm->out, &pfn, pm->end - pm->out)) - return -EFAULT; - pm->out = pm->end; - return PM_END_OF_BUFFER; - } - if (put_user(pfn, pm->out)) return -EFAULT; - pm->out += PM_ENTRY_BYTES; + pm->out++; + if (pm->out >= pm->end) + return PM_END_OF_BUFFER; return 0; } @@ -634,7 +624,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, ret = -EINVAL; /* file position must be aligned */ - if (*ppos % PM_ENTRY_BYTES) + if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_task; ret = 0; @@ -664,8 +654,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out_pages; } - pm.out = buf; - pm.end = buf + count; + pm.out = (u64 *)buf; + pm.end = (u64 *)(buf + count); if (!ptrace_may_attach(task)) { ret = -EIO; @@ -690,9 +680,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (ret == PM_END_OF_BUFFER) ret = 0; /* don't need mmap_sem for these, but this looks cleaner */ - *ppos += pm.out - buf; + *ppos += (char *)pm.out - buf; if (!ret) - ret = pm.out - buf; + ret = (char *)pm.out - buf; } out_pages: -- cgit v1.2.3 From aed5417593ad125283f35513573282139a8664b5 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 5 Jun 2008 22:46:53 -0700 Subject: proc: calculate the correct /proc/ link count This patch: commit e9720acd728a46cb40daa52c99a979f7c4ff195c Author: Pavel Emelyanov Date: Fri Mar 7 11:08:40 2008 -0800 [NET]: Make /proc/net a symlink on /proc/self/net (v3) introduced a /proc/self/net directory without bumping the corresponding link count for /proc/self. This patch replaces the static link count initializations with a call that counts the number of directory entries in the given pid_entry table whenever it is instantiated, and thus relieves the burden of manually keeping the two in sync. [akpm@linux-foundation.org: cleanup] Acked-by: Eric W. Biederman Cc: Pavel Emelyanov Signed-off-by: Vegard Nossum Cc: "David S. Miller" Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index c447e0743a3c..3b455371e7ff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -127,6 +127,25 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = &proc_##OTYPE } ) +/* + * Count the number of hardlinks for the pid_entry table, excluding the . + * and .. links. + */ +static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, + unsigned int n) +{ + unsigned int i; + unsigned int count; + + count = 0; + for (i = 0; i < n; ++i) { + if (S_ISDIR(entries[i].mode)) + ++count; + } + + return count; +} + int maps_protect; EXPORT_SYMBOL(maps_protect); @@ -2585,10 +2604,9 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 5; -#ifdef CONFIG_SECURITY - inode->i_nlink += 1; -#endif + + inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff)); dentry->d_op = &pid_dentry_operations; @@ -2816,10 +2834,9 @@ static struct dentry *proc_task_instantiate(struct inode *dir, inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 4; -#ifdef CONFIG_SECURITY - inode->i_nlink += 1; -#endif + + inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff)); dentry->d_op = &pid_dentry_operations; -- cgit v1.2.3 From bbcdac0c20aa20d1daad41d9c138102b70e5aae4 Mon Sep 17 00:00:00 2001 From: Thomas Tuttle Date: Thu, 5 Jun 2008 22:46:58 -0700 Subject: pagemap: return map count, not reference count, in /proc/kpagecount Since pagemap is all about examining pages mapped into processes' memory spaces, it makes sense for kpagecount to return the map counts, not the reference counts. Signed-off-by: Thomas Tuttle Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 32dc14cd8900..5a16090a6d6e 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -726,7 +726,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, if (!ppage) pcount = 0; else - pcount = atomic_read(&ppage->_count); + pcount = page_mapcount(ppage); if (put_user(pcount, out++)) { ret = -EFAULT; -- cgit v1.2.3 From 4710d1ac4c491dd8a28f57946214c0b5fe73cc87 Mon Sep 17 00:00:00 2001 From: Thomas Tuttle Date: Thu, 5 Jun 2008 22:46:58 -0700 Subject: pagemap: return EINVAL, not EIO, for unaligned reads of kpagecount or kpageflags If the user tries to read from a position that is not a multiple of 8, or read a number of bytes that is not a multiple of 8, they have passed an invalid argument to read, for the purpose of reading these files. It's not an IO error because we didn't encounter any trouble finding the data they asked for. Signed-off-by: Thomas Tuttle Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 5a16090a6d6e..7e277f2ad466 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -716,7 +716,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, pfn = src / KPMSIZE; count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) - return -EIO; + return -EINVAL; while (count > 0) { ppage = NULL; @@ -782,7 +782,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, pfn = src / KPMSIZE; count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) - return -EIO; + return -EINVAL; while (count > 0) { ppage = NULL; -- cgit v1.2.3 From 2165009bdf63f79716a36ad545df14c3cdf958b7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 12 Jun 2008 15:21:47 -0700 Subject: pagemap: pass mm into pagewalkers We need this at least for huge page detection for now, because powerpc needs the vm_area_struct to be able to determine whether a virtual address is referring to a huge page (its pmd_huge() doesn't work). It might also come in handy for some of the other users. Signed-off-by: Dave Hansen Acked-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 44 +++++++++++++++++++++++++------------------- include/linux/mm.h | 17 +++++++++-------- mm/pagewalk.c | 42 ++++++++++++++++++++++-------------------- 3 files changed, 56 insertions(+), 47 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 17403629e330..f0df3109343d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -315,9 +315,9 @@ struct mem_size_stats { }; static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - void *private) + struct mm_walk *walk) { - struct mem_size_stats *mss = private; + struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; pte_t *pte, ptent; spinlock_t *ptl; @@ -365,19 +365,21 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } -static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range }; - static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; struct mem_size_stats mss; int ret; + struct mm_walk smaps_walk = { + .pmd_entry = smaps_pte_range, + .mm = vma->vm_mm, + .private = &mss, + }; memset(&mss, 0, sizeof mss); mss.vma = vma; if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, - &smaps_walk, &mss); + walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); ret = show_map(m, v); if (ret) @@ -426,9 +428,9 @@ const struct file_operations proc_smaps_operations = { }; static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, void *private) + unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = private; + struct vm_area_struct *vma = walk->private; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -452,8 +454,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -476,11 +476,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + static struct mm_walk clear_refs_walk; + memset(&clear_refs_walk, 0, sizeof(clear_refs_walk)); + clear_refs_walk.pmd_entry = clear_refs_pte_range; + clear_refs_walk.mm = mm; down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for (vma = mm->mmap; vma; vma = vma->vm_next) { + clear_refs_walk.private = vma; if (!is_vm_hugetlb_page(vma)) - walk_page_range(mm, vma->vm_start, vma->vm_end, - &clear_refs_walk, vma); + walk_page_range(vma->vm_start, vma->vm_end, + &clear_refs_walk); + } flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -528,9 +534,9 @@ static int add_to_pagemap(unsigned long addr, u64 pfn, } static int pagemap_pte_hole(unsigned long start, unsigned long end, - void *private) + struct mm_walk *walk) { - struct pagemapread *pm = private; + struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; for (addr = start; addr < end; addr += PAGE_SIZE) { @@ -548,9 +554,9 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte) } static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - void *private) + struct mm_walk *walk) { - struct pagemapread *pm = private; + struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; @@ -675,8 +681,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, * user buffer is tracked in "pm", and the walk * will stop when we hit the end of the buffer. */ - ret = walk_page_range(mm, start_vaddr, end_vaddr, - &pagemap_walk, &pm); + ret = walk_page_range(start_vaddr, end_vaddr, + &pagemap_walk); if (ret == PM_END_OF_BUFFER) ret = 0; /* don't need mmap_sem for these, but this looks cleaner */ diff --git a/include/linux/mm.h b/include/linux/mm.h index c31a9cd2a30e..586a943cab01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -760,16 +760,17 @@ unsigned long unmap_vmas(struct mmu_gather **tlb, * (see walk_page_range for more details) */ struct mm_walk { - int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *); - int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *); - int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *); - int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *); - int (*pte_hole)(unsigned long, unsigned long, void *); + int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *); + struct mm_struct *mm; + void *private; }; -int walk_page_range(const struct mm_struct *, unsigned long addr, - unsigned long end, const struct mm_walk *walk, - void *private); +int walk_page_range(unsigned long addr, unsigned long end, + struct mm_walk *walk); void free_pgd_range(struct mmu_gather **tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 0afd2387e507..d5878bed7841 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,14 +3,14 @@ #include static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pte_t *pte; int err = 0; pte = pte_offset_map(pmd, addr); for (;;) { - err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); + err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; addr += PAGE_SIZE; @@ -24,7 +24,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pmd_t *pmd; unsigned long next; @@ -35,15 +35,15 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pmd_entry) - err = walk->pmd_entry(pmd, addr, next, private); + err = walk->pmd_entry(pmd, addr, next, walk); if (!err && walk->pte_entry) - err = walk_pte_range(pmd, addr, next, walk, private); + err = walk_pte_range(pmd, addr, next, walk); if (err) break; } while (pmd++, addr = next, addr != end); @@ -52,7 +52,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, } static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pud_t *pud; unsigned long next; @@ -63,15 +63,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pud_entry) - err = walk->pud_entry(pud, addr, next, private); + err = walk->pud_entry(pud, addr, next, walk); if (!err && (walk->pmd_entry || walk->pte_entry)) - err = walk_pmd_range(pud, addr, next, walk, private); + err = walk_pmd_range(pud, addr, next, walk); if (err) break; } while (pud++, addr = next, addr != end); @@ -85,15 +85,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, * @addr: starting address * @end: ending address * @walk: set of callbacks to invoke for each level of the tree - * @private: private data passed to the callback function * * Recursively walk the page table for the memory area in a VMA, * calling supplied callbacks. Callbacks are called in-order (first * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, * etc.). If lower-level callbacks are omitted, walking depth is reduced. * - * Each callback receives an entry pointer, the start and end of the - * associated range, and a caller-supplied private data pointer. + * Each callback receives an entry pointer and the start and end of the + * associated range, and a copy of the original mm_walk for access to + * the ->private or ->mm fields. * * No locks are taken, but the bottom level iterator will map PTE * directories from highmem if necessary. @@ -101,9 +101,8 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, * If any callback returns a non-zero value, the walk is aborted and * the return value is propagated back to the caller. Otherwise 0 is returned. */ -int walk_page_range(const struct mm_struct *mm, - unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) +int walk_page_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) { pgd_t *pgd; unsigned long next; @@ -112,21 +111,24 @@ int walk_page_range(const struct mm_struct *mm, if (addr >= end) return err; - pgd = pgd_offset(mm, addr); + if (!walk->mm) + return -EINVAL; + + pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pgd_entry) - err = walk->pgd_entry(pgd, addr, next, private); + err = walk->pgd_entry(pgd, addr, next, walk); if (!err && (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) - err = walk_pud_range(pgd, addr, next, walk, private); + err = walk_pud_range(pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); -- cgit v1.2.3 From bcf8039ed45f56013c4afea5520bca7d909e5e61 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 12 Jun 2008 15:21:48 -0700 Subject: pagemap: fix large pages in pagemap We were walking right into huge page areas in the pagemap walker, and calling the pmds pmd_bad() and clearing them. That leaked huge pages. Bad. This patch at least works around that for now. It ignores huge pages in the pagemap walker for the time being, and won't leak those pages. Signed-off-by: Dave Hansen Acked-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0df3109343d..ab8ccc9d14ff 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -553,24 +553,45 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte) return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); } +static unsigned long pte_to_pagemap_entry(pte_t pte) +{ + unsigned long pme = 0; + if (is_swap_pte(pte)) + pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; + else if (pte_present(pte)) + pme = PM_PFRAME(pte_pfn(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; + return pme; +} + static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { + struct vm_area_struct *vma; struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; + /* find the first VMA at or above 'addr' */ + vma = find_vma(walk->mm, addr); for (; addr != end; addr += PAGE_SIZE) { u64 pfn = PM_NOT_PRESENT; - pte = pte_offset_map(pmd, addr); - if (is_swap_pte(*pte)) - pfn = PM_PFRAME(swap_pte_to_pagemap_entry(*pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; - else if (pte_present(*pte)) - pfn = PM_PFRAME(pte_pfn(*pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; - /* unmap so we're not in atomic when we copy to userspace */ - pte_unmap(pte); + + /* check to see if we've left 'vma' behind + * and need a new, higher one */ + if (vma && (addr >= vma->vm_end)) + vma = find_vma(walk->mm, addr); + + /* check that 'vma' actually covers this address, + * and that it isn't a huge page vma */ + if (vma && (vma->vm_start <= addr) && + !is_vm_hugetlb_page(vma)) { + pte = pte_offset_map(pmd, addr); + pfn = pte_to_pagemap_entry(*pte); + /* unmap before userspace copy */ + pte_unmap(pte); + } err = add_to_pagemap(addr, pfn, pm); if (err) return err; -- cgit v1.2.3