From 70f6cbb6f9c95535acd327d1ac1ce5fd078cff1e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Dec 2015 00:13:10 -0500 Subject: kernel/*: switch to memdup_user_nul() Signed-off-by: Al Viro --- kernel/sysctl.c | 79 ++++++++++++++++++++------------------------------------- 1 file changed, 28 insertions(+), 51 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d6639e..5faf89ac9ec0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2047,9 +2047,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, void *data) { int *i, vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2078,15 +2077,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first=0) { @@ -2094,11 +2087,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, bool neg; if (write) { - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); if (!left) break; - err = proc_get_long(&kbuf, &left, &lval, &neg, + err = proc_get_long(&p, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2125,10 +2118,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err && left) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? : -EINVAL; } @@ -2310,9 +2302,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2340,15 +2331,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first = 0) { @@ -2357,9 +2342,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (write) { bool neg; - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); - err = proc_get_long(&kbuf, &left, &val, &neg, + err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2385,10 +2370,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? : -EINVAL; } @@ -2650,34 +2634,27 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - unsigned long page = 0; - char *kbuf; + char *kbuf, *p; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - free_page(page); - return -EFAULT; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), GFP_KERNEL); if (!tmp_bitmap) { - free_page(page); + kfree(kbuf); return -ENOMEM; } - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; bool neg; - err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); if (err) break; @@ -2688,12 +2665,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, val_b = val_a; if (left) { - kbuf++; + p++; left--; } if (c == '-') { - err = proc_get_long(&kbuf, &left, &val_b, + err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); if (err) @@ -2704,16 +2681,16 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, break; } if (left) { - kbuf++; + p++; left--; } } bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); } - free_page(page); + kfree(kbuf); } else { unsigned long bit_a, bit_b = 0; -- cgit v1.2.3 From d07e22597d1d355829b7b18ac19afa912cf758d1 Mon Sep 17 00:00:00 2001 From: Daniel Cashman Date: Thu, 14 Jan 2016 15:19:53 -0800 Subject: mm: mmap: add new /proc tunable for mmap_base ASLR Address Space Layout Randomization (ASLR) provides a barrier to exploitation of user-space processes in the presence of security vulnerabilities by making it more difficult to find desired code/data which could help an attack. This is done by adding a random offset to the location of regions in the process address space, with a greater range of potential offset values corresponding to better protection/a larger search-space for brute force, but also to greater potential for fragmentation. The offset added to the mmap_base address, which provides the basis for the majority of the mappings for a process, is set once on process exec in arch_pick_mmap_layout() and is done via hard-coded per-arch values, which reflect, hopefully, the best compromise for all systems. The trade-off between increased entropy in the offset value generation and the corresponding increased variability in address space fragmentation is not absolute, however, and some platforms may tolerate higher amounts of entropy. This patch introduces both new Kconfig values and a sysctl interface which may be used to change the amount of entropy used for offset generation on a system. The direct motivation for this change was in response to the libstagefright vulnerabilities that affected Android, specifically to information provided by Google's project zero at: http://googleprojectzero.blogspot.com/2015/09/stagefrightened.html The attack presented therein, by Google's project zero, specifically targeted the limited randomness used to generate the offset added to the mmap_base address in order to craft a brute-force-based attack. Concretely, the attack was against the mediaserver process, which was limited to respawning every 5 seconds, on an arm device. The hard-coded 8 bits used resulted in an average expected success rate of defeating the mmap ASLR after just over 10 minutes (128 tries at 5 seconds a piece). With this patch, and an accompanying increase in the entropy value to 16 bits, the same attack would take an average expected time of over 45 hours (32768 tries), which makes it both less feasible and more likely to be noticed. The introduced Kconfig and sysctl options are limited by per-arch minimum and maximum values, the minimum of which was chosen to match the current hard-coded value and the maximum of which was chosen so as to give the greatest flexibility without generating an invalid mmap_base address, generally a 3-4 bits less than the number of bits in the user-space accessible virtual address space. When decided whether or not to change the default value, a system developer should consider that mmap_base address could be placed anywhere up to 2^(value) bits away from the non-randomized location, which would introduce variable-sized areas above and below the mmap_base address such that the maximum vm_area_struct size may be reduced, preventing very large allocations. This patch (of 4): ASLR only uses as few as 8 bits to generate the random offset for the mmap base address on 32 bit architectures. This value was chosen to prevent a poorly chosen value from dividing the address space in such a way as to prevent large allocations. This may not be an issue on all platforms. Allow the specification of a minimum number of bits so that platforms desiring greater ASLR protection may determine where to place the trade-off. Signed-off-by: Daniel Cashman Cc: Russell King Acked-by: Kees Cook Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Don Zickus Cc: Eric W. Biederman Cc: Heinrich Schuchardt Cc: Josh Poimboeuf Cc: Kirill A. Shutemov Cc: Naoya Horiguchi Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Thomas Gleixner Cc: David Rientjes Cc: Mark Salyzyn Cc: Jeff Vander Stoep Cc: Nick Kralevich Cc: Catalin Marinas Cc: Will Deacon Cc: "H. Peter Anvin" Cc: Hector Marco-Gisbert Cc: Borislav Petkov Cc: Ralf Baechle Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 29 +++++++++++++++++++ arch/Kconfig | 68 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 11 ++++++++ kernel/sysctl.c | 22 +++++++++++++++ mm/mmap.c | 12 ++++++++ 5 files changed, 142 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index f72370b440b1..ee763f3d3b52 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -42,6 +42,8 @@ Currently, these files are in /proc/sys/vm: - min_slab_ratio - min_unmapped_ratio - mmap_min_addr +- mmap_rnd_bits +- mmap_rnd_compat_bits - nr_hugepages - nr_overcommit_hugepages - nr_trim_pages (only if CONFIG_MMU=n) @@ -485,6 +487,33 @@ against future potential kernel bugs. ============================================================== +mmap_rnd_bits: + +This value can be used to select the number of bits to use to +determine the random offset to the base address of vma regions +resulting from mmap allocations on architectures which support +tuning address space randomization. This value will be bounded +by the architecture's minimum and maximum supported values. + +This value can be changed after boot using the +/proc/sys/vm/mmap_rnd_bits tunable + +============================================================== + +mmap_rnd_compat_bits: + +This value can be used to select the number of bits to use to +determine the random offset to the base address of vma regions +resulting from mmap allocations for applications run in +compatibility mode on architectures which support tuning address +space randomization. This value will be bounded by the +architecture's minimum and maximum supported values. + +This value can be changed after boot using the +/proc/sys/vm/mmap_rnd_compat_bits tunable + +============================================================== + nr_hugepages Change the minimum size of the hugepage pool. diff --git a/arch/Kconfig b/arch/Kconfig index 4e949e58b192..ba1b626bca00 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -511,6 +511,74 @@ config ARCH_HAS_ELF_RANDOMIZE - arch_mmap_rnd() - arch_randomize_brk() +config HAVE_ARCH_MMAP_RND_BITS + bool + help + An arch should select this symbol if it supports setting a variable + number of bits for use in establishing the base address for mmap + allocations, has MMU enabled and provides values for both: + - ARCH_MMAP_RND_BITS_MIN + - ARCH_MMAP_RND_BITS_MAX + +config ARCH_MMAP_RND_BITS_MIN + int + +config ARCH_MMAP_RND_BITS_MAX + int + +config ARCH_MMAP_RND_BITS_DEFAULT + int + +config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT + default ARCH_MMAP_RND_BITS_MIN + depends on HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to + determine the random offset to the base address of vma regions + resulting from mmap allocations. This value will be bounded + by the architecture's minimum and maximum supported values. + + This value can be changed after boot using the + /proc/sys/vm/mmap_rnd_bits tunable + +config HAVE_ARCH_MMAP_RND_COMPAT_BITS + bool + help + An arch should select this symbol if it supports running applications + in compatibility mode, supports setting a variable number of bits for + use in establishing the base address for mmap allocations, has MMU + enabled and provides values for both: + - ARCH_MMAP_RND_COMPAT_BITS_MIN + - ARCH_MMAP_RND_COMPAT_BITS_MAX + +config ARCH_MMAP_RND_COMPAT_BITS_MIN + int + +config ARCH_MMAP_RND_COMPAT_BITS_MAX + int + +config ARCH_MMAP_RND_COMPAT_BITS_DEFAULT + int + +config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT + default ARCH_MMAP_RND_COMPAT_BITS_MIN + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to + determine the random offset to the base address of vma regions + resulting from mmap allocations for compatible applications This + value will be bounded by the architecture's minimum and maximum + supported values. + + This value can be changed after boot using the + /proc/sys/vm/mmap_rnd_compat_bits tunable + config HAVE_COPY_THREAD_TLS bool help diff --git a/include/linux/mm.h b/include/linux/mm.h index a8ab1fc0e9bc..d396753c0577 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -51,6 +51,17 @@ extern int sysctl_legacy_va_layout; #define sysctl_legacy_va_layout 0 #endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS +extern const int mmap_rnd_bits_min; +extern const int mmap_rnd_bits_max; +extern int mmap_rnd_bits __read_mostly; +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS +extern const int mmap_rnd_compat_bits_min; +extern const int mmap_rnd_compat_bits_max; +extern int mmap_rnd_compat_bits __read_mostly; +#endif + #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5faf89ac9ec0..c810f8afdb7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1568,6 +1568,28 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif { } }; diff --git a/mm/mmap.c b/mm/mmap.c index c311bfd8005b..f32b84ad621a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -58,6 +58,18 @@ #define arch_rebalance_pgtables(addr, len) (addr) #endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS +const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; +const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; +int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS +const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; +const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; +int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; +#endif + + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -- cgit v1.2.3 From 759c01142a5d0f364a462346168a56de28a80f52 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 18 Jan 2016 16:36:09 +0100 Subject: pipe: limit the per-user amount of pages allocated in pipes On no-so-small systems, it is possible for a single process to cause an OOM condition by filling large pipes with data that are never read. A typical process filling 4000 pipes with 1 MB of data will use 4 GB of memory. On small systems it may be tricky to set the pipe max size to prevent this from happening. This patch makes it possible to enforce a per-user soft limit above which new pipes will be limited to a single page, effectively limiting them to 4 kB each, as well as a hard limit above which no new pipes may be created for this user. This has the effect of protecting the system against memory abuse without hurting other users, and still allowing pipes to work correctly though with less data at once. The limit are controlled by two new sysctls : pipe-user-pages-soft, and pipe-user-pages-hard. Both may be disabled by setting them to zero. The default soft limit allows the default number of FDs per process (1024) to create pipes of the default size (64kB), thus reaching a limit of 64MB before starting to create only smaller pipes. With 256 processes limited to 1024 FDs each, this results in 1024*64kB + (256*1024 - 1024) * 4kB = 1084 MB of memory allocated for a user. The hard limit is disabled by default to avoid breaking existing applications that make intensive use of pipes (eg: for splicing). Reported-by: socketpair@gmail.com Reported-by: Tetsuo Handa Mitigates: CVE-2013-4312 (Linux 2.0+) Suggested-by: Linus Torvalds Signed-off-by: Willy Tarreau Signed-off-by: Al Viro --- Documentation/sysctl/fs.txt | 23 ++++++++++++++++++++++ fs/pipe.c | 47 +++++++++++++++++++++++++++++++++++++++++++-- include/linux/pipe_fs_i.h | 4 ++++ include/linux/sched.h | 1 + kernel/sysctl.c | 14 ++++++++++++++ 5 files changed, 87 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 88152f214f48..302b5ed616a6 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/fs: - nr_open - overflowuid - overflowgid +- pipe-user-pages-hard +- pipe-user-pages-soft - protected_hardlinks - protected_symlinks - suid_dumpable @@ -159,6 +161,27 @@ The default is 65534. ============================================================== +pipe-user-pages-hard: + +Maximum total number of pages a non-privileged user may allocate for pipes. +Once this limit is reached, no new pipes may be allocated until usage goes +below the limit again. When set to 0, no limit is applied, which is the default +setting. + +============================================================== + +pipe-user-pages-soft: + +Maximum total number of pages a non-privileged user may allocate for pipes +before the pipe size gets limited to a single page. Once this limit is reached, +new pipes will be limited to a single page in size for this user in order to +limit total memory usage, and trying to increase them using fcntl() will be +denied until usage goes below the limit again. The default value allows to +allocate up to 1024 pipes at their default size. When set to 0, no limit is +applied. + +============================================================== + protected_hardlinks: A long-standing class of security issues is the hardlink-based diff --git a/fs/pipe.c b/fs/pipe.c index 42cf8ddf0e55..ab8dad3ccb6a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576; */ unsigned int pipe_min_size = PAGE_SIZE; +/* Maximum allocatable pages per user. Hard limit is unset by default, soft + * matches default values. + */ +unsigned long pipe_user_pages_hard; +unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } +static void account_pipe_buffers(struct pipe_inode_info *pipe, + unsigned long old, unsigned long new) +{ + atomic_long_add(new - old, &pipe->user->pipe_bufs); +} + +static bool too_many_pipe_buffers_soft(struct user_struct *user) +{ + return pipe_user_pages_soft && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; +} + +static bool too_many_pipe_buffers_hard(struct user_struct *user) +{ + return pipe_user_pages_hard && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; +} + struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); + + if (!too_many_pipe_buffers_hard(user)) { + if (too_many_pipe_buffers_soft(user)) + pipe_bufs = 1; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + } + if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = PIPE_DEF_BUFFERS; + pipe->buffers = pipe_bufs; + pipe->user = user; + account_pipe_buffers(pipe, 0, pipe_bufs); mutex_init(&pipe->mutex); return pipe; } + free_uid(user); kfree(pipe); } @@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; + account_pipe_buffers(pipe, pipe->buffers, 0); + free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) @@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } + account_pipe_buffers(pipe, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; @@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; + } else if ((too_many_pipe_buffers_hard(pipe->user) || + too_many_pipe_buffers_soft(pipe->user)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; } ret = pipe_set_size(pipe, nr_pages); break; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index eb8b8ac6df3c..24f5470d3944 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -42,6 +42,7 @@ struct pipe_buffer { * @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers + * @user: the user who created this pipe **/ struct pipe_inode_info { struct mutex mutex; @@ -57,6 +58,7 @@ struct pipe_inode_info { struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; + struct user_struct *user; }; /* @@ -123,6 +125,8 @@ void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); extern unsigned int pipe_max_size, pipe_min_size; +extern unsigned long pipe_user_pages_hard; +extern unsigned long pipe_user_pages_soft; int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); diff --git a/include/linux/sched.h b/include/linux/sched.h index 61aa9bbea871..1589ddc88e38 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -835,6 +835,7 @@ struct user_struct { #endif unsigned long locked_shm; /* How many pages of mlocked shm ? */ unsigned long unix_inflight; /* How many files in flight in unix sockets */ + atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ #ifdef CONFIG_KEYS struct key *uid_keyring; /* UID specific keyring */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c810f8afdb7f..f6fd236429bd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1757,6 +1757,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; -- cgit v1.2.3 From 41662f5cc55335807d39404371cfcbb1909304c4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Jan 2016 15:00:45 -0800 Subject: sysctl: enable strict writes SYSCTL_WRITES_WARN was added in commit f4aacea2f5d1 ("sysctl: allow for strict write position handling"), and released in v3.16 in August of 2014. Since then I can find only 1 instance of non-zero offset writing[1], and it was fixed immediately in CRIU[2]. As such, it appears safe to flip this to the strict state now. [1] https://www.google.com/search?q="when%20file%20position%20was%20not%200" [2] http://lists.openvz.org/pipermail/criu/2015-April/019819.html Signed-off-by: Kees Cook Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 15 +++++++-------- kernel/sysctl.c | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 73c6b1ef0e84..a93b414672a7 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -825,14 +825,13 @@ via the /proc/sys interface: Each write syscall must fully contain the sysctl value to be written, and multiple writes on the same sysctl file descriptor will rewrite the sysctl value, regardless of file position. - 0 - (default) Same behavior as above, but warn about processes that - perform writes to a sysctl file descriptor when the file position - is not 0. - 1 - Respect file position when writing sysctl strings. Multiple writes - will append to the sysctl value buffer. Anything past the max length - of the sysctl value buffer will be ignored. Writes to numeric sysctl - entries must always be at file position 0 and the value must be - fully contained in the buffer sent in the write syscall. + 0 - Same behavior as above, but warn about processes that perform writes + to a sysctl file descriptor when the file position is not 0. + 1 - (default) Respect file position when writing sysctl strings. Multiple + writes will append to the sysctl value buffer. Anything past the max + length of the sysctl value buffer will be ignored. Writes to numeric + sysctl entries must always be at file position 0 and the value must + be fully contained in the buffer sent in the write syscall. ============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c810f8afdb7f..91420362e0b3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,7 +173,7 @@ extern int no_unaligned_warning; #define SYSCTL_WRITES_WARN 0 #define SYSCTL_WRITES_STRICT 1 -static int sysctl_writes_strict = SYSCTL_WRITES_WARN; +static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3