From a1d776ee3147cec2a54a645e92eb2e3e2f65a137 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 21 Mar 2012 16:34:12 -0700 Subject: hugetlb: cleanup hugetlb.h Make a couple of small cleanups to linux/include/hugetlb.h. The set_file_hugepages() function, which was not used anywhere is removed, and the hugetlbfs_config and hugetlbfs_inode_info structures with its HUGETLBFS_I helper function are moved into inode.c, the only place they were used. These structures are really linked to the hugetlbfs filesystem specifically not to hugepage mm handling in general, so they belong in the filesystem code not in a generally available header. It would be nice to move the hugetlbfs_sb_info (superblock) structure in there as well, but it's currently needed in a number of places via the hstate_vma() and hstate_inode(). Signed-off-by: David Gibson Cc: Hugh Dickins Cc: Paul Mackerras Cc: Andrew Barry Cc: Mel Gorman Cc: Minchan Kim Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include/linux/hugetlb.h') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d9d6c868b86b..7adc4923e7ac 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -128,15 +128,6 @@ enum { }; #ifdef CONFIG_HUGETLBFS -struct hugetlbfs_config { - uid_t uid; - gid_t gid; - umode_t mode; - long nr_blocks; - long nr_inodes; - struct hstate *hstate; -}; - struct hugetlbfs_sb_info { long max_blocks; /* blocks allowed */ long free_blocks; /* blocks free */ @@ -146,17 +137,6 @@ struct hugetlbfs_sb_info { struct hstate *hstate; }; - -struct hugetlbfs_inode_info { - struct shared_policy policy; - struct inode vfs_inode; -}; - -static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) -{ - return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); -} - static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) { return sb->s_fs_info; @@ -179,14 +159,9 @@ static inline int is_file_hugepages(struct file *file) return 0; } -static inline void set_file_hugepages(struct file *file) -{ - file->f_op = &hugetlbfs_file_operations; -} #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 -#define set_file_hugepages(file) BUG() static inline struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, struct user_struct **user, int creat_flags) { -- cgit v1.2.3 From 90481622d75715bfcb68501280a917dbfe516029 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 21 Mar 2012 16:34:12 -0700 Subject: hugepages: fix use after free bug in "quota" handling hugetlbfs_{get,put}_quota() are badly named. They don't interact with the general quota handling code, and they don't much resemble its behaviour. Rather than being about maintaining limits on on-disk block usage by particular users, they are instead about maintaining limits on in-memory page usage (including anonymous MAP_PRIVATE copied-on-write pages) associated with a particular hugetlbfs filesystem instance. Worse, they work by having callbacks to the hugetlbfs filesystem code from the low-level page handling code, in particular from free_huge_page(). This is a layering violation of itself, but more importantly, if the kernel does a get_user_pages() on hugepages (which can happen from KVM amongst others), then the free_huge_page() can be delayed until after the associated inode has already been freed. If an unmount occurs at the wrong time, even the hugetlbfs superblock where the "quota" limits are stored may have been freed. Andrew Barry proposed a patch to fix this by having hugepages, instead of storing a pointer to their address_space and reaching the superblock from there, had the hugepages store pointers directly to the superblock, bumping the reference count as appropriate to avoid it being freed. Andrew Morton rejected that version, however, on the grounds that it made the existing layering violation worse. This is a reworked version of Andrew's patch, which removes the extra, and some of the existing, layering violation. It works by introducing the concept of a hugepage "subpool" at the lower hugepage mm layer - that is a finite logical pool of hugepages to allocate from. hugetlbfs now creates a subpool for each filesystem instance with a page limit set, and a pointer to the subpool gets added to each allocated hugepage, instead of the address_space pointer used now. The subpool has its own lifetime and is only freed once all pages in it _and_ all other references to it (i.e. superblocks) are gone. subpools are optional - a NULL subpool pointer is taken by the code to mean that no subpool limits are in effect. Previous discussion of this bug found in: "Fix refcounting in hugetlbfs quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or http://marc.info/?l=linux-mm&m=126928970510627&w=1 v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to alloc_huge_page() - since it already takes the vma, it is not necessary. Signed-off-by: Andrew Barry Signed-off-by: David Gibson Cc: Hugh Dickins Cc: Mel Gorman Cc: Minchan Kim Cc: Hillf Danton Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 54 ++++++++----------- include/linux/hugetlb.h | 14 +++-- mm/hugetlb.c | 135 ++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 139 insertions(+), 64 deletions(-) (limited to 'include/linux/hugetlb.h') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4fbd9fccd550..7913e3252167 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -626,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) spin_lock(&sbinfo->stat_lock); /* If no limits set, just report 0 for max/free/used * blocks, like simple_statfs() */ - if (sbinfo->max_blocks >= 0) { - buf->f_blocks = sbinfo->max_blocks; - buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + if (sbinfo->spool) { + long free_pages; + + spin_lock(&sbinfo->spool->lock); + buf->f_blocks = sbinfo->spool->max_hpages; + free_pages = sbinfo->spool->max_hpages + - sbinfo->spool->used_hpages; + buf->f_bavail = buf->f_bfree = free_pages; + spin_unlock(&sbinfo->spool->lock); buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } @@ -644,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb) if (sbi) { sb->s_fs_info = NULL; + + if (sbi->spool) + hugepage_put_subpool(sbi->spool); + kfree(sbi); } } @@ -874,10 +884,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbinfo; sbinfo->hstate = config.hstate; spin_lock_init(&sbinfo->stat_lock); - sbinfo->max_blocks = config.nr_blocks; - sbinfo->free_blocks = config.nr_blocks; sbinfo->max_inodes = config.nr_inodes; sbinfo->free_inodes = config.nr_inodes; + sbinfo->spool = NULL; + if (config.nr_blocks != -1) { + sbinfo->spool = hugepage_new_subpool(config.nr_blocks); + if (!sbinfo->spool) + goto out_free; + } sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = huge_page_size(config.hstate); sb->s_blocksize_bits = huge_page_shift(config.hstate); @@ -896,38 +910,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_root = root; return 0; out_free: + if (sbinfo->spool) + kfree(sbinfo->spool); kfree(sbinfo); return -ENOMEM; } -int hugetlb_get_quota(struct address_space *mapping, long delta) -{ - int ret = 0; - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); - - if (sbinfo->free_blocks > -1) { - spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks - delta >= 0) - sbinfo->free_blocks -= delta; - else - ret = -ENOMEM; - spin_unlock(&sbinfo->stat_lock); - } - - return ret; -} - -void hugetlb_put_quota(struct address_space *mapping, long delta) -{ - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); - - if (sbinfo->free_blocks > -1) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_blocks += delta; - spin_unlock(&sbinfo->stat_lock); - } -} - static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7adc4923e7ac..cf0181738c9e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -14,6 +14,15 @@ struct user_struct; #include #include +struct hugepage_subpool { + spinlock_t lock; + long count; + long max_hpages, used_hpages; +}; + +struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); +void hugepage_put_subpool(struct hugepage_subpool *spool); + int PageHuge(struct page *page); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); @@ -129,12 +138,11 @@ enum { #ifdef CONFIG_HUGETLBFS struct hugetlbfs_sb_info { - long max_blocks; /* blocks allowed */ - long free_blocks; /* blocks free */ long max_inodes; /* inodes allowed */ long free_inodes; /* inodes free */ spinlock_t stat_lock; struct hstate *hstate; + struct hugepage_subpool *spool; }; static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) @@ -146,8 +154,6 @@ extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, struct user_struct **user, int creat_flags); -int hugetlb_get_quota(struct address_space *mapping, long delta); -void hugetlb_put_quota(struct address_space *mapping, long delta); static inline int is_file_hugepages(struct file *file) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b1c314877334..afa057a1d3fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size; */ static DEFINE_SPINLOCK(hugetlb_lock); +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +{ + bool free = (spool->count == 0) && (spool->used_hpages == 0); + + spin_unlock(&spool->lock); + + /* If no pages are used, and no other handles to the subpool + * remain, free the subpool the subpool remain */ + if (free) + kfree(spool); +} + +struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) +{ + struct hugepage_subpool *spool; + + spool = kmalloc(sizeof(*spool), GFP_KERNEL); + if (!spool) + return NULL; + + spin_lock_init(&spool->lock); + spool->count = 1; + spool->max_hpages = nr_blocks; + spool->used_hpages = 0; + + return spool; +} + +void hugepage_put_subpool(struct hugepage_subpool *spool) +{ + spin_lock(&spool->lock); + BUG_ON(!spool->count); + spool->count--; + unlock_or_release_subpool(spool); +} + +static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, + long delta) +{ + int ret = 0; + + if (!spool) + return 0; + + spin_lock(&spool->lock); + if ((spool->used_hpages + delta) <= spool->max_hpages) { + spool->used_hpages += delta; + } else { + ret = -ENOMEM; + } + spin_unlock(&spool->lock); + + return ret; +} + +static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, + long delta) +{ + if (!spool) + return; + + spin_lock(&spool->lock); + spool->used_hpages -= delta; + /* If hugetlbfs_put_super couldn't free spool due to + * an outstanding quota reference, free it now. */ + unlock_or_release_subpool(spool); +} + +static inline struct hugepage_subpool *subpool_inode(struct inode *inode) +{ + return HUGETLBFS_SB(inode->i_sb)->spool; +} + +static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) +{ + return subpool_inode(vma->vm_file->f_dentry->d_inode); +} + /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. @@ -540,9 +618,9 @@ static void free_huge_page(struct page *page) */ struct hstate *h = page_hstate(page); int nid = page_to_nid(page); - struct address_space *mapping; + struct hugepage_subpool *spool = + (struct hugepage_subpool *)page_private(page); - mapping = (struct address_space *) page_private(page); set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); @@ -558,8 +636,7 @@ static void free_huge_page(struct page *page) enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); - if (mapping) - hugetlb_put_quota(mapping, 1); + hugepage_subpool_put_pages(spool, 1); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) @@ -977,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h, /* * Determine if the huge page at addr within the vma has an associated * reservation. Where it does not we will need to logically increase - * reservation and actually increase quota before an allocation can occur. - * Where any new reservation would be required the reservation change is - * prepared, but not committed. Once the page has been quota'd allocated - * an instantiated the change should be committed via vma_commit_reservation. - * No action is required on failure. + * reservation and actually increase subpool usage before an allocation + * can occur. Where any new reservation would be required the + * reservation change is prepared, but not committed. Once the page + * has been allocated from the subpool and instantiated the change should + * be committed via vma_commit_reservation. No action is required on + * failure. */ static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) @@ -1030,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h, static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { + struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; long chg; /* - * Processes that did not create the mapping will have no reserves and - * will not have accounted against quota. Check that the quota can be - * made before satisfying the allocation - * MAP_NORESERVE mappings may also need pages and quota allocated - * if no reserve mapping overlaps. + * Processes that did not create the mapping will have no + * reserves and will not have accounted against subpool + * limit. Check that the subpool limit can be made before + * satisfying the allocation MAP_NORESERVE mappings may also + * need pages and subpool limit allocated allocated if no reserve + * mapping overlaps. */ chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(-VM_FAULT_OOM); if (chg) - if (hugetlb_get_quota(inode->i_mapping, chg)) + if (hugepage_subpool_get_pages(spool, chg)) return ERR_PTR(-VM_FAULT_SIGBUS); spin_lock(&hugetlb_lock); @@ -1057,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, if (!page) { page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { - hugetlb_put_quota(inode->i_mapping, chg); + hugepage_subpool_put_pages(spool, chg); return ERR_PTR(-VM_FAULT_SIGBUS); } } - set_page_private(page, (unsigned long) mapping); + set_page_private(page, (unsigned long)spool); vma_commit_reservation(h, vma, addr); @@ -2083,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *reservations = vma_resv_map(vma); + struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve; unsigned long start; unsigned long end; @@ -2098,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) if (reserve) { hugetlb_acct_memory(h, -reserve); - hugetlb_put_quota(vma->vm_file->f_mapping, reserve); + hugepage_subpool_put_pages(spool, reserve); } } } @@ -2331,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ address = address & huge_page_mask(h); pgoff = vma_hugecache_offset(h, vma, address); - mapping = (struct address_space *)page_private(page); + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; /* * Take the mapping lock for the duration of the table walk. As @@ -2884,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode, { long ret, chg; struct hstate *h = hstate_inode(inode); + struct hugepage_subpool *spool = subpool_inode(inode); /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page - * and filesystem quota without using reserves + * without using reserves */ if (vm_flags & VM_NORESERVE) return 0; @@ -2915,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode, if (chg < 0) return chg; - /* There must be enough filesystem quota for the mapping */ - if (hugetlb_get_quota(inode->i_mapping, chg)) + /* There must be enough pages in the subpool for the mapping */ + if (hugepage_subpool_get_pages(spool, chg)) return -ENOSPC; /* * Check enough hugepages are available for the reservation. - * Hand back the quota if there are not + * Hand the pages back to the subpool if there are not */ ret = hugetlb_acct_memory(h, chg); if (ret < 0) { - hugetlb_put_quota(inode->i_mapping, chg); + hugepage_subpool_put_pages(spool, chg); return ret; } @@ -2949,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { struct hstate *h = hstate_inode(inode); long chg = region_truncate(&inode->i_mapping->private_list, offset); + struct hugepage_subpool *spool = subpool_inode(inode); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); - hugetlb_put_quota(inode->i_mapping, (chg - freed)); + hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -(chg - freed)); } -- cgit v1.2.3 From 40716e29243de46720e5773797791466c28904ec Mon Sep 17 00:00:00 2001 From: Steven Truelove Date: Wed, 21 Mar 2012 16:34:14 -0700 Subject: hugetlbfs: fix alignment of huge page requests When calling shmget() with SHM_HUGETLB, shmget aligns the request size to PAGE_SIZE, but this is not sufficient. Modify hugetlb_file_setup() to align requests to the huge page size, and to accept an address argument so that all alignment checks can be performed in hugetlb_file_setup(), rather than in its callers. Change newseg() and mmap_pgoff() to match the new prototype and eliminate a now redundant alignment check. [akpm@linux-foundation.org: fix build] Signed-off-by: Steven Truelove Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 14 +++++++++----- include/linux/hugetlb.h | 6 ++++-- ipc/shm.c | 2 +- mm/mmap.c | 6 +++--- 4 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux/hugetlb.h') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 79408159a001..631329f3de63 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -935,8 +935,8 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } -struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, +struct file *hugetlb_file_setup(const char *name, unsigned long addr, + size_t size, vm_flags_t acctflag, struct user_struct **user, int creat_flags) { int error = -ENOMEM; @@ -945,6 +945,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, struct path path; struct dentry *root; struct qstr quick_string; + struct hstate *hstate; + unsigned long num_pages; *user = NULL; if (!hugetlbfs_vfsmount) @@ -978,10 +980,12 @@ struct file *hugetlb_file_setup(const char *name, size_t size, if (!inode) goto out_dentry; + hstate = hstate_inode(inode); + size += addr & ~huge_page_mask(hstate); + num_pages = ALIGN(size, huge_page_size(hstate)) >> + huge_page_shift(hstate); error = -ENOMEM; - if (hugetlb_reserve_pages(inode, 0, - size >> huge_page_shift(hstate_inode(inode)), NULL, - acctflag)) + if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) goto out_inode; d_instantiate(path.dentry, inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cf0181738c9e..000837e126e6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -152,7 +152,8 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, +struct file *hugetlb_file_setup(const char *name, unsigned long addr, + size_t size, vm_flags_t acct, struct user_struct **user, int creat_flags); static inline int is_file_hugepages(struct file *file) @@ -168,7 +169,8 @@ static inline int is_file_hugepages(struct file *file) #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 -static inline struct file *hugetlb_file_setup(const char *name, size_t size, +static inline struct file * +hugetlb_file_setup(const char *name, unsigned long addr, size_t size, vm_flags_t acctflag, struct user_struct **user, int creat_flags) { return ERR_PTR(-ENOSYS); diff --git a/ipc/shm.c b/ipc/shm.c index b76be5bda6c2..406c5b208193 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -482,7 +482,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) /* hugetlb_file_setup applies strict accounting */ if (shmflg & SHM_NORESERVE) acctflag = VM_NORESERVE; - file = hugetlb_file_setup(name, size, acctflag, + file = hugetlb_file_setup(name, 0, size, acctflag, &shp->mlock_user, HUGETLB_SHMFS_INODE); } else { /* diff --git a/mm/mmap.c b/mm/mmap.c index 9e0c0de2e7e3..a19cc271e794 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1099,9 +1099,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, * A dummy user value is used because we are not locking * memory so no accounting is necessary */ - len = ALIGN(len, huge_page_size(&default_hstate)); - file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, + VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE); if (IS_ERR(file)) return PTR_ERR(file); } -- cgit v1.2.3