From 502717f4e112b18d9c37753a32f675bec9f2838b Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Wed, 11 Oct 2006 01:20:46 -0700 Subject: [PATCH] hugetlb: fix linked list corruption in unmap_hugepage_range() commit fe1668ae5bf0145014c71797febd9ad5670d5d05 causes kernel to oops with libhugetlbfs test suite. The problem is that hugetlb pages can be shared by multiple mappings. Multiple threads can fight over page->lru in the unmap path and bad things happen. We now serialize __unmap_hugepage_range to void concurrent linked list manipulation. Such serialization is also needed for shared page table page on hugetlb area. This patch will fixed the bug and also serve as a prepatch for shared page table. Signed-off-by: Ken Chen Cc: Hugh Dickins Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5e03b2f67b93..4ee3f006b861 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -293,7 +293,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) if (h_vm_pgoff >= h_pgoff) v_offset = 0; - unmap_hugepage_range(vma, + __unmap_hugepage_range(vma, vma->vm_start + v_offset, vma->vm_end); } } -- cgit v1.2.3 From b9d7e6ae82da124dc9c579fe1061264ef2a69407 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 28 Oct 2006 10:38:41 -0700 Subject: [PATCH] hugetlb: fix size=4G parsing On 32-bit machines, mount -t hugetlbfs -o size=4G gave a 0GB filesystem, size=5G gave a 1GB filesystem etc: there's no point in masking size with HPAGE_MASK just before shifting its lower bits away, and since HPAGE_MASK is a UL, that removed all the higher bits of the unsigned long long size. Signed-off-by: Hugh Dickins Cc: Adam Litke Cc: David Gibson Cc: "Chen, Kenneth W" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4ee3f006b861..0b23b963bb44 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -624,7 +624,6 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) do_div(size, 100); rest++; } - size &= HPAGE_MASK; pconfig->nr_blocks = (size >> HPAGE_SHIFT); value = rest; } else if (!strcmp(opt,"nr_inodes")) { -- cgit v1.2.3 From 856fc29505556cf263f3dcda2533cf3766c14ab6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 28 Oct 2006 10:38:43 -0700 Subject: [PATCH] hugetlb: fix prio_tree unit hugetlb_vmtruncate_list was misconverted to prio_tree: its prio_tree is in units of PAGE_SIZE (PAGE_CACHE_SIZE) like any other, not HPAGE_SIZE (whereas its radix_tree is kept in units of HPAGE_SIZE, otherwise slots would be absurdly sparse). At first I thought the error benign, just calling __unmap_hugepage_range on more vmas than necessary; but on 32-bit machines, when the prio_tree is searched correctly, it happens to ensure the v_offset calculation won't overflow. As it stood, when truncating at or beyond 4GB, it was liable to discard pages COWed from lower offsets; or even to clear pmd entries of preceding vmas, triggering exit_mmap's BUG_ON(nr_ptes). Signed-off-by: Hugh Dickins Cc: Adam Litke Cc: David Gibson Cc: "Chen, Kenneth W" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0b23b963bb44..0bea6a619e10 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -271,26 +271,24 @@ static void hugetlbfs_drop_inode(struct inode *inode) hugetlbfs_forget_inode(inode); } -/* - * h_pgoff is in HPAGE_SIZE units. - * vma->vm_pgoff is in PAGE_SIZE units. - */ static inline void -hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) +hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) { struct vm_area_struct *vma; struct prio_tree_iter iter; - vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) { - unsigned long h_vm_pgoff; + vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { unsigned long v_offset; - h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); - v_offset = (h_pgoff - h_vm_pgoff) << HPAGE_SHIFT; /* - * Is this VMA fully outside the truncation point? + * Can the expression below overflow on 32-bit arches? + * No, because the prio_tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. */ - if (h_vm_pgoff >= h_pgoff) + if (vma->vm_pgoff < pgoff) + v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; + else v_offset = 0; __unmap_hugepage_range(vma, @@ -303,14 +301,14 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) */ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) { - unsigned long pgoff; + pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; if (offset > inode->i_size) return -EINVAL; BUG_ON(offset & ~HPAGE_MASK); - pgoff = offset >> HPAGE_SHIFT; + pgoff = offset >> PAGE_SHIFT; inode->i_size = offset; spin_lock(&mapping->i_mmap_lock); -- cgit v1.2.3