From f9fe48bece3af2d60e1bad65db4825f5a025dd36 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:40 -0800 Subject: dax: support dirty DAX entries in radix tree Add support for tracking dirty DAX entries in the struct address_space radix tree. This tree is already used for dirty page writeback, and it already supports the use of exceptional (non struct page*) entries. In order to properly track dirty DAX pages we will insert new exceptional entries into the radix tree that represent dirty DAX PTE or PMD pages. These exceptional entries will also contain the writeback addresses for the PTE or PMD faults that we can use at fsync/msync time. There are currently two types of exceptional entries (shmem and shadow) that can be placed into the radix tree, and this adds a third. We rely on the fact that only one type of exceptional entry can be found in a given radix tree based on its usage. This happens for free with DAX vs shmem but we explicitly prevent shadow entries from being added to radix trees for DAX mappings. The only shadow entries that would be generated for DAX radix trees would be to track zero page mappings that were created for holes. These pages would receive minimal benefit from having shadow entries, and the choice to have only one type of exceptional entry in a given radix tree makes the logic simpler both in clear_exceptional_entry() and in the rest of DAX. Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. 
Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/dax.h') diff --git a/include/linux/dax.h b/include/linux/dax.h index b415e521528d..e9d57f680f50 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -36,4 +36,9 @@ static inline bool vma_is_dax(struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } + +static inline bool dax_mapping(struct address_space *mapping) +{ + return mapping->host && IS_DAX(mapping->host); +} #endif -- cgit v1.2.3 From 9973c98ecfda3a1dfcab981665b5f1e39bcde64a Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:47 -0800 Subject: dax: add support for fsync/sync To properly handle fsync/msync in an efficient way DAX needs to track dirty pages so it is able to flush them durably to media on demand. The tracking of dirty pages is done via the radix tree in struct address_space. This radix tree is already used by the page writeback infrastructure for tracking dirty pages associated with an open file, and it already has support for exceptional (non struct page*) entries. We build upon these features to add exceptional entries to the radix tree for DAX dirty PMD or PTE pages at fault time. [dan.j.williams@intel.com: fix dax_pmd_dbg build warning] Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. 
Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 274 +++++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/dax.h | 2 + mm/filemap.c | 6 ++ 3 files changed, 266 insertions(+), 16 deletions(-) (limited to 'include/linux/dax.h') diff --git a/fs/dax.c b/fs/dax.c index 5b84a46201c2..d5f6aca5a4d7 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -324,6 +325,199 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } +#define NO_SECTOR -1 +#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) + +static int dax_radix_entry(struct address_space *mapping, pgoff_t index, + sector_t sector, bool pmd_entry, bool dirty) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + pgoff_t pmd_index = DAX_PMD_INDEX(index); + int type, error = 0; + void *entry; + + WARN_ON_ONCE(pmd_entry && !dirty); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + spin_lock_irq(&mapping->tree_lock); + + entry = radix_tree_lookup(page_tree, pmd_index); + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { + index = pmd_index; + goto dirty; + } + + entry = radix_tree_lookup(page_tree, index); + if (entry) { + type = RADIX_DAX_TYPE(entry); + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && + type != RADIX_DAX_PMD)) { + error = -EIO; + goto unlock; + } + + if (!pmd_entry || type == RADIX_DAX_PMD) + goto dirty; + + /* + * We only insert dirty PMD entries into the radix tree. This + * means we don't need to worry about removing a dirty PTE + * entry and inserting a clean PMD entry, thus reducing the + * range we would flush with a follow-up fsync/msync call. 
+ */ + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + } + + if (sector == NO_SECTOR) { + /* + * This can happen during correct operation if our pfn_mkwrite + * fault raced against a hole punch operation. If this + * happens the pte that was hole punched will have been + * unmapped and the radix tree entry will have been removed by + * the time we are called, but the call will still happen. We + * will return all the way up to wp_pfn_shared(), where the + * pte_same() check will fail, eventually causing page fault + * to be retried by the CPU. + */ + goto unlock; + } + + error = radix_tree_insert(page_tree, index, + RADIX_DAX_ENTRY(sector, pmd_entry)); + if (error) + goto unlock; + + mapping->nrexceptional++; + dirty: + if (dirty) + radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + unlock: + spin_unlock_irq(&mapping->tree_lock); + return error; +} + +static int dax_writeback_one(struct block_device *bdev, + struct address_space *mapping, pgoff_t index, void *entry) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + int type = RADIX_DAX_TYPE(entry); + struct radix_tree_node *node; + struct blk_dax_ctl dax; + void **slot; + int ret = 0; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + + /* another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto unlock; + + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + ret = -EIO; + goto unlock; + } + + dax.sector = RADIX_DAX_SECTOR(entry); + dax.size = (type == RADIX_DAX_PMD ? 
PMD_SIZE : PAGE_SIZE); + spin_unlock_irq(&mapping->tree_lock); + + /* + * We cannot hold tree_lock while calling dax_map_atomic() because it + * eventually calls cond_resched(). + */ + ret = dax_map_atomic(bdev, &dax); + if (ret < 0) + return ret; + + if (WARN_ON_ONCE(ret < dax.size)) { + ret = -EIO; + goto unmap; + } + + wb_cache_pmem(dax.addr, dax.size); + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + unmap: + dax_unmap_atomic(bdev, &dax); + return ret; + + unlock: + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Flush the mapping to the persistent domain within the byte range of [start, + * end]. This is required by data integrity operations to ensure file data is + * on persistent storage prior to completion of the operation. + */ +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end) +{ + struct inode *inode = mapping->host; + struct block_device *bdev = inode->i_sb->s_bdev; + pgoff_t start_index, end_index, pmd_index; + pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + bool done = false; + int i, ret = 0; + void *entry; + + if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) + return -EIO; + + start_index = start >> PAGE_CACHE_SHIFT; + end_index = end >> PAGE_CACHE_SHIFT; + pmd_index = DAX_PMD_INDEX(start_index); + + rcu_read_lock(); + entry = radix_tree_lookup(&mapping->page_tree, pmd_index); + rcu_read_unlock(); + + /* see if the start of our range is covered by a PMD entry */ + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) + start_index = pmd_index; + + tag_pages_for_writeback(mapping, start_index, end_index); + + pagevec_init(&pvec, 0); + while (!done) { + pvec.nr = find_get_entries_tag(mapping, start_index, + PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, + pvec.pages, indices); + + if (pvec.nr == 0) + break; + + for (i = 0; i < pvec.nr; i++) { + if (indices[i] > end_index) { + done = true; + break; + 
} + + ret = dax_writeback_one(bdev, mapping, indices[i], + pvec.pages[i]); + if (ret < 0) + return ret; + } + } + wmb_pmem(); + return 0; +} +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); + static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -363,6 +557,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, } dax_unmap_atomic(bdev, &dax); + error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, + vmf->flags & FAULT_FLAG_WRITE); + if (error) + goto out; + error = vm_insert_mixed(vma, vaddr, dax.pfn); out: @@ -487,6 +686,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, delete_from_page_cache(page); unlock_page(page); page_cache_release(page); + page = NULL; } /* @@ -589,9 +789,9 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, bool write = flags & FAULT_FLAG_WRITE; struct block_device *bdev; pgoff_t size, pgoff; - loff_t lstart, lend; sector_t block; - int result = 0; + int error, result = 0; + bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) @@ -629,10 +829,17 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - if (get_block(inode, block, &bh, write) != 0) + + if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; + + if (!buffer_mapped(&bh) && write) { + if (get_block(inode, block, &bh, 1) != 0) + return VM_FAULT_SIGBUS; + alloc = true; + } + bdev = bh.b_bdev; - i_mmap_lock_read(mapping); /* * If the filesystem isn't willing to tell us the length of a hole, @@ -641,15 +848,20 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, */ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { dax_pmd_dbg(&bh, address, "allocated block too small"); - goto fallback; + return VM_FAULT_FALLBACK; + } + + /* + * If we allocated new storage, 
make sure no process has any + * zero pages covering this hole + */ + if (alloc) { + loff_t lstart = pgoff << PAGE_SHIFT; + loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ + + truncate_pagecache_range(inode, lstart, lend); } - /* make sure no process has any zero pages covering this hole */ - lstart = pgoff << PAGE_SHIFT; - lend = lstart + PMD_SIZE - 1; /* inclusive */ - i_mmap_unlock_read(mapping); - unmap_mapping_range(mapping, lstart, PMD_SIZE, 0); - truncate_inode_pages_range(mapping, lstart, lend); i_mmap_lock_read(mapping); /* @@ -733,6 +945,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } dax_unmap_atomic(bdev, &dax); + /* + * For PTE faults we insert a radix tree entry for reads, and + * leave it clean. Then on the first write we dirty the radix + * tree entry via the dax_pfn_mkwrite() path. This sequence + * allows the dax_pfn_mkwrite() call to be simpler and avoid a + * call into get_block() to translate the pgoff to a sector in + * order to be able to create a new radix tree entry. + * + * The PMD path doesn't have an equivalent to + * dax_pfn_mkwrite(), though, so for a read followed by a + * write we traverse all the way through __dax_pmd_fault() + * twice. This means we can just skip inserting a radix tree + * entry completely on the initial read and just wait until + * the write to insert a dirty entry. 
+ */ + if (write) { + error = dax_radix_entry(mapping, pgoff, dax.sector, + true, true); + if (error) { + dax_pmd_dbg(&bh, address, + "PMD radix insertion failed"); + goto fallback; + } + } + dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: %lx sect: %llx\n", __func__, current->comm, address, @@ -791,15 +1028,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault - * */ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct file *file = vma->vm_file; - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - sb_end_pagefault(sb); + /* + * We pass NO_SECTOR to dax_radix_entry() because we expect that a + * RADIX_DAX_PTE entry already exists in the radix tree from a + * previous call to __dax_fault(). We just want to look up that PTE + * entry using vmf->pgoff and make sure the dirty tag is set. This + * saves us from having to make a call to get_block() here to look + * up the sector. 
+ */ + dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); diff --git a/include/linux/dax.h b/include/linux/dax.h index e9d57f680f50..8204c3dc3800 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -41,4 +41,6 @@ static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); } +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end); #endif diff --git a/mm/filemap.c b/mm/filemap.c index 1e215fc36c83..2e7c8d980d5e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -482,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; + if (dax_mapping(mapping) && mapping->nrexceptional) { + err = dax_writeback_mapping_range(mapping, lstart, lend); + if (err) + return err; + } + if (mapping->nrpages) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); -- cgit v1.2.3 From d1a5f2b4d8a125943dcb6b032fc7eaefc2c78296 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jan 2016 20:25:31 -0800 Subject: block: use DAX for partition table reads Avoid populating pagecache when the block device is in DAX mode. Otherwise these page cache entries collide with the fsync/msync implementation and break data durability guarantees. 
Cc: Jan Kara
Cc: Jeff Moyer
Cc: Christoph Hellwig
Cc: Dave Chinner
Cc: Andrew Morton
Reported-by: Ross Zwisler
Tested-by: Ross Zwisler
Reviewed-by: Matthew Wilcox
Signed-off-by: Dan Williams
---
 block/partition-generic.c | 18 +++++++++++++++---
 fs/dax.c                  | 35 +++++++++++++++++++++++++++++++++++
 include/linux/dax.h       | 11 +++++++++++
 3 files changed, 61 insertions(+), 3 deletions(-)

(limited to 'include/linux/dax.h')

diff --git a/block/partition-generic.c b/block/partition-generic.c
index 746935a5973c..fefd01b496a0 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #include "partitions/check.h"
@@ -550,13 +551,24 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
 	return 0;
 }
 
-unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
+static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+	return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
+			NULL);
+}
+
+unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
+{
 	struct page *page;
 
-	page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
-				 NULL);
+	/* don't populate page cache for dax capable devices */
+	if (IS_DAX(bdev->bd_inode))
+		page = read_dax_sector(bdev, n);
+	else
+		page = read_pagecache_sector(bdev, n);
+
 	if (!IS_ERR(page)) {
 		if (PageError(page))
 			goto fail;
diff --git a/fs/dax.c b/fs/dax.c
index 4fd6b0c5c6b5..e0e9358baf35 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -58,6 +58,41 @@ static void dax_unmap_atomic(struct block_device *bdev,
 	blk_queue_exit(bdev->bd_queue);
 }
 
+/**
+ * read_dax_sector - read the page containing a sector from a DAX device
+ * @bdev: block device to read from
+ * @n: sector to read; rounded down to the first sector of its page
+ *
+ * Copies PAGE_SIZE bytes from the device into a newly allocated page instead
+ * of going through the page cache.
+ *
+ * Return: the new page on success (it is not freed here, so releasing it is
+ * up to the caller), ERR_PTR(-ENOMEM) if the allocation fails, or the
+ * ERR_PTR-encoded error returned by dax_map_atomic().
+ */
+struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+{
+	struct page *page = alloc_pages(GFP_KERNEL, 0);
+	struct blk_dax_ctl dax = {
+		.size = PAGE_SIZE,
+		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
+	};
+	long rc;
+
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	rc = dax_map_atomic(bdev, &dax);
+	if (rc < 0) {
+		/* the page is not handed to the caller on failure */
+		__free_page(page);
+		return ERR_PTR(rc);
+	}
+	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
+	dax_unmap_atomic(bdev, &dax);
+	return page;
+}
+
 /*
  * dax_clear_blocks() is called from within transaction context from XFS,
  * and hence this means the stack from this point must follow GFP_NOFS
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8204c3dc3800..818e45078929 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -14,6 +14,17 @@ int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
 		dax_iodone_t);
 int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
 		dax_iodone_t);
+
+#ifdef CONFIG_FS_DAX
+struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+#else
+static inline struct page *read_dax_sector(struct block_device *bdev,
+		sector_t n)
+{
+	return ERR_PTR(-ENXIO);
+}
+#endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
 				unsigned int flags, get_block_t, dax_iodone_t);
-- 
cgit v1.2.3