diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2015-02-03 17:44:37 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2015-02-03 17:44:37 +1100 |
commit | 0519178514c69bd15a14b8a81002b16db5f71dad (patch) | |
tree | c516fd3148e9aa323ffbd5a7545b550e28a45320 | |
parent | 68a9462de925abe75a9e347d5af9d12a60a314a1 (diff) | |
parent | 97054281cad8234429c88eb82ecc22eaaf6d0ef5 (diff) |
Merge branch 'akpm/master'
60 files changed, 1786 insertions, 921 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index d773d5697cf5..3187a18af6da 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -53,3 +53,11 @@ stable_pages_required (read-only) If set, the backing device requires that all pages comprising a write request must not be changed until writeout is complete. + +strictlimit (read-write) + + Forces per-BDI checks for the share of given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for given relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index ac28149aede4..9922939e7d99 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -34,6 +34,9 @@ configfs/ - directory containing configfs documentation and example code. cramfs.txt - info on the cram filesystem for small storage (ROMs etc). +dax.txt + - info on avoiding the page cache for files stored on CPU-addressable + storage devices. debugfs.txt - info on the debugfs filesystem. devpts.txt @@ -154,5 +157,3 @@ xfs-self-describing-metadata.txt - info on XFS Self Describing Metadata. xfs.txt - info and mount options for the XFS filesystem. -xip.txt - - info on execute-in-place for file mappings. diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index b30753cbf431..2ca3d17eee56 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -199,8 +199,6 @@ prototypes: int (*releasepage) (struct page *, int); void (*freepage)(struct page *); int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); - int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, - unsigned long *); int (*migratepage)(struct address_space *, struct page *, struct page *); int (*launder_page)(struct page *); int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); @@ -225,7 +223,6 @@ invalidatepage: yes releasepage: yes freepage: yes direct_IO: -get_xip_mem: maybe migratepage: yes (both) launder_page: yes is_partially_uptodate: yes diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt new file mode 100644 index 000000000000..be376d91d058 --- /dev/null +++ b/Documentation/filesystems/dax.txt @@ -0,0 +1,91 @@ +Direct Access for files +----------------------- + +Motivation +---------- + +The page cache is usually used to buffer reads and writes to files. +It is also used to provide the pages which are mapped into userspace +by a call to mmap. + +For block devices that are memory-like, the page cache pages would be +unnecessary copies of the original storage. The DAX code removes the +extra copy by performing reads and writes directly to the storage device. +For file mappings, the storage device is mapped directly into userspace. + + +Usage +----- + +If you have a block device which supports DAX, you can make a filesystem +on it as usual. When mounting it, use the -o dax option manually +or add 'dax' to the options in /etc/fstab. + + +Implementation Tips for Block Driver Writers +-------------------------------------------- + +To support DAX in your block driver, implement the 'direct_access' +block device operation. It is used to translate the sector number +(expressed in units of 512-byte sectors) to a page frame number (pfn) +that identifies the physical page for the memory. It also returns a +kernel virtual address that can be used to access the memory. + +The direct_access method takes a 'size' parameter that indicates the +number of bytes being requested. The function should return the number +of bytes that can be contiguously accessed at that offset. It may also +return a negative errno if an error occurs. + +In order to support this method, the storage must be byte-accessible by +the CPU at all times. If your device uses paging techniques to expose +a large amount of memory through a smaller window, then you cannot +implement direct_access. Equally, if your device can occasionally +stall the CPU for an extended period, you should also not attempt to +implement direct_access. + +These block devices may be used for inspiration: +- axonram: Axon DDR2 device driver +- brd: RAM backed block device driver +- dcssblk: s390 dcss block device driver + + +Implementation Tips for Filesystem Writers +------------------------------------------ + +Filesystem support consists of +- adding support to mark inodes as being DAX by setting the S_DAX flag in + i_flags +- implementing the direct_IO address space operation, and calling + dax_do_io() instead of blockdev_direct_IO() if S_DAX is set +- implementing an mmap file operation for DAX files which sets the + VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers + for fault and page_mkwrite (which should probably call dax_fault() and + dax_mkwrite(), passing the appropriate get_block() callback) +- calling dax_truncate_page() instead of block_truncate_page() for DAX files +- calling dax_zero_page_range() instead of zero_user() for DAX files +- ensuring that there is sufficient locking between reads, writes, + truncates and page faults + +The get_block() callback passed to the DAX functions may return +uninitialised extents. If it does, it must ensure that simultaneous +calls to get_block() (for example by a page-fault racing with a read() +or a write()) work correctly. + +These filesystems may be used for inspiration: +- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt +- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt + + +Shortcomings +------------ + +Even if the kernel or its modules are stored on a filesystem that supports +DAX on a block device that supports DAX, they will still be copied into RAM. + +Calling get_user_pages() on a range of user memory that has been mmaped +from a DAX file will fail as there are no 'struct page' to describe +those pages. This problem is being worked on. That means that O_DIRECT +reads/writes to those memory ranges from a non-DAX file will fail (note +that O_DIRECT reads/writes _of a DAX file_ do work, it is the memory +that is being accessed that is key here). Other things that will not +work include RDMA, sendfile() and splice(). diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt index 67639f905f10..b9714569e472 100644 --- a/Documentation/filesystems/ext2.txt +++ b/Documentation/filesystems/ext2.txt @@ -20,6 +20,9 @@ minixdf Makes `df' act like Minix. check=none, nocheck (*) Don't do extra checking of bitmaps on mount (check=normal and check=strict options removed) +dax Use direct access (no page cache). See + Documentation/filesystems/dax.txt. + debug Extra debugging information is sent to the kernel syslog. Useful for developers. @@ -56,8 +59,6 @@ noacl Don't support POSIX ACLs. nobh Do not attach buffer_heads to file pagecache. -xip Use execute in place (no caching) if possible - grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 919a3293aaa4..6c0108eb0137 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -386,6 +386,10 @@ max_dir_size_kb=n This limits the size of directories so that any i_version Enable 64-bit inode version support. This option is off by default. +dax Use direct access (no page cache). See + Documentation/filesystems/dax.txt. Note that + this option is incompatible with data=journal. + Data Mode ========= There are 3 different data modes: diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 43ce0507ee25..966b22829f3b 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -591,8 +591,6 @@ struct address_space_operations { int (*releasepage) (struct page *, int); void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); - struct page* (*get_xip_page)(struct address_space *, sector_t, - int); /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); int (*launder_page) (struct page *); @@ -748,11 +746,6 @@ struct address_space_operations { and transfer data directly between the storage and the application's address space. - get_xip_page: called by the VM to translate a block number to a page. - The page is valid until the corresponding filesystem is unmounted. - Filesystems that want to use execute-in-place (XIP) need to implement - it. An example implementation can be found in fs/ext2/xip.c. - migrate_page: This is used to compact the physical memory usage. If the VM wants to relocate a page (maybe off a memory card that is signalling imminent failure) it will pass a new page diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt deleted file mode 100644 index b77472949ede..000000000000 --- a/Documentation/filesystems/xip.txt +++ /dev/null @@ -1,71 +0,0 @@ -Execute-in-place for file mappings ----------------------------------- - -Motivation ----------- -File mappings are performed by mapping page cache pages to userspace. In -addition, read&write type file operations also transfer data from/to the page -cache. - -For memory backed storage devices that use the block device interface, the page -cache pages are in fact copies of the original storage. Various approaches -exist to work around the need for an extra copy. The ramdisk driver for example -does read the data into the page cache, keeps a reference, and discards the -original data behind later on. - -Execute-in-place solves this issue the other way around: instead of keeping -data in the page cache, the need to have a page cache copy is eliminated -completely. With execute-in-place, read&write type operations are performed -directly from/to the memory backed storage device. For file mappings, the -storage device itself is mapped directly into userspace. - -This implementation was initially written for shared memory segments between -different virtual machines on s390 hardware to allow multiple machines to -share the same binaries and libraries. - -Implementation --------------- -Execute-in-place is implemented in three steps: block device operation, -address space operation, and file operations. - -A block device operation named direct_access is used to translate the -block device sector number to a page frame number (pfn) that identifies -the physical page for the memory. It also returns a kernel virtual -address that can be used to access the memory. - -The direct_access method takes a 'size' parameter that indicates the -number of bytes being requested. The function should return the number -of bytes that can be contiguously accessed at that offset. It may also -return a negative errno if an error occurs. - -The block device operation is optional, these block devices support it as of -today: -- dcssblk: s390 dcss block device driver - -An address space operation named get_xip_mem is used to retrieve references -to a page frame number and a kernel address. To obtain these values a reference -to an address_space is provided. This function assigns values to the kmem and -pfn parameters. The third argument indicates whether the function should allocate -blocks if needed. - -This address space operation is mutually exclusive with readpage&writepage that -do page cache read/write operations. -The following filesystems support it as of today: -- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt - -A set of file operations that do utilize get_xip_page can be found in -mm/filemap_xip.c . The following file operation implementations are provided: -- aio_read/aio_write -- readv/writev -- sendfile - -The generic file operations do_sync_read/do_sync_write can be used to implement -classic synchronous IO calls. - -Shortcomings ------------- -This implementation is limited to storage devices that are cpu addressable at -all times (no highmem or such). It works well on rom/ram, but enhancements are -needed to make it work with flash in read+write mode. -Putting the Linux kernel and/or its modules on a xip filesystem does not mean -they are not copied. diff --git a/MAINTAINERS b/MAINTAINERS index daa05d3ad25b..089039bd4956 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -34,7 +34,7 @@ trivial patch so apply some common sense. generalized kernel feature ready for next time. PLEASE check your patch with the automated style checker - (scripts/checkpatch.pl) to catch trival style violations. + (scripts/checkpatch.pl) to catch trivial style violations. See Documentation/CodingStyle for guidance here. PLEASE CC: the maintainers and mailing lists that are generated @@ -3151,6 +3151,12 @@ L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/busses/i2c-diolan-u2c.c +DIRECT ACCESS (DAX) +M: Matthew Wilcox <willy@linux.intel.com> +L: linux-fsdevel@vger.kernel.org +S: Supported +F: fs/dax.c + DIRECTORY NOTIFICATION (DNOTIFY) M: Eric Paris <eparis@parisplace.org> S: Maintained diff --git a/arch/arm/boot/dts/zynq-parallella.dts b/arch/arm/boot/dts/zynq-parallella.dts index ab1dc0a56cdd..174571232ea5 100644 --- a/arch/arm/boot/dts/zynq-parallella.dts +++ b/arch/arm/boot/dts/zynq-parallella.dts @@ -58,7 +58,7 @@ status = "okay"; isl9305: isl9305@68 { - compatible = "isl,isl9305"; + compatible = "isil,isl9305"; reg = <0x68>; regulators { diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 014a1cfc41c5..1b8094d4d7af 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -393,14 +393,15 @@ config BLK_DEV_RAM_SIZE The default value is 4096 kilobytes. Only change this if you know what you are doing. -config BLK_DEV_XIP - bool "Support XIP filesystems on RAM block device" - depends on BLK_DEV_RAM +config BLK_DEV_RAM_DAX + bool "Support Direct Access (DAX) to RAM block devices" + depends on BLK_DEV_RAM && FS_DAX default n help - Support XIP filesystems (such as ext2 with XIP support on) on - top of block ram device. This will slightly enlarge the kernel, and - will prevent RAM block device backing store memory from being + Support filesystems using DAX to access RAM block devices. This + avoids double-buffering data in the page cache before copying it + to the block device. Answering Y will slightly enlarge the kernel, + and will prevent RAM block device backing store memory from being allocated from highmem (only a problem for highmem systems). config CDROM_PKTCDVD diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c01b921b1b4a..64ab4951e9d6 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -97,13 +97,13 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) * Must use NOIO because we don't want to recurse back into the * block or filesystem layers from page reclaim. * - * Cannot support XIP and highmem, because our ->direct_access - * routine for XIP must return memory that is always addressable. - * If XIP was reworked to use pfns and kmap throughout, this + * Cannot support DAX and highmem, because our ->direct_access + * routine for DAX must return memory that is always addressable. + * If DAX was reworked to use pfns and kmap throughout, this * restriction might be able to be lifted. */ gfp_flags = GFP_NOIO | __GFP_ZERO; -#ifndef CONFIG_BLK_DEV_XIP +#ifndef CONFIG_BLK_DEV_RAM_DAX gfp_flags |= __GFP_HIGHMEM; #endif page = alloc_page(gfp_flags); @@ -369,7 +369,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, return err; } -#ifdef CONFIG_BLK_DEV_XIP +#ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, void **kaddr, unsigned long *pfn, long size) { @@ -390,6 +390,8 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector, */ return PAGE_SIZE; } +#else +#define brd_direct_access NULL #endif static int brd_ioctl(struct block_device *bdev, fmode_t mode, @@ -430,9 +432,7 @@ static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .rw_page = brd_rw_page, .ioctl = brd_ioctl, -#ifdef CONFIG_BLK_DEV_XIP .direct_access = brd_direct_access, -#endif }; /* diff --git a/drivers/gpio/gpio-zevio.c b/drivers/gpio/gpio-zevio.c index 6f02d7c4cc57..6e4fb2a470f8 100644 --- a/drivers/gpio/gpio-zevio.c +++ b/drivers/gpio/gpio-zevio.c @@ -18,6 +18,10 @@ #include <linux/slab.h> #include <linux/gpio.h> +#ifndef IOMEM +#define IOMEM(x) ((void __force __iomem *)(x)) +#endif + /* * Memory layout: * This chip has four gpio sections, each controls 8 GPIOs. diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c index ee3ba7e6b45e..f9b082784b90 100644 --- a/drivers/rtc/rtc-isl12022.c +++ b/drivers/rtc/rtc-isl12022.c @@ -275,7 +275,8 @@ static int isl12022_probe(struct i2c_client *client, #ifdef CONFIG_OF static const struct of_device_id isl12022_dt_match[] = { - { .compatible = "isl,isl12022" }, + { .compatible = "isl,isl12022" }, /* for backward compat., don't use */ + { .compatible = "isil,isl12022" }, { }, }; #endif diff --git a/drivers/rtc/rtc-isl12057.c b/drivers/rtc/rtc-isl12057.c index b8f862953f7f..da818d3337ce 100644 --- a/drivers/rtc/rtc-isl12057.c +++ b/drivers/rtc/rtc-isl12057.c @@ -644,7 +644,8 @@ static SIMPLE_DEV_PM_OPS(isl12057_rtc_pm_ops, isl12057_rtc_suspend, #ifdef CONFIG_OF static const struct of_device_id isl12057_dt_match[] = { - { .compatible = "isl,isl12057" }, + { .compatible = "isl,isl12057" }, /* for backward compat., don't use */ + { .compatible = "isil,isl12057" }, { }, }; #endif diff --git a/drivers/staging/iio/light/isl29028.c b/drivers/staging/iio/light/isl29028.c index e969107ddb47..6440e3b293ca 100644 --- a/drivers/staging/iio/light/isl29028.c +++ b/drivers/staging/iio/light/isl29028.c @@ -537,8 +537,8 @@ static const struct i2c_device_id isl29028_id[] = { MODULE_DEVICE_TABLE(i2c, isl29028_id); static const struct of_device_id isl29028_of_match[] = { - { .compatible = "isl,isl29028", }, - { .compatible = "isil,isl29028", },/* deprecated, don't use */ + { .compatible = "isl,isl29028", }, /* for backward compat., don't use */ + { .compatible = "isil,isl29028", }, { }, }; MODULE_DEVICE_TABLE(of, isl29028_of_match); diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index 47249a30eae3..20f766afa4c7 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -91,8 +91,7 @@ static struct w1_master *w1_alloc_dev(u32 id, int slave_count, int slave_ttl, err = device_register(&dev->dev); if (err) { pr_err("Failed to register master device. err=%d\n", err); - memset(dev, 0, sizeof(struct w1_master)); - kfree(dev); + put_device(&dev->dev); dev = NULL; } diff --git a/fs/Kconfig b/fs/Kconfig index 664991afe0c0..560971c20f12 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -13,13 +13,6 @@ if BLOCK source "fs/ext2/Kconfig" source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" - -config FS_XIP -# execute in place - bool - depends on EXT2_FS_XIP - default y - source "fs/jbd/Kconfig" source "fs/jbd2/Kconfig" @@ -40,6 +33,20 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" +config FS_DAX + bool "Direct Access (DAX) support" + depends on MMU + help + Direct Access (DAX) can be used on memory-backed block devices. + If the block device supports DAX and the filesystem supports DAX, + then you can avoid using the pagecache to buffer I/Os. Turning + on this option will compile in support for DAX; you will need to + mount the filesystem using the -o dax option. + + If you do not have a block device that is capable of using this, + or if unsure, say N. Saying Y will increase the size of the kernel + by about 5kB. + endif # BLOCK # Posix ACL utility routines diff --git a/fs/Makefile b/fs/Makefile index bedff48e8fdc..0f4635f7c49c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/dax.c b/fs/dax.c new file mode 100644 index 000000000000..ed1619ec6537 --- /dev/null +++ b/fs/dax.c @@ -0,0 +1,534 @@ +/* + * fs/dax.c - Direct Access filesystem code + * Copyright (c) 2013-2014 Intel Corporation + * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> + * Author: Ross Zwisler <ross.zwisler@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/atomic.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/highmem.h> +#include <linux/memcontrol.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/uio.h> +#include <linux/vmstat.h> + +int dax_clear_blocks(struct inode *inode, sector_t block, long size) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + sector_t sector = block << (inode->i_blkbits - 9); + + might_sleep(); + do { + void *addr; + unsigned long pfn; + long count; + + count = bdev_direct_access(bdev, sector, &addr, &pfn, size); + if (count < 0) + return count; + BUG_ON(size < count); + while (count > 0) { + unsigned pgsz = PAGE_SIZE - offset_in_page(addr); + if (pgsz > count) + pgsz = count; + if (pgsz < PAGE_SIZE) + memset(addr, 0, pgsz); + else + clear_page(addr); + addr += pgsz; + size -= pgsz; + count -= pgsz; + BUG_ON(pgsz & 511); + sector += pgsz / 512; + cond_resched(); + } + } while (size); + + return 0; +} +EXPORT_SYMBOL_GPL(dax_clear_blocks); + +static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +{ + unsigned long pfn; + sector_t sector = bh->b_blocknr << (blkbits - 9); + return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); +} + +static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, + loff_t end) +{ + loff_t final = end - pos + first; /* The final byte of the buffer */ + + if (first > 0) + memset(addr, 0, first); + if (final < size) + memset(addr + final, 0, size - final); +} + +static bool buffer_written(struct buffer_head *bh) +{ + return buffer_mapped(bh) && !buffer_unwritten(bh); +} + +/* + * When ext4 encounters a hole, it returns without modifying the buffer_head + * which means that we can't trust b_size. To cope with this, we set b_state + * to 0 before calling get_block and, if any bit is set, we know we can trust + * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is + * and would save us time calling get_block repeatedly. + */ +static bool buffer_size_valid(struct buffer_head *bh) +{ + return bh->b_state != 0; +} + +static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, + loff_t start, loff_t end, get_block_t get_block, + struct buffer_head *bh) +{ + ssize_t retval = 0; + loff_t pos = start; + loff_t max = start; + loff_t bh_max = start; + void *addr; + bool hole = false; + + if (rw != WRITE) + end = min(end, i_size_read(inode)); + + while (pos < end) { + unsigned len; + if (pos == max) { + unsigned blkbits = inode->i_blkbits; + sector_t block = pos >> blkbits; + unsigned first = pos - (block << blkbits); + long size; + + if (pos == bh_max) { + bh->b_size = PAGE_ALIGN(end - pos); + bh->b_state = 0; + retval = get_block(inode, block, bh, + rw == WRITE); + if (retval) + break; + if (!buffer_size_valid(bh)) + bh->b_size = 1 << blkbits; + bh_max = pos - first + bh->b_size; + } else { + unsigned done = bh->b_size - + (bh_max - (pos - first)); + bh->b_blocknr += done >> blkbits; + bh->b_size -= done; + } + + hole = (rw != WRITE) && !buffer_written(bh); + if (hole) { + addr = NULL; + size = bh->b_size - first; + } else { + retval = dax_get_addr(bh, &addr, blkbits); + if (retval < 0) + break; + if (buffer_unwritten(bh) || buffer_new(bh)) + dax_new_buf(addr, retval, first, pos, + end); + addr += first; + size = retval - first; + } + max = min(pos + size, end); + } + + if (rw == WRITE) + len = copy_from_iter(addr, max - pos, iter); + else if (!hole) + len = copy_to_iter(addr, max - pos, iter); + else + len = iov_iter_zero(max - pos, iter); + + if (!len) + break; + + pos += len; + addr += len; + } + + return (pos == start) ? retval : pos - start; +} + +/** + * dax_do_io - Perform I/O to a DAX file + * @rw: READ to read or WRITE to write + * @iocb: The control block for this I/O + * @inode: The file which the I/O is directed at + * @iter: The addresses to do I/O from or to + * @pos: The file offset where the I/O starts + * @get_block: The filesystem method used to translate file offsets to blocks + * @end_io: A filesystem callback for I/O completion + * @flags: See below + * + * This function uses the same locking scheme as do_blockdev_direct_IO: + * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the + * caller for writes. For reads, we take and release the i_mutex ourselves. + * If DIO_LOCKING is not set, the filesystem takes care of its own locking. + * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O + * is in progress. + */ +ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, + struct iov_iter *iter, loff_t pos, + get_block_t get_block, dio_iodone_t end_io, int flags) +{ + struct buffer_head bh; + ssize_t retval = -EINVAL; + loff_t end = pos + iov_iter_count(iter); + + memset(&bh, 0, sizeof(bh)); + + if ((flags & DIO_LOCKING) && (rw == READ)) { + struct address_space *mapping = inode->i_mapping; + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, pos, end - 1); + if (retval) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + + /* Protects against truncate */ + atomic_inc(&inode->i_dio_count); + + retval = dax_io(rw, inode, iter, pos, end, get_block, &bh); + + if ((flags & DIO_LOCKING) && (rw == READ)) + mutex_unlock(&inode->i_mutex); + + if ((retval > 0) && end_io) + end_io(iocb, pos, retval, bh.b_private); + + inode_dio_done(inode); + out: + return retval; +} +EXPORT_SYMBOL_GPL(dax_do_io); + +/* + * The user has performed a load from a hole in the file. Allocating + * a new page in the file would cause excessive storage usage for + * workloads with sparse files. We allocate a page cache page instead. + * We'll kick it out of the page cache if it's ever written to, + * otherwise it will simply fall out of the page cache under memory + * pressure without ever having been dirtied. + */ +static int dax_load_hole(struct address_space *mapping, struct page *page, + struct vm_fault *vmf) +{ + unsigned long size; + struct inode *inode = mapping->host; + if (!page) + page = find_or_create_page(mapping, vmf->pgoff, + GFP_KERNEL | __GFP_ZERO); + if (!page) + return VM_FAULT_OOM; + /* Recheck i_size under page lock to avoid truncate race */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) { + unlock_page(page); + page_cache_release(page); + return VM_FAULT_SIGBUS; + } + + vmf->page = page; + return VM_FAULT_LOCKED; +} + +static int copy_user_bh(struct page *to, struct buffer_head *bh, + unsigned blkbits, unsigned long vaddr) +{ + void *vfrom, *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) + return -EIO; + vto = kmap_atomic(to); + copy_user_page(vto, vfrom, vaddr, to); + kunmap_atomic(vto); + return 0; +} + +static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct address_space *mapping = inode->i_mapping; + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + unsigned long vaddr = (unsigned long)vmf->virtual_address; + void *addr; + unsigned long pfn; + pgoff_t size; + int error; + + i_mmap_lock_read(mapping); + + /* + * Check truncate didn't happen while we were allocating a block. + * If it did, this block may or may not be still allocated to the + * file. We can't tell the filesystem to free it because we can't + * take i_mutex here. In the worst case, the file still has blocks + * allocated past the end of the file. + */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + error = -EIO; + goto out; + } + + error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); + if (error < 0) + goto out; + if (error < PAGE_SIZE) { + error = -EIO; + goto out; + } + + if (buffer_unwritten(bh) || buffer_new(bh)) + clear_page(addr); + + error = vm_insert_mixed(vma, vaddr, pfn); + + out: + i_mmap_unlock_read(mapping); + + if (bh->b_end_io) + bh->b_end_io(bh, 1); + + return error; +} + +static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head bh; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned blkbits = inode->i_blkbits; + sector_t block; + pgoff_t size; + int error; + int major = 0; + + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_size = PAGE_SIZE; + + repeat: + page = find_get_page(mapping, vmf->pgoff); + if (page) { + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return VM_FAULT_RETRY; + } + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + /* + * We have a struct page covering a hole in the file + * from a read fault and we've raced with a truncate + */ + error = -EIO; + goto unlock_page; + } + } + + error = get_block(inode, block, &bh, 0); + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; /* fs corruption? */ + if (error) + goto unlock_page; + + if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_page; + } else { + return dax_load_hole(mapping, page, vmf); + } + } + + if (vmf->cow_page) { + struct page *new_page = vmf->cow_page; + if (buffer_written(&bh)) + error = copy_user_bh(new_page, &bh, blkbits, vaddr); + else + clear_user_highpage(new_page, vaddr); + if (error) + goto unlock_page; + vmf->page = page; + if (!page) { + i_mmap_lock_read(mapping); + /* Check we didn't race with truncate */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + if (vmf->pgoff >= size) { + i_mmap_unlock_read(mapping); + error = -EIO; + goto out; + } + } + return VM_FAULT_LOCKED; + } + + /* Check we didn't race with a read fault installing a new page */ + if (!page && major) + page = find_lock_page(mapping, vmf->pgoff); + + if (page) { + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_CACHE_SIZE, 0); + delete_from_page_cache(page); + unlock_page(page); + page_cache_release(page); + } + + error = dax_insert_mapping(inode, &bh, vma, vmf); + + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if ((error < 0) && (error != -EBUSY)) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; + + unlock_page: + if (page) { + unlock_page(page); + page_cache_release(page); + } + goto out; +} + +/** + * dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. + */ +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (vmf->flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = do_dax_fault(vma, vmf, get_block); + if (vmf->flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_fault); + +/** + * dax_zero_page_range - zero a range within a page of a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @length: The number of bytes to zero + * @get_block: The filesystem method used to translate file offsets to blocks + * + * This function can be called by a filesystem when it is zeroing part of a + * page in a DAX file. This is intended for hole-punch operations. If + * you are truncating a file, the helper function dax_truncate_page() may be + * more convenient. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, + get_block_t get_block) +{ + struct buffer_head bh; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + int err; + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + BUG_ON((offset + length) > PAGE_CACHE_SIZE); + + memset(&bh, 0, sizeof(bh)); + bh.b_size = PAGE_CACHE_SIZE; + err = get_block(inode, index, &bh, 0); + if (err < 0) + return err; + if (buffer_written(&bh)) { + void *addr; + err = dax_get_addr(&bh, &addr, inode->i_blkbits); + if (err < 0) + return err; + memset(addr + offset, 0, length); + } + + return 0; +} +EXPORT_SYMBOL_GPL(dax_zero_page_range); + +/** + * dax_truncate_page - handle a partial page being truncated in a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @get_block: The filesystem method used to translate file offsets to blocks + * + * Similar to block_truncate_page(), this function can be called by a + * filesystem when it is truncating a DAX file to handle the partial page. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +{ + unsigned length = PAGE_CACHE_ALIGN(from) - from; + return dax_zero_page_range(inode, from, length, get_block); +} +EXPORT_SYMBOL_GPL(dax_truncate_page); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6fc91df99ff8..a198e94813fe 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = { .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ - .get_xip_mem = NULL, .migratepage = NULL, .launder_page = NULL, .is_partially_uptodate = NULL, diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 14a6780fd034..c634874e12d9 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -42,14 +42,3 @@ config EXT2_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. - -config EXT2_FS_XIP - bool "Ext2 execute in place support" - depends on EXT2_FS && MMU - help - Execute in place can be used on memory-backed block devices. If you - enable this option, you can select to mount block devices which are - capable of this feature without using the page cache. - - If you do not use a block device that is capable of using this, - or if unsure, say N. diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index f42af45cfd88..445b0e996a12 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile @@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o -ext2-$(CONFIG_EXT2_FS_XIP) += xip.o diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e4279ead4a05..678f9ab08c48 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -380,10 +380,15 @@ struct ext2_inode { #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ #define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ -#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ +#define EXT2_MOUNT_XIP 0x010000 /* Obsolete, use DAX */ #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ +#ifdef CONFIG_FS_DAX +#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */ +#else +#define EXT2_MOUNT_DAX 0 +#endif #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt @@ -788,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; -extern const struct file_operations ext2_xip_file_operations; +extern const struct file_operations ext2_dax_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; -extern const struct address_space_operations ext2_aops_xip; extern const struct address_space_operations ext2_nobh_aops; /* namei.c */ diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 7c87b22a7228..e31701713516 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -25,6 +25,36 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_FS_DAX +static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext2_get_block); +} + +static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext2_get_block); +} + +static const struct vm_operations_struct ext2_dax_vm_ops = { + .fault = ext2_dax_fault, + .page_mkwrite = ext2_dax_mkwrite, +}; + +static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_mmap(file, vma); + + file_accessed(file); + vma->vm_ops = &ext2_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + return 0; +} +#else +#define ext2_file_mmap generic_file_mmap +#endif + /* * Called when filp is released. This happens when all file descriptors * for a single struct file are closed. Note that different open() calls @@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, @@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = { .splice_write = iter_file_splice_write, }; -#ifdef CONFIG_EXT2_FS_XIP -const struct file_operations ext2_xip_file_operations = { +#ifdef CONFIG_FS_DAX +const struct file_operations ext2_dax_file_operations = { .llseek = generic_file_llseek, - .read = xip_file_read, - .write = xip_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = xip_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 36d35c36311d..6434bc000125 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -34,7 +34,6 @@ #include <linux/aio.h> #include "ext2.h" #include "acl.h" -#include "xip.h" #include "xattr.h" static int __ext2_write_inode(struct inode *inode, int do_sync); @@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode, goto cleanup; } - if (ext2_use_xip(inode->i_sb)) { + if (IS_DAX(inode)) { /* - * we need to clear the block + * block must be initialised before we put it in the tree + * so that it's not found by another thread before it's + * initialised */ - err = ext2_clear_xip_target (inode, - le32_to_cpu(chain[depth-1].key)); + err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), + 1 << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; @@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block, + NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + ext2_get_block); if (ret < 0 && (rw & WRITE)) ext2_write_failed(mapping, offset + count); return ret; @@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = { .error_remove_page = generic_error_remove_page, }; -const struct address_space_operations ext2_aops_xip = { - .bmap = ext2_bmap, - .get_xip_mem = ext2_get_xip_mem, -}; - const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, @@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (mapping_is_xip(inode->i_mapping)) - error = xip_truncate_page(inode->i_mapping, newsize); + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, ext2_get_block); else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); @@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode) { unsigned int flags = EXT2_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC | S_DAX); if (flags & EXT2_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & EXT2_APPEND_FL) @@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + inode->i_flags |= S_DAX; } /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ @@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index c268d0af1db9..148f6e3789ea 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -35,7 +35,6 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -#include "xip.h" static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) { @@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; @@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index ae55fddc26a9..d0e746e96511 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -35,7 +35,6 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -#include "xip.h" static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, int wait); @@ -292,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",grpquota"); #endif -#if defined(CONFIG_EXT2_FS_XIP) +#ifdef CONFIG_FS_DAX if (sbi->s_mount_opt & EXT2_MOUNT_XIP) seq_puts(seq, ",xip"); + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) + seq_puts(seq, ",dax"); #endif if (!test_opt(sb, RESERVATION)) @@ -403,7 +404,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, - Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, + Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation }; @@ -432,6 +433,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_xip, "xip"}, + {Opt_dax, "dax"}, {Opt_grpquota, "grpquota"}, {Opt_ignore, "noquota"}, {Opt_quota, "quota"}, @@ -559,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb) break; #endif case Opt_xip: -#ifdef CONFIG_EXT2_FS_XIP - set_opt (sbi->s_mount_opt, XIP); + ext2_msg(sb, KERN_INFO, "use dax instead of xip"); + set_opt(sbi->s_mount_opt, XIP); + /* Fall through */ + case Opt_dax: +#ifdef CONFIG_FS_DAX + set_opt(sbi->s_mount_opt, DAX); #else - ext2_msg(sb, KERN_INFO, "xip option not supported"); + ext2_msg(sb, KERN_INFO, "dax option not supported"); #endif break; @@ -877,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); - ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset - EXT2_MOUNT_XIP if not */ - if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || @@ -909,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { - if (!silent) + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for xip"); - goto failed_mount; + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext2_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } } /* If the blocksize doesn't match, re-read the thing.. */ @@ -1259,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) { struct ext2_sb_info * sbi = EXT2_SB(sb); struct ext2_super_block * es; - unsigned long old_mount_opt = sbi->s_mount_opt; struct ext2_mount_options old_opts; unsigned long old_sb_flags; int err; @@ -1284,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); - ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset - EXT2_MOUNT_XIP if not */ - - if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { - ext2_msg(sb, KERN_WARNING, - "warning: unsupported blocksize for xip"); - err = -EINVAL; - goto restore_opts; - } - es = sbi->s_es; - if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) { ext2_msg(sb, KERN_WARNING, "warning: refusing change of " - "xip flag with busy inodes while remounting"); - sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; - sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT2_MOUNT_DAX; } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { spin_unlock(&sbi->s_lock); diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c deleted file mode 100644 index bbc5fec6ff7f..000000000000 --- a/fs/ext2/xip.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * linux/fs/ext2/xip.c - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte (cotte@de.ibm.com) - */ - -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/buffer_head.h> -#include <linux/blkdev.h> -#include "ext2.h" -#include "xip.h" - -static inline long __inode_direct_access(struct inode *inode, sector_t block, - void **kaddr, unsigned long *pfn, long size) -{ - struct block_device *bdev = inode->i_sb->s_bdev; - sector_t sector = block * (PAGE_SIZE / 512); - return bdev_direct_access(bdev, sector, kaddr, pfn, size); -} - -static inline int -__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, - sector_t *result) -{ - struct buffer_head tmp; - int rc; - - memset(&tmp, 0, sizeof(struct buffer_head)); - tmp.b_size = 1 << inode->i_blkbits; - rc = ext2_get_block(inode, pgoff, &tmp, create); - *result = tmp.b_blocknr; - - /* did we get a sparse block (hole in the file)? */ - if (!tmp.b_blocknr && !rc) { - BUG_ON(create); - rc = -ENODATA; - } - - return rc; -} - -int -ext2_clear_xip_target(struct inode *inode, sector_t block) -{ - void *kaddr; - unsigned long pfn; - long size; - - size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE); - if (size < 0) - return size; - clear_page(kaddr); - return 0; -} - -void ext2_xip_verify_sb(struct super_block *sb) -{ - struct ext2_sb_info *sbi = EXT2_SB(sb); - - if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && - !sb->s_bdev->bd_disk->fops->direct_access) { - sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); - ext2_msg(sb, KERN_WARNING, - "warning: ignoring xip option - " - "not supported by bdev"); - } -} - -int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, - void **kmem, unsigned long *pfn) -{ - long rc; - sector_t block; - - /* first, retrieve the sector number */ - rc = __ext2_get_block(mapping->host, pgoff, create, &block); - if (rc) - return rc; - - /* retrieve address of the target data */ - rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE); - return (rc < 0) ? rc : 0; -} diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h deleted file mode 100644 index 18b34d2f31b3..000000000000 --- a/fs/ext2/xip.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * linux/fs/ext2/xip.h - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte (cotte@de.ibm.com) - */ - -#ifdef CONFIG_EXT2_FS_XIP -extern void ext2_xip_verify_sb (struct super_block *); -extern int ext2_clear_xip_target (struct inode *, sector_t); - -static inline int ext2_use_xip (struct super_block *sb) -{ - struct ext2_sb_info *sbi = EXT2_SB(sb); - return (sbi->s_mount_opt & EXT2_MOUNT_XIP); -} -int ext2_get_xip_mem(struct address_space *, pgoff_t, int, - void **, unsigned long *); -#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem) -#else -#define mapping_is_xip(map) 0 -#define ext2_xip_verify_sb(sb) do { } while (0) -#define ext2_use_xip(sb) 0 -#define ext2_clear_xip_target(inode, chain) 0 -#define ext2_get_xip_mem NULL -#endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7f393df2e4c..98ee89cef0ad 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -966,6 +966,11 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ @@ -2587,6 +2592,7 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; +extern const struct file_operations ext4_dax_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7cb592386121..33a09da16c9c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct mutex *aio_mutex = NULL; struct blk_plug plug; - int o_direct = file->f_flags & O_DIRECT; + int o_direct = io_is_direct(file); int overwrite = 0; size_t length = iov_iter_count(from); ssize_t ret; @@ -191,6 +191,26 @@ errout: return ret; } +#ifdef CONFIG_FS_DAX +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext4_get_block); + /* Is this the right get_block? */ +} + +static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext4_get_block); +} + +static const struct vm_operations_struct ext4_dax_vm_ops = { + .fault = ext4_dax_fault, + .page_mkwrite = ext4_dax_mkwrite, +}; +#else +#define ext4_dax_vm_ops ext4_file_vm_ops +#endif + static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, @@ -200,7 +220,12 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); - vma->vm_ops = &ext4_file_vm_ops; + if (IS_DAX(file_inode(file))) { + vma->vm_ops = &ext4_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + } else { + vma->vm_ops = &ext4_file_vm_ops; + } return 0; } @@ -599,6 +624,26 @@ const struct file_operations ext4_file_operations = { .fallocate = ext4_fallocate, }; +#ifdef CONFIG_FS_DAX +const struct file_operations ext4_dax_file_operations = { + .llseek = ext4_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + /* Splice not yet supported with DAX */ + .fallocate = ext4_fallocate, +}; +#endif + const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 36b369697a13..6b9878a24182 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -689,14 +689,22 @@ retry: inode_dio_done(inode); goto locked; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, offset, - ext4_get_block, NULL, NULL, 0); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, 0); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: - ret = blockdev_direct_IO(rw, iocb, inode, iter, - offset, ext4_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9193ea130dcb..85404f15e53a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -657,6 +657,18 @@ has_zeroout: return retval; } +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + bh->b_assoc_map = inode->i_mapping; + bh->b_private = (void *)(unsigned long)iblock; + bh->b_end_io = ext4_end_io_unwritten; + } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, - offset, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func, + ext4_end_io_dio, dio_flags); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + get_block_func, + ext4_end_io_dio, NULL, dio_flags); /* * Put our reference to io_end. This can free the io_end structure e.g. @@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -static int ext4_block_zero_page_range(handle_t *handle, +static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; + unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle, return -ENOMEM; blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -3278,6 +3281,33 @@ unlock: } /* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + struct inode *inode = mapping->host; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + + if (IS_DAX(inode)) + return dax_zero_page_range(inode, from, length, ext4_get_block); + return __ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. We need to physically zero the tail end @@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + new_fl |= S_DAX; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; @@ -4594,7 +4629,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ - truncate_pagecache(inode, inode->i_size); + truncate_pagecache(inode, inode->i_size); } /* * We want to call ext4_truncate() even if attr->ia_size == diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2291923dae4e..28fe71a2904c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2235,7 +2235,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) @@ -2299,7 +2302,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9fcd99748f18..3450ce4f3250 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1124,7 +1124,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, @@ -1188,6 +1188,7 @@ static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, + {Opt_dax, "dax"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_lazytime, "lazytime"}, @@ -1374,6 +1375,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -1616,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif +#ifndef CONFIG_FS_DAX + } else if (token == Opt_dax) { + ext4_msg(sb, KERN_INFO, "dax option not supported"); + return -1; +#endif } else { if (!args->from) arg = 1; @@ -3598,6 +3605,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + goto failed_mount; + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } @@ -3661,6 +3673,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext4_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } + } + if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { @@ -4877,6 +4902,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + err = -EINVAL; + goto restore_opts; + } + } + + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + ext4_msg(sb, KERN_WARNING, "warning: refusing change of " + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT4_MOUNT_DAX; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 125b749f5a18..174aa9e41bc3 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -28,6 +28,7 @@ #include <linux/pipe_fs_i.h> #include <linux/mpage.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> #include <cluster/masklog.h> @@ -47,6 +48,9 @@ #include "ocfs2_trace.h" #include "buffer_head_io.h" +#include "dir.h" +#include "namei.h" +#include "sysfile.h" static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -506,18 +510,21 @@ bail: * * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); - * - * Note that we never bother to allocate blocks here, and thus ignore the - * create argument. */ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; + u32 cpos = 0; + int alloc_locked = 0; u64 p_blkno, inode_blocks, contig_blocks; unsigned int ext_flags; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + unsigned long len = bh_result->b_size; + unsigned int clusters_to_alloc = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); /* This function won't even be called if the request isn't all * nicely aligned and of the right size, so there's no need @@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, /* We should already CoW the refcounted extent in case of create. */ BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); + /* allocate blocks if no p_blkno is found, and create == 1 */ + if (!p_blkno && create) { + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + alloc_locked = 1; + + /* fill hole, allocate blocks can't be larger than the size + * of the hole */ + clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); + if (clusters_to_alloc > contig_blocks) + clusters_to_alloc = contig_blocks; + + /* allocate extent and insert them into the extent tree */ + ret = ocfs2_extend_allocation(inode, cpos, + clusters_to_alloc, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); + if (ret < 0) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + } + /* * get_more_blocks() expects us to describe a hole by clearing * the mapped bit on bh_result(). @@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, contig_blocks = max_blocks; bh_result->b_size = contig_blocks << blocksize_bits; bail: + if (alloc_locked) + ocfs2_inode_unlock(inode, 1); return ret; } @@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static int ocfs2_is_overwrite(struct ocfs2_super *osb, + struct inode *inode, loff_t offset) +{ + int ret = 0; + u32 v_cpos = 0; + u32 p_cpos = 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + return 1; + + return 0; +} + +static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset) +{ + ssize_t ret = 0; + ssize_t written = 0; + bool orphaned = false; + int is_overwrite = 0; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + size_t count = iter->count; + journal_t *journal = osb->journal->j_journal; + u32 zero_len; + int cluster_align; + loff_t final_size = offset + count; + int append_write = offset >= i_size_read(inode) ? 1 : 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + { + u64 o = offset; + + zero_len = do_div(o, 1 << osb->s_clustersize_bits); + cluster_align = !zero_len; + } + + /* + * when final_size > inode->i_size, inode->i_size will be + * updated after direct write, so add the inode to orphan + * dir first. + */ + if (final_size > i_size_read(inode)) { + ret = ocfs2_add_inode_to_orphan(osb, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + orphaned = true; + } + + if (append_write) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, offset); + else + ret = ocfs2_extend_no_holes(inode, di_bh, offset, + offset); + if (ret < 0) { + mlog_errno(ret); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + + is_overwrite = ocfs2_is_overwrite(osb, inode, offset); + if (is_overwrite < 0) { + mlog_errno(is_overwrite); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; + } + + written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, + iter, offset, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); + if (unlikely(written < 0)) { + loff_t i_size = i_size_read(inode); + + if (offset + count > i_size) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (i_size == i_size_read(inode)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + ret = jbd2_journal_force_commit(journal); + if (ret < 0) + mlog_errno(ret); + } + } else if (written < 0 && append_write && !is_overwrite && + !cluster_align) { + u32 p_cpos = 0; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); + + ret = blkdev_issue_zeroout(osb->sb->s_bdev, + p_cpos << (osb->s_clustersize_bits - 9), + zero_len >> 9, GFP_KERNEL, false); + if (ret < 0) + mlog_errno(ret); + } + +clean_orphan: + if (orphaned) { + int tmp_ret; + int update_isize = written > 0 ? 1 : 0; + loff_t end = update_isize ? offset + written : 0; + + tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, + update_isize, end); + if (tmp_ret < 0) { + ret = tmp_ret; + goto out; + } + + tmp_ret = jbd2_journal_force_commit(journal); + if (tmp_ret < 0) { + ret = tmp_ret; + mlog_errno(tmp_ret); + } + } + +out: + if (ret >= 0) + ret = written; + return ret; +} + static ssize_t ocfs2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, @@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw, { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * Fallback to buffered I/O if we see an inode without @@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; - /* Fallback to buffered I/O if we are appending. */ - if (i_size_read(inode) <= offset) + /* Fallback to buffered I/O if we are appending and + * concurrent O_DIRECT writes are allowed. + */ + if (i_size_read(inode) <= offset && !full_coherency) return 0; - return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + if (rw == READ) + return __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); + else + return ocfs2_direct_IO_write(iocb, iter, offset); } static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e0f04d55fd05..46e0d4e857c7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -295,7 +295,7 @@ out: return ret; } -static int ocfs2_set_inode_size(handle_t *handle, +int ocfs2_set_inode_size(handle_t *handle, struct inode *inode, struct buffer_head *fe_bh, u64 new_i_size) @@ -441,7 +441,7 @@ out: return status; } -static int ocfs2_truncate_file(struct inode *inode, +int ocfs2_truncate_file(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size) { @@ -709,6 +709,13 @@ leave: return status; } +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) +{ + return __ocfs2_extend_allocation(inode, logical_start, + clusters_to_add, mark_unwritten); +} + /* * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. @@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file, struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; loff_t saved_pos = 0, end; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * We start with a read level meta lock and only jump to an ex @@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * one node could wind up truncating another * nodes writes. */ - if (end > i_size_read(inode)) { + if (end > i_size_read(inode) && !full_coherency) { + *direct_io = 0; + break; + } + + /* + * Fallback to old way if the feature bit is not set. + */ + if (end > i_size_read(inode) && + !ocfs2_supports_append_dio(osb)) { *direct_io = 0; break; } @@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file, */ ret = ocfs2_check_range_for_holes(inode, saved_pos, count); if (ret == 1) { - *direct_io = 0; + /* + * Fallback to old way if the feature bit is not set. + * Otherwise try dio first and then complete the rest + * request through buffer io. + */ + if (!ocfs2_supports_append_dio(osb)) + *direct_io = 0; ret = 0; } else if (ret < 0) mlog_errno(ret); @@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); @@ -2357,11 +2383,51 @@ relock: iov_iter_truncate(from, count); if (direct_io) { + loff_t endbyte; + ssize_t written_buffered; written = generic_file_direct_write(iocb, from, *ppos); - if (written < 0) { + if (written < 0 || written == count) { ret = written; goto out_dio; } + + /* + * for completing the rest of the request. + */ + *ppos += written; + count -= written; + written_buffered = generic_perform_write(file, from, *ppos); + /* + * If generic_file_buffered_write() returned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + ret = written_buffered; + goto out_dio; + } + + iocb->ki_pos = *ppos + written_buffered; + /* We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = *ppos + written_buffered - 1; + ret = filemap_write_and_wait_range(file->f_mapping, *ppos, + endbyte); + if (ret == 0) { + written += written_buffered; + invalidate_mapping_pages(mapping, + *ppos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } } else { current->backing_dev_info = inode_to_bdi(inode); written = generic_perform_write(file, from, *ppos); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 97bf761c9e7c..e8c62f22215c 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size); int ocfs2_simple_size_update(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size); +int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c8b25de9efbb..3025c0da6b8a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode, if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto bail_commit; diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ca3431ee7f24..5e86b247c821 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -81,6 +81,8 @@ struct ocfs2_inode_info tid_t i_sync_tid; tid_t i_datasync_tid; + wait_queue_head_t append_dio_wq; + struct dquot *i_dquot[MAXQUOTAS]; }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d10860fde165..ff531928269e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -50,6 +50,8 @@ #include "sysfile.h" #include "uptodate.h" #include "quota.h" +#include "file.h" +#include "namei.h" #include "buffer_head_io.h" #include "ocfs2_trace.h" @@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, static int ocfs2_trylock_journal(struct ocfs2_super *osb, int slot_num); static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot); + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type); static int ocfs2_commit_thread(void *arg); static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec); + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type); static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) { @@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) return 0; } -void ocfs2_queue_replay_slots(struct ocfs2_super *osb) +void ocfs2_queue_replay_slots(struct ocfs2_super *osb, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_replay_map *replay_map = osb->replay_map; int i; @@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb) for (i = 0; i < replay_map->rm_slots; i++) if (replay_map->rm_replay_slots[i]) ocfs2_queue_recovery_completion(osb->journal, i, NULL, - NULL, NULL); + NULL, NULL, + orphan_reco_type); replay_map->rm_state = REPLAY_DONE; } @@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item { struct ocfs2_dinode *lri_la_dinode; struct ocfs2_dinode *lri_tl_dinode; struct ocfs2_quota_recovery *lri_qrec; + enum ocfs2_orphan_reco_type lri_orphan_reco_type; }; /* Does the second half of the recovery process. By this point, the @@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work) struct ocfs2_dinode *la_dinode, *tl_dinode; struct ocfs2_la_recovery_item *item, *n; struct ocfs2_quota_recovery *qrec; + enum ocfs2_orphan_reco_type orphan_reco_type; LIST_HEAD(tmp_la_list); trace_ocfs2_complete_recovery( @@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work) la_dinode = item->lri_la_dinode; tl_dinode = item->lri_tl_dinode; qrec = item->lri_qrec; + orphan_reco_type = item->lri_orphan_reco_type; trace_ocfs2_complete_recovery_slot(item->lri_slot, la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, @@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work) kfree(tl_dinode); } - ret = ocfs2_recover_orphans(osb, item->lri_slot); + ret = ocfs2_recover_orphans(osb, item->lri_slot, + orphan_reco_type); if (ret < 0) mlog_errno(ret); @@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec) + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_la_recovery_item *item; @@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, item->lri_slot = slot_num; item->lri_tl_dinode = tl_dinode; item->lri_qrec = qrec; + item->lri_orphan_reco_type = orphan_reco_type; spin_lock(&journal->j_lock); list_add_tail(&item->lri_list, &journal->j_la_cleanups); @@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* No need to queue up our truncate_log as regular cleanup will catch * that */ ocfs2_queue_recovery_completion(journal, osb->slot_num, - osb->local_alloc_copy, NULL, NULL); + osb->local_alloc_copy, NULL, NULL, + ORPHAN_NEED_TRUNCATE); ocfs2_schedule_truncate_log_flush(osb, 0); osb->local_alloc_copy = NULL; @@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* queue to recover orphan slots for all offline slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); ocfs2_free_replay_slots(osb); } @@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) osb->slot_num, NULL, NULL, - osb->quota_rec); + osb->quota_rec, + ORPHAN_NEED_TRUNCATE); osb->quota_rec = NULL; } } @@ -1360,7 +1374,7 @@ restart: /* queue recovery for our own slot */ ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, - NULL, NULL); + NULL, NULL, ORPHAN_NO_NEED_TRUNCATE); spin_lock(&osb->osb_lock); while (rm->rm_used) { @@ -1419,13 +1433,14 @@ skip_recovery: continue; } ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], - NULL, NULL, qrec); + NULL, NULL, qrec, + ORPHAN_NEED_TRUNCATE); } ocfs2_super_unlock(osb, 1); /* queue recovery for offline slots */ - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); bail: mutex_lock(&osb->recovery_lock); @@ -1711,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* This will kfree the memory pointed to by la_copy and tl_copy */ ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, - tl_copy, NULL); + tl_copy, NULL, ORPHAN_NEED_TRUNCATE); status = 0; done: @@ -1901,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) for (i = 0; i < osb->max_slots; i++) ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, - NULL); + NULL, ORPHAN_NO_NEED_TRUNCATE); /* * We queued a recovery on orphan slots, increment the sequence * number and update LVB so other node will skip the scan for a while @@ -2000,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, if (IS_ERR(iter)) return 0; + /* Skip inodes which are already added to recover list, since dio may + * happen concurrently with unlink/rename */ + if (OCFS2_I(iter)->ip_next_orphan) { + iput(iter); + return 0; + } + trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); /* No locking is required for the next_orphan queue as there * is only ever a single process doing orphan recovery. */ @@ -2108,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, * advertising our state to ocfs2_delete_inode(). */ static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot) + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type) { int ret = 0; struct inode *inode = NULL; @@ -2132,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, (unsigned long long)oi->ip_blkno); iter = oi->ip_next_orphan; + oi->ip_next_orphan = NULL; + + /* + * We need to take and drop the inode lock to + * force read inode from disk. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret) { + mlog_errno(ret); + goto next; + } + ocfs2_inode_unlock(inode, 0); + + if (inode->i_nlink == 0) { + spin_lock(&oi->ip_lock); + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { + struct buffer_head *di_bh = NULL; + + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto next; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + ocfs2_rw_unlock(inode, 1); + mlog_errno(ret); + goto next; + } + + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); + brelse(di_bh); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto next; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); + if (ret) + mlog_errno(ret); - spin_lock(&oi->ip_lock); - /* Set the proper information to get us going into - * ocfs2_delete_inode. */ - oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; - spin_unlock(&oi->ip_lock); + wake_up(&OCFS2_I(inode)->append_dio_wq); + } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ +next: iput(inode); inode = iter; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 7f8cde94abfe..f4cd3c3e9fb7 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb) * orphan dir index leaf */ #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) +/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry + + * orphan dir index root + orphan dir index leaf */ +#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4) +#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS + /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming * directory + target unlink + 3 x dir index leaves */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 914c121ec890..b5c3a5ea3ee6 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup); + struct ocfs2_dir_lookup_result *lookup, + bool dio); static int ocfs2_orphan_add(struct ocfs2_super *osb, handle_t *handle, @@ -87,7 +88,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode); + struct inode *orphan_dir_inode, + bool dio); static int ocfs2_create_symlink_data(struct ocfs2_super *osb, handle_t *handle, @@ -104,6 +106,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -952,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir, if (ocfs2_inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto leave; @@ -1004,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir, if (is_unlinkable) { status = ocfs2_orphan_add(osb, handle, inode, fe_bh, - orphan_name, &orphan_insert, orphan_dir); + orphan_name, &orphan_insert, orphan_dir, false); if (status < 0) mlog_errno(status); } @@ -1440,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(new_inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto bail; @@ -1507,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir, if (should_add_orphan) { status = ocfs2_orphan_add(osb, handle, new_inode, newfe_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto bail; @@ -2088,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, struct buffer_head *orphan_dir_bh, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { int ret; struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; + + if (dio) { + ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + ret = -EINVAL; + mlog_errno(ret); + return ret; + } - ret = ocfs2_blkno_stringify(blkno, name); + ret = ocfs2_blkno_stringify(blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + ret = ocfs2_blkno_stringify(blkno, name); if (ret < 0) { mlog_errno(ret); return ret; @@ -2101,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); + namelen, lookup); if (ret < 0) { mlog_errno(ret); return ret; @@ -2128,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; @@ -2142,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, } ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, - blkno, name, lookup); + blkno, name, lookup, dio); if (ret < 0) { mlog_errno(ret); goto out; @@ -2170,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode) + struct inode *orphan_dir_inode, + bool dio) { struct buffer_head *orphan_dir_bh = NULL; int status = 0; struct ocfs2_dinode *orphan_fe; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; trace_ocfs2_orphan_add_begin( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -2219,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, - OCFS2_ORPHAN_NAMELEN, inode, + namelen, inode, OCFS2_I(inode)->ip_blkno, orphan_dir_bh, lookup); if (status < 0) { @@ -2227,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto rollback; } - fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); - OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; + if (dio) { + /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan + * slot. + */ + fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num); + } else { + fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; - /* Record which orphan dir our inode now resides - * in. delete_inode will use this to determine which orphan - * dir to lock. */ - fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. */ + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + } ocfs2_journal_dirty(handle, fe_bh); @@ -2258,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh) + struct buffer_head *orphan_dir_bh, + bool dio) { - char name[OCFS2_ORPHAN_NAMELEN + 1]; + const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN; + char name[namelen + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (dio) { + status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + status = -EINVAL; + mlog_errno(status); + return status; + } + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); if (status < 0) { mlog_errno(status); goto leave; @@ -2273,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, trace_ocfs2_orphan_del( (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, - name, OCFS2_ORPHAN_NAMELEN); + name, namelen); /* find it's spot in the orphan directory */ - status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, + status = ocfs2_find_entry(name, namelen, orphan_dir_inode, &lookup); if (status) { mlog_errno(status); @@ -2376,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, - di_blkno, orphan_name, orphan_insert); + di_blkno, orphan_name, orphan_insert, + false); if (ret < 0) { mlog_errno(ret); goto out; @@ -2482,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, di = (struct ocfs2_dinode *)new_di_bh->b_data; status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto leave; @@ -2527,6 +2577,186 @@ leave: return status; } +static int ocfs2_dio_orphan_recovered(struct inode *inode) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + return 0; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + return ret; +} + +#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode) +{ + char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; + struct inode *orphan_dir_inode = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct buffer_head *di_bh = NULL; + int status = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = NULL; + +restart: + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + /* + * Another append dio crashed? + * If so, wait for recovery first. + */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, + ocfs2_dio_orphan_recovered(inode), + msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); + goto restart; + } + + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, + OCFS2_I(inode)->ip_blkno, + orphan_name, + &orphan_insert, + true); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_ADD_TO_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name, + &orphan_insert, orphan_dir_inode, true); + if (status) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + ocfs2_free_dir_lookup_result(&orphan_insert); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + int status = 0; + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *) di_bh->b_data; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + le16_to_cpu(di->i_dio_orphaned_slot)); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail_unlock_inode; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))); + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, + inode, orphan_dir_bh, true); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + di->i_dio_orphaned_slot = 0; + + if (update_isize) { + status = ocfs2_set_inode_size(handle, inode, di_bh, end); + if (status) + mlog_errno(status); + } else + ocfs2_journal_dirty(handle, di_bh); + +bail_commit: + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + brelse(orphan_dir_bh); + iput(orphan_dir_inode); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *inode, struct dentry *dentry) @@ -2615,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, } status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto out_commit; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e5d059d4f115..5ddecce172fa 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh); + struct buffer_head *orphan_dir_bh, + bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, struct inode **new_inode); +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode); +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fdbcbfed529e..8490c64d34fe 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -209,6 +209,11 @@ struct ocfs2_lock_res { #endif }; +enum ocfs2_orphan_reco_type { + ORPHAN_NO_NEED_TRUNCATE = 0, + ORPHAN_NEED_TRUNCATE, +}; + enum ocfs2_orphan_scan_state { ORPHAN_SCAN_ACTIVE, ORPHAN_SCAN_INACTIVE @@ -495,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) +{ + if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) + return 1; + return 0; +} + + static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) @@ -726,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, return clusters; } +static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = (unsigned int)(bytes >> cl_bits); + return clusters; +} + static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, u64 bytes) { diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 938387a10d5d..20e37a3ed26f 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -105,7 +105,8 @@ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ - | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \ + | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) /* * Heartbeat-only devices are missing journals and other files. The @@ -199,6 +200,11 @@ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 +/* + * Append Direct IO support + */ +#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008 + /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ @@ -229,6 +235,8 @@ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ #define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ +#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially + * for dio */ /* * Flags on ocfs2_dinode.i_dyn_features @@ -729,7 +737,9 @@ struct ocfs2_dinode { inode belongs to. Only valid if allocated from a discontiguous block group */ -/*A0*/ __le64 i_reserved2[3]; +/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */ + __le16 i_reserved1[3]; + __le64 i_reserved2[2]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 87a1f7679d9b..26675185b886 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1746,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); + init_waitqueue_head(&oi->append_dio_wq); + ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), &ocfs2_inode_caching_ops); diff --git a/fs/open.c b/fs/open.c index d36c42ff019d..33f9cbf2610b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || - ((!f->f_mapping->a_ops->direct_IO) && - (!f->f_mapping->a_ops->get_xip_mem))) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; - } } return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index aeb9dc3082d5..f5e171cc71c1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -51,6 +51,7 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; +struct vm_fault; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -361,8 +362,6 @@ struct address_space_operations { int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); - int (*get_xip_mem)(struct address_space *, pgoff_t, int, - void **, unsigned long *); /* * migrate the contents of a page to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. @@ -1676,6 +1675,11 @@ struct super_operations { #define S_IMA 1024 /* Inode has an associated IMA struct */ #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ #define S_NOSEC 4096 /* no suid or xattr security attributes */ +#ifdef CONFIG_FS_DAX +#define S_DAX 8192 /* Direct Access, avoiding the page cache */ +#else +#define S_DAX 0 /* Make all the DAX code disappear */ +#endif /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -1713,6 +1717,7 @@ struct super_operations { #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) +#define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) @@ -2573,19 +2578,13 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset, extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); -#ifdef CONFIG_FS_XIP -extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, - loff_t *ppos); -extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); -extern ssize_t xip_file_write(struct file *filp, const char __user *buf, - size_t len, loff_t *ppos); -extern int xip_truncate_page(struct address_space *mapping, loff_t from); -#else -static inline int xip_truncate_page(struct address_space *mapping, loff_t from) -{ - return 0; -} -#endif +ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, + loff_t, get_block_t, dio_iodone_t, int flags); +int dax_clear_blocks(struct inode *, sector_t block, long size); +int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); +int dax_truncate_page(struct inode *, loff_t from, get_block_t); +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, @@ -2742,6 +2741,11 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root); extern void save_mount_options(struct super_block *sb, char *options); extern void replace_mount_options(struct super_block *sb, char *options); +static inline bool io_is_direct(struct file *filp) +{ + return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp)); +} + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; diff --git a/include/linux/mm.h b/include/linux/mm.h index a2a348866719..d782617c11de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -224,6 +224,7 @@ struct vm_fault { pgoff_t pgoff; /* Logical page offset based on vma */ void __user *virtual_address; /* Faulting virtual address */ + struct page *cow_page; /* Handler may choose to COW */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dbcd5ec3f291..9c5ff69fa0cd 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -200,7 +200,7 @@ int page_referenced(struct page *, int is_locked, int try_to_unmap(struct page *, enum ttu_flags flags); /* - * Called from mm/filemap_xip.c to unmap empty zero page + * Used by uprobes to replace a userspace page safely */ pte_t *__page_check_address(struct page *, struct mm_struct *, unsigned long, spinlock_t **, int); diff --git a/mm/Makefile b/mm/Makefile index 3548460ab7b6..ac7987744b47 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -51,7 +51,6 @@ obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o -obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6dc4580df2af..a00a2e819a78 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -219,11 +219,46 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strictlimit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + break; + case 1: + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + break; + default: + return -EINVAL; + } + + return count; +} +static ssize_t strictlimit_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strictlimit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strictlimit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); diff --git a/mm/fadvise.c b/mm/fadvise.c index fac23ecf8d72..4a3907cf79f8 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -28,6 +28,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) { struct fd f = fdget(fd); + struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ @@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) if (!f.file) return -EBADF; - if (S_ISFIFO(file_inode(f.file)->i_mode)) { + inode = file_inode(f.file); + if (S_ISFIFO(inode->i_mode)) { ret = -ESPIPE; goto out; } @@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) goto out; } - if (mapping->a_ops->get_xip_mem) { + if (IS_DAX(inode)) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: diff --git a/mm/filemap.c b/mm/filemap.c index d9f5336552d7..ad7242043bdb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t *ppos = &iocb->ki_pos; loff_t pos = *ppos; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (file->f_flags & O_DIRECT) { + if (io_is_direct(file)) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); @@ -1723,9 +1722,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for - * the rest of the read. + * the rest of the read. Buffered reads will not work for + * DAX files, so don't bother trying. */ - if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || + IS_DAX(inode)) { file_accessed(file); goto out; } @@ -2582,18 +2583,20 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) goto out; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (unlikely(file->f_flags & O_DIRECT)) { + if (io_is_direct(file)) { loff_t endbyte; written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) - goto out; - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. + * If the write stopped short of completing, fall back to + * buffered writes. Some filesystems do this for writes to + * holes, for example. For DAX files, a buffered write will + * not succeed (even if it did, DAX does not handle dirty + * page-cache pages correctly). */ + if (written < 0 || written == count || IS_DAX(inode)) + goto out; + pos += written; count -= written; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c deleted file mode 100644 index c175f9f25210..000000000000 --- a/mm/filemap_xip.c +++ /dev/null @@ -1,478 +0,0 @@ -/* - * linux/mm/filemap_xip.c - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte <cotte@de.ibm.com> - * - * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds - * - */ - -#include <linux/fs.h> -#include <linux/backing-dev.h> -#include <linux/pagemap.h> -#include <linux/export.h> -#include <linux/uio.h> -#include <linux/rmap.h> -#include <linux/mmu_notifier.h> -#include <linux/sched.h> -#include <linux/seqlock.h> -#include <linux/mutex.h> -#include <linux/gfp.h> -#include <asm/tlbflush.h> -#include <asm/io.h> - -/* - * We do use our own empty page to avoid interference with other users - * of ZERO_PAGE(), such as /dev/zero - */ -static DEFINE_MUTEX(xip_sparse_mutex); -static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); -static struct page *__xip_sparse_page; - -/* called under xip_sparse_mutex */ -static struct page *xip_sparse_page(void) -{ - if (!__xip_sparse_page) { - struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); - - if (page) - __xip_sparse_page = page; - } - return __xip_sparse_page; -} - -/* - * This is a file read routine for execute in place files, and uses - * the mapping->a_ops->get_xip_mem() function for the actual low-level - * stuff. - * - * Note the struct file* is not used at all. It may be NULL. - */ -static ssize_t -do_xip_mapping_read(struct address_space *mapping, - struct file_ra_state *_ra, - struct file *filp, - char __user *buf, - size_t len, - loff_t *ppos) -{ - struct inode *inode = mapping->host; - pgoff_t index, end_index; - unsigned long offset; - loff_t isize, pos; - size_t copied = 0, error = 0; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - pos = *ppos; - index = pos >> PAGE_CACHE_SHIFT; - offset = pos & ~PAGE_CACHE_MASK; - - isize = i_size_read(inode); - if (!isize) - goto out; - - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - do { - unsigned long nr, left; - void *xip_mem; - unsigned long xip_pfn; - int zero = 0; - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index >= end_index) { - if (index > end_index) - goto out; - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - goto out; - } - } - nr = nr - offset; - if (nr > len - copied) - nr = len - copied; - - error = mapping->a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (unlikely(error)) { - if (error == -ENODATA) { - /* sparse */ - zero = 1; - } else - goto out; - } - - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - /* address based flush */ ; - - /* - * Ok, we have the mem, so now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). - */ - if (!zero) - left = __copy_to_user(buf+copied, xip_mem+offset, nr); - else - left = __clear_user(buf + copied, nr); - - if (left) { - error = -EFAULT; - goto out; - } - - copied += (nr - left); - offset += (nr - left); - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; - } while (copied < len); - -out: - *ppos = pos + copied; - if (filp) - file_accessed(filp); - - return (copied ? copied : error); -} - -ssize_t -xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) -{ - if (!access_ok(VERIFY_WRITE, buf, len)) - return -EFAULT; - - return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, - buf, len, ppos); -} -EXPORT_SYMBOL_GPL(xip_file_read); - -/* - * __xip_unmap is invoked from xip_unmap and xip_write - * - * This function walks all vmas of the address_space and unmaps the - * __xip_sparse_page when found at pgoff. - */ -static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) -{ - struct vm_area_struct *vma; - struct page *page; - unsigned count; - int locked = 0; - - count = read_seqcount_begin(&xip_sparse_seq); - - page = __xip_sparse_page; - if (!page) - return; - -retry: - i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - pte_t *pte, pteval; - spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long address = vma->vm_start + - ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - - BUG_ON(address < vma->vm_start || address >= vma->vm_end); - pte = page_check_address(page, mm, address, &ptl, 1); - if (pte) { - /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); - page_remove_rmap(page); - dec_mm_counter(mm, MM_FILEPAGES); - BUG_ON(pte_dirty(pteval)); - pte_unmap_unlock(pte, ptl); - /* must invalidate_page _before_ freeing the page */ - mmu_notifier_invalidate_page(mm, address); - page_cache_release(page); - } - } - i_mmap_unlock_read(mapping); - - if (locked) { - mutex_unlock(&xip_sparse_mutex); - } else if (read_seqcount_retry(&xip_sparse_seq, count)) { - mutex_lock(&xip_sparse_mutex); - locked = 1; - goto retry; - } -} - -/* - * xip_fault() is invoked via the vma operations vector for a - * mapped memory region to read in file data during a page fault. - * - * This function is derived from filemap_fault, but used for execute in place - */ -static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - pgoff_t size; - void *xip_mem; - unsigned long xip_pfn; - struct page *page; - int error; - - /* XXX: are VM_FAULT_ codes OK? */ -again: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) - return VM_FAULT_SIGBUS; - - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, - &xip_mem, &xip_pfn); - if (likely(!error)) - goto found; - if (error != -ENODATA) - return VM_FAULT_OOM; - - /* sparse block */ - if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && - (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && - (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { - int err; - - /* maybe shared writable, allocate new block */ - mutex_lock(&xip_sparse_mutex); - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, - &xip_mem, &xip_pfn); - mutex_unlock(&xip_sparse_mutex); - if (error) - return VM_FAULT_SIGBUS; - /* unmap sparse mappings at pgoff from all other vmas */ - __xip_unmap(mapping, vmf->pgoff); - -found: - err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - xip_pfn); - if (err == -ENOMEM) - return VM_FAULT_OOM; - /* - * err == -EBUSY is fine, we've raced against another thread - * that faulted-in the same page - */ - if (err != -EBUSY) - BUG_ON(err); - return VM_FAULT_NOPAGE; - } else { - int err, ret = VM_FAULT_OOM; - - mutex_lock(&xip_sparse_mutex); - write_seqcount_begin(&xip_sparse_seq); - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, - &xip_mem, &xip_pfn); - if (unlikely(!error)) { - write_seqcount_end(&xip_sparse_seq); - mutex_unlock(&xip_sparse_mutex); - goto again; - } - if (error != -ENODATA) - goto out; - /* not shared and writable, use xip_sparse_page() */ - page = xip_sparse_page(); - if (!page) - goto out; - err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, - page); - if (err == -ENOMEM) - goto out; - - ret = VM_FAULT_NOPAGE; -out: - write_seqcount_end(&xip_sparse_seq); - mutex_unlock(&xip_sparse_mutex); - - return ret; - } -} - -static const struct vm_operations_struct xip_file_vm_ops = { - .fault = xip_file_fault, - .page_mkwrite = filemap_page_mkwrite, -}; - -int xip_file_mmap(struct file * file, struct vm_area_struct * vma) -{ - BUG_ON(!file->f_mapping->a_ops->get_xip_mem); - - file_accessed(file); - vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; - return 0; -} -EXPORT_SYMBOL_GPL(xip_file_mmap); - -static ssize_t -__xip_file_write(struct file *filp, const char __user *buf, - size_t count, loff_t pos, loff_t *ppos) -{ - struct address_space * mapping = filp->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - size_t bytes; - ssize_t written = 0; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - do { - unsigned long index; - unsigned long offset; - size_t copied; - void *xip_mem; - unsigned long xip_pfn; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (status == -ENODATA) { - /* we allocate a new page unmap it */ - mutex_lock(&xip_sparse_mutex); - status = a_ops->get_xip_mem(mapping, index, 1, - &xip_mem, &xip_pfn); - mutex_unlock(&xip_sparse_mutex); - if (!status) - /* unmap page at pgoff from all other vmas */ - __xip_unmap(mapping, index); - } - - if (status) - break; - - copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes); - - if (likely(copied > 0)) { - status = copied; - - if (status >= 0) { - written += status; - count -= status; - pos += status; - buf += status; - } - } - if (unlikely(copied != bytes)) - if (status >= 0) - status = -EFAULT; - if (status < 0) - break; - } while (count); - *ppos = pos; - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - */ - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - - return written ? written : status; -} - -ssize_t -xip_file_write(struct file *filp, const char __user *buf, size_t len, - loff_t *ppos) -{ - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; - size_t count; - loff_t pos; - ssize_t ret; - - mutex_lock(&inode->i_mutex); - - if (!access_ok(VERIFY_READ, buf, len)) { - ret=-EFAULT; - goto out_up; - } - - pos = *ppos; - count = len; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - - ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); - if (ret) - goto out_backing; - if (count == 0) - goto out_backing; - - ret = file_remove_suid(filp); - if (ret) - goto out_backing; - - ret = file_update_time(filp); - if (ret) - goto out_backing; - - ret = __xip_file_write (filp, buf, count, pos, ppos); - - out_backing: - current->backing_dev_info = NULL; - out_up: - mutex_unlock(&inode->i_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(xip_file_write); - -/* - * truncate a page used for execute in place - * functionality is analog to block_truncate_page but does use get_xip_mem - * to get the page instead of page cache - */ -int -xip_truncate_page(struct address_space *mapping, loff_t from) -{ - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize; - unsigned length; - void *xip_mem; - unsigned long xip_pfn; - int err; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - blocksize = 1 << mapping->host->i_blkbits; - length = offset & (blocksize - 1); - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - - length = blocksize - length; - - err = mapping->a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (unlikely(err)) { - if (err == -ENODATA) - /* Hole? No need to truncate */ - return 0; - else - return err; - } - memset(xip_mem + offset, 0, length); - return 0; -} -EXPORT_SYMBOL_GPL(xip_truncate_page); diff --git a/mm/madvise.c b/mm/madvise.c index f599e6bb96b4..6d0fcb8921c2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -248,7 +248,7 @@ static long madvise_willneed(struct vm_area_struct *vma, return -EBADF; #endif - if (file->f_mapping->a_ops->get_xip_mem) { + if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } diff --git a/mm/memory.c b/mm/memory.c index bb66d55e4b09..8ae52c918415 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1965,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; vmf.page = page; + vmf.cow_page = NULL; ret = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) @@ -2329,6 +2330,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; + /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); @@ -2638,7 +2640,8 @@ oom: * See filemap_fault() and __lock_page_retry(). */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, - pgoff_t pgoff, unsigned int flags, struct page **page) + pgoff_t pgoff, unsigned int flags, + struct page *cow_page, struct page **page) { struct vm_fault vmf; int ret; @@ -2647,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; + vmf.cow_page = cow_page; ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; + if (!vmf.page) + goto out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) @@ -2664,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + out: *page = vmf.page; return ret; } @@ -2834,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -2874,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; - copy_user_highpage(new_page, fault_page, address, vma); + if (fault_page) + copy_user_highpage(new_page, fault_page, address, vma); __SetPageUptodate(new_page); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - page_cache_release(fault_page); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - page_cache_release(fault_page); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } return ret; uncharge_out: mem_cgroup_cancel_charge(new_page, memcg); @@ -2912,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; diff --git a/scripts/diffconfig b/scripts/diffconfig index 6d672836e187..0db267d0adc9 100755 --- a/scripts/diffconfig +++ b/scripts/diffconfig @@ -28,7 +28,6 @@ If no config files are specified, .config and .config.old are used. Example usage: $ diffconfig .config config-with-some-changes -EXT2_FS_XATTR n --EXT2_FS_XIP n CRAMFS n -> y EXT2_FS y -> n LOG_BUF_SHIFT 14 -> 16 |